I finished creating a VHDL design of the SHA-256 algorithm. Now I'm trying to take my design skills to a higher level by understanding how to change the code to get better power, performance, and area results. The end goal is to get the best possible netlist out of my designs so that I can put them on a chip.
For my design, I got a maximum frequency of 85 MHz on a Cyclone IV FPGA, using 8,500 total logic elements (55% of the FPGA).
The main issue that I think made my design so big is that I wrote the code in a hierarchical manner, with a lot of "elsif" branches and variables. Another thing that could be better, I think, is if Quartus implemented my memory as a memory block rather than with logic elements, even though it is only an array of 16 words of 32 bits. What do you think I can improve?
library ieee;
use ieee.std_logic_1164.all;
use ieee.std_logic_unsigned.all;
USE ieee.numeric_std.ALL;
-- Top-level entity: takes a stream of 32-bit words and exposes the digest and
-- ready flag produced by the last of the three sha256 cores instantiated in
-- the architecture.
entity padding is
    port (
        clk   : in  std_logic;                        -- clock; the architecture's processes act on its rising edge
        rst   : in  std_logic;                        -- asynchronous reset, active low (processes test rst = '0')
        ward  : in  std_logic_vector(31 downto 0);    -- 32-bit input word sampled by the sequencer process
        ready : out std_logic;                        -- wired to the ready output of instance sha3
        hash  : out std_logic_vector(255 downto 0)    -- wired to the digest output of instance sha3
    );
end padding;
-- Architecture: chains three sha256 cores and generates their word (ward) and
-- round-constant (k) inputs from two 16-word buffers.
architecture padding of padding is
-- Declaration of the sha256 core used three times below. The core itself is
-- not defined in this file.
-- NOTE(review): the port descriptions here are inferred from the port names and
-- from how this architecture drives them - confirm against the sha256 entity.
component sha256
port ( clk : in std_logic;
rst : in std_logic;
-- enable: driven from the sequencer (sha1), from the previous core's ready
-- output (sha2), or from a one-clock-delayed ready (sha3).
enable : in std_logic;
-- ward: one 32-bit schedule word per clock.
ward : in std_logic_vector(31 downto 0);
-- k: one 32-bit entry of the k table per clock.
k : in std_logic_vector(31 downto 0);
-- h0..h7: eight 32-bit starting-state words (constants for sha1/sha3, the
-- previous digest split into words for sha2).
h0 : in std_logic_vector(31 downto 0);
h1 : in std_logic_vector(31 downto 0);
h2 : in std_logic_vector(31 downto 0);
h3 : in std_logic_vector(31 downto 0);
h4 : in std_logic_vector(31 downto 0);
h5 : in std_logic_vector(31 downto 0);
h6 : in std_logic_vector(31 downto 0);
h7 : in std_logic_vector(31 downto 0);
-- ready: presumably flags that digest is valid - TODO confirm.
ready : out std_logic;
-- digest: 256-bit result; this architecture slices it as eight 32-bit words,
-- most significant word first.
digest : out std_logic_vector(255 downto 0));
end component;
-- Table type for the 64 per-round constants.
type kconst is array ( 0 to 63 ) of std_logic_vector(31 downto 0);
-- 16-word x 32-bit buffer type used for the two message-schedule windows.
type mem is array ( 0 to 15 ) of std_logic_vector(31 downto 0);
-- SHA-256 round constants K[0..63] (FIPS 180-4, section 4.2.2).
-- Declared as a constant rather than a signal: the table is never assigned,
-- so a constant states the read-only intent and lets the tools treat it as a
-- ROM/lookup table instead of an initialised signal. Indexing (k(index)) is
-- unchanged for all users in this architecture.
constant k : kconst := (
    x"428a2f98", x"71374491", x"b5c0fbcf", x"e9b5dba5", x"3956c25b", x"59f111f1", x"923f82a4", x"ab1c5ed5",
    x"d807aa98", x"12835b01", x"243185be", x"550c7dc3", x"72be5d74", x"80deb1fe", x"9bdc06a7", x"c19bf174",
    x"e49b69c1", x"efbe4786", x"0fc19dc6", x"240ca1cc", x"2de92c6f", x"4a7484aa", x"5cb0a9dc", x"76f988da",
    x"983e5152", x"a831c66d", x"b00327c8", x"bf597fc7", x"c6e00bf3", x"d5a79147", x"06ca6351", x"14292967",
    x"27b70a85", x"2e1b2138", x"4d2c6dfc", x"53380d13", x"650a7354", x"766a0abb", x"81c2c92e", x"92722c85",
    x"a2bfe8a1", x"a81a664b", x"c24b8b70", x"c76c51a3", x"d192e819", x"d6990624", x"f40e3585", x"106aa070",
    x"19a4c116", x"1e376c08", x"2748774c", x"34b0bcb5", x"391c0cb3", x"4ed8aa4a", x"5b9cca4f", x"682e6ff3",
    x"748f82ee", x"78a5636f", x"84c87814", x"8cc70208", x"90befffa", x"a4506ceb", x"bef9a3f7", x"c67178f2"
);
-- first_mem: 16-word buffer, all zero at start. The sequencer process fills it
-- from ward (steps 0..15), then overwrites it in place with derived words; at
-- steps 129..136 it is reloaded from temp1 plus fixed words.
signal first_mem : mem:= ( x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000",
x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000");
-- second_mem: 16-word buffer. Word 4 starts as x"80000000" and word 15 as
-- x"00000280" (decimal 640); the sequencer writes ward into second_mem(j) at
-- steps 16..19 (j = 0..3 on the first pass after reset).
-- NOTE(review): the preset words look like SHA-256 padding for a 640-bit
-- message - confirm. They are overwritten once the in-place expansion runs.
signal second_mem : mem:= ( x"00000000", x"00000000", x"00000000", x"00000000", x"80000000", x"00000000", x"00000000", x"00000000",
x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000000", x"00000280");
-- enable: driven by the sequencer process, enable input of sha1.
signal enable : std_logic;
-- enable1: ready output of sha1, enable input of sha2.
signal enable1 : std_logic;
-- enable2: ready output of sha2.
signal enable2 : std_logic;
-- r_d: enable2 delayed by one clock, enable input of sha3.
signal r_d : std_logic;
-- k_in: registered k-table entry shared by all three cores.
signal k_in : std_logic_vector(31 downto 0);
-- ward_in / ward_in1 / ward_in2: registered word inputs of sha1 / sha2 / sha3.
signal ward_in : std_logic_vector(31 downto 0);
signal ward_in1 : std_logic_vector(31 downto 0);
signal ward_in2 : std_logic_vector(31 downto 0);
-- h0..h7: fixed starting-state words, assigned concurrently in the body.
signal h0,h1,h2,h3 : std_logic_vector(31 downto 0);
signal h4,h5,h6,h7 : std_logic_vector(31 downto 0);
-- temp: digest of sha1; its eight words are the h0..h7 inputs of sha2.
signal temp : std_logic_vector(255 downto 0);
-- temp1: digest of sha2; loaded word by word into first_mem and ward_in2.
signal temp1 : std_logic_vector(255 downto 0);
-- gama0/gama1 and gama4/gama5: registered rotate/shift/xor mixes of first_mem
-- words; gama2/gama3: the same mixes of second_mem words. All six are produced
-- by the last process in this architecture.
signal gama0 : std_logic_vector(31 downto 0);
signal gama1 : std_logic_vector(31 downto 0);
signal gama2 : std_logic_vector(31 downto 0);
signal gama3 : std_logic_vector(31 downto 0);
signal gama4 : std_logic_vector(31 downto 0);
signal gama5 : std_logic_vector(31 downto 0);
begin
-- Core 1: starts from the fixed h0..h7 words and consumes ward_in.
-- Its ready output enables core 2; its digest (temp) seeds core 2.
sha1 : sha256
    port map (
        clk    => clk,
        rst    => rst,
        enable => enable,
        ward   => ward_in,
        k      => k_in,
        h0     => h0,
        h1     => h1,
        h2     => h2,
        h3     => h3,
        h4     => h4,
        h5     => h5,
        h6     => h6,
        h7     => h7,
        ready  => enable1,
        digest => temp
    );

-- Core 2: starts from the digest of core 1 (most significant word on h0)
-- and consumes ward_in1. Its ready output is enable2, its digest is temp1.
sha2 : sha256
    port map (
        clk    => clk,
        rst    => rst,
        enable => enable1,
        ward   => ward_in1,
        k      => k_in,
        h0     => temp(255 downto 224),
        h1     => temp(223 downto 192),
        h2     => temp(191 downto 160),
        h3     => temp(159 downto 128),
        h4     => temp(127 downto 96 ),
        h5     => temp(95 downto 64 ),
        h6     => temp(63 downto 32 ),
        h7     => temp(31 downto 0 ),
        ready  => enable2,
        digest => temp1
    );

-- Core 3: starts again from the fixed h0..h7 words, is enabled by r_d
-- (enable2 delayed one clock) and consumes ward_in2. It drives the
-- entity outputs ready and hash.
sha3 : sha256
    port map (
        clk    => clk,
        rst    => rst,
        enable => r_d,
        ward   => ward_in2,
        k      => k_in,
        h0     => h0,
        h1     => h1,
        h2     => h2,
        h3     => h3,
        h4     => h4,
        h5     => h5,
        h6     => h6,
        h7     => h7,
        ready  => ready,
        digest => hash
    );
-- Fixed starting-state words fed to cores sha1 and sha3. The values are the
-- SHA-256 initial hash values H0..H7 from FIPS 180-4, section 5.3.3.
h0 <= x"6a09e667";  -- H0
h1 <= x"bb67ae85";  -- H1
h2 <= x"3c6ef372";  -- H2
h3 <= x"a54ff53a";  -- H3
h4 <= x"510e527f";  -- H4
h5 <= x"9b05688c";  -- H5
h6 <= x"1f83d9ab";  -- H6
h7 <= x"5be0cd19";  -- H7
-- Sequencer: a step counter (i) selects, on every rising clock edge, which word
-- and which k-table entry are registered for the three sha256 cores, and keeps
-- the two 16-word buffers updated in place.
--
-- Step map (branch conditions below):
--   0..15    store ward in first_mem(i), forward it on ward_in
--   16..19   additionally capture ward in second_mem(j)
--   16..63   replace first_mem(j) by a derived word, forward it on ward_in
--   64..79   drop enable, forward second_mem(j) on ward_in1
--   80..127  replace second_mem(j) by a derived word, forward it on ward_in1
--   128      idle
--   129..136 copy temp1, most significant word first, into first_mem(n)/ward_in2
--            (step 129 also presets first_mem(8..15))
--   137..144 forward first_mem(n) on ward_in2
--   145..192 replace first_mem(n) by a derived word, forward it on ward_in2
--   193      wrap the step counter
--
-- The derived word has the form gamaX + mem(a) + gamaY + mem(b), which matches
-- the shape of the SHA-256 message-schedule recurrence.
-- NOTE(review): confirm the index/latency alignment against the gama process.
--
-- Only enable is reset; the data registers and buffers keep their values.
process (clk, rst)
    -- Counters are range-constrained so that synthesis sizes them to the bits
    -- actually needed (8 and 5) instead of the 32 bits of a plain integer, and
    -- so that they start at a valid value (0) in simulation before reset.
    variable i : integer range 0 to 193;  -- step counter
    variable j : integer range 0 to 16;   -- buffer index, starts at 0; 16 is held until the wrap check
    variable m : integer range 0 to 16;   -- buffer index, starts at 9
    variable n : integer range 0 to 16;   -- buffer index, starts at 15
    variable l : integer range 0 to 16;   -- buffer index, starts at 8
begin
    if rst = '0' then
        enable <= '0';
        i := 0;
        j := 0;
        m := 9;
        n := 15;
        l := 8;
    elsif rising_edge(clk) then
        -- Wrap the modulo-16 indices before they are used in this cycle.
        if j = 16 then
            j := 0;
        end if;
        if m = 16 then
            m := 0;
        end if;
        if n = 16 then
            n := 0;
        end if;
        if l = 16 then
            l := 0;
        end if;

        if i = 193 then
            -- NOTE(review): the unconditional increment at the end of this
            -- process still runs, so the next step is 1, not 0, and the
            -- modulo-16 indices are not re-initialised - confirm this is the
            -- intended restart behaviour.
            i := 0;
        elsif i > 144 then
            first_mem(n) <= gama4 + first_mem(l) + gama5 + first_mem(n);
            ward_in2 <= gama4 + first_mem(l) + gama5 + first_mem(n);
            k_in <= k(i-129);
        elsif i > 136 then
            ward_in2 <= first_mem(n);
            k_in <= k(i-129);
        elsif i = 136 then
            first_mem(n) <= temp1(31 downto 0);
            ward_in2 <= temp1(31 downto 0);
            k_in <= k(i-129);
        elsif i = 135 then
            first_mem(n) <= temp1(63 downto 32);
            ward_in2 <= temp1(63 downto 32);
            k_in <= k(i-129);
        elsif i = 134 then
            first_mem(n) <= temp1(95 downto 64);
            ward_in2 <= temp1(95 downto 64);
            k_in <= k(i-129);
        elsif i = 133 then
            first_mem(n) <= temp1(127 downto 96);
            ward_in2 <= temp1(127 downto 96);
            k_in <= k(i-129);
        elsif i = 132 then
            first_mem(n) <= temp1(159 downto 128);
            ward_in2 <= temp1(159 downto 128);
            k_in <= k(i-129);
        elsif i = 131 then
            first_mem(n) <= temp1(191 downto 160);
            ward_in2 <= temp1(191 downto 160);
            k_in <= k(i-129);
        elsif i = 130 then
            first_mem(n) <= temp1(223 downto 192);
            ward_in2 <= temp1(223 downto 192);
            k_in <= k(i-129);
        elsif i = 129 then
            -- Preset the upper half of the buffer: x"80000000" in word 8 and
            -- x"00000100" (decimal 256) in word 15.
            -- NOTE(review): looks like SHA-256 padding for a 256-bit message
            -- - confirm.
            first_mem(15) <= x"00000100";
            first_mem(14) <= x"00000000";
            first_mem(13) <= x"00000000";
            first_mem(12) <= x"00000000";
            first_mem(11) <= x"00000000";
            first_mem(10) <= x"00000000";
            first_mem(9) <= x"00000000";
            first_mem(8) <= x"80000000";
            -- Assigned last on purpose: if n ever points into 8..15 this
            -- assignment overrides the preset above.
            first_mem(n) <= temp1(255 downto 224);
            ward_in2 <= temp1(255 downto 224);
            k_in <= k(i-129);
        elsif i = 128 then
            -- Idle step: nothing is updated.
            null;
        elsif i > 79 then
            second_mem(j) <= gama2 + second_mem(m) + gama3 + second_mem(j);
            ward_in1 <= gama2 + second_mem(m) + gama3 + second_mem(j);
            k_in <= k(i-64);
        elsif i > 63 then
            enable <= '0';
            ward_in1 <= second_mem(j);
            k_in <= k(i-64);
        elsif i > 19 then
            first_mem(j) <= gama0 + first_mem(m) + gama1 + first_mem(j);
            ward_in <= gama0 + first_mem(m) + gama1 + first_mem(j);
            k_in <= k(i);
            enable <= '1';
        elsif i > 15 then
            second_mem(j) <= ward;
            first_mem(j) <= gama0 + first_mem(m) + gama1 + first_mem(j);
            ward_in <= gama0 + first_mem(m) + gama1 + first_mem(j);
            k_in <= k(i);
            enable <= '1';
        elsif i >= 0 then
            first_mem(i) <= ward;
            ward_in <= ward;
            k_in <= k(i);
            enable <= '1';
        end if;

        -- All counters advance every clock, including the wrap step.
        i := i + 1;
        j := j + 1;
        m := m + 1;
        n := n + 1;
        l := l + 1;
    end if;
end process;
-- One-clock delay of enable2 (ready of core sha2); the delayed copy r_d is the
-- enable input of core sha3. Cleared by the asynchronous, active-low reset.
process (clk, rst)
begin
    if rst = '0' then
        r_d <= '0';
    -- rising_edge only fires on a real '0' -> '1' transition, unlike
    -- clk'event and clk = '1', which also fires on 'U'/'X' -> '1' in simulation.
    elsif rising_edge(clk) then
        r_d <= enable2;
    end if;
end process;
-- Registers the six rotate/shift/xor mixes (gama0..gama5) of buffer words that
-- the sequencer process adds into its derived words. Four modulo-16 indices
-- walk through the buffers, one position per clock.
-- Only the indices are reset; the gama registers keep their values.
process (clk, rst)
    -- rotr 7 xor rotr 18 xor shr 3 of a 32-bit word (the SHA-256 "small sigma 0"
    -- of FIPS 180-4, section 4.1.2).
    function sigma0 (x : std_logic_vector(31 downto 0)) return std_logic_vector is
    begin
        return (x(6 downto 0) & x(31 downto 7))
           xor (x(17 downto 0) & x(31 downto 18))
           xor ("000" & x(31 downto 3));
    end sigma0;

    -- rotr 17 xor rotr 19 xor shr 10 of a 32-bit word (the SHA-256 "small sigma 1").
    function sigma1 (x : std_logic_vector(31 downto 0)) return std_logic_vector is
    begin
        return (x(16 downto 0) & x(31 downto 17))
           xor (x(18 downto 0) & x(31 downto 19))
           xor ("0000000000" & x(31 downto 10));
    end sigma1;

    -- Range-constrained indices: 5 bits each after synthesis instead of 32, and
    -- a valid starting value (0) in simulation, so a clock edge that arrives
    -- before reset can no longer index the buffers out of range.
    variable f : integer range 0 to 16;  -- starts at 2
    variable j : integer range 0 to 16;  -- starts at 15
    variable l : integer range 0 to 16;  -- starts at 1
    variable m : integer range 0 to 16;  -- starts at 14
begin
    if rst = '0' then
        f := 2;
        j := 15;
        l := 1;
        m := 14;
    elsif rising_edge(clk) then
        -- Wrap the indices before they are used in this cycle.
        if j = 16 then
            j := 0;
        end if;
        if f = 16 then
            f := 0;
        end if;
        if l = 16 then
            l := 0;
        end if;
        if m = 16 then
            m := 0;
        end if;

        -- Mixes of first_mem words used at sequencer steps 16..63.
        gama0 <= sigma0(first_mem(f));
        gama1 <= sigma1(first_mem(j));
        -- Mixes of first_mem words used at sequencer steps 145..192.
        gama4 <= sigma0(first_mem(l));
        gama5 <= sigma1(first_mem(m));
        -- Mixes of second_mem words used at sequencer steps 80..127.
        gama2 <= sigma0(second_mem(f));
        gama3 <= sigma1(second_mem(j));

        f := f + 1;
        j := j + 1;
        l := l + 1;
        m := m + 1;
    end if;
end process;
end;
elsif, i.e. "priority en/decoding", will affect the frequency of your design. With all the available logic resources you have left, you might consider a case statement... unless you actually need priority en/decoding. Even then, if you can afford latency trade-offs, you can do the decoding over several clock cycles (pipeline the decode), and your design's frequency will likely increase... Ultimately, you need to run a timing report and look at the slow paths to understand the bottlenecks.
If you really want to use RAM instead of FFs, you could infer a RAM (create an array), or, if that doesn't work for you, you could manually instantiate a device-specific RAM... and then, of course, add the control logic for it. If you instantiate a primitive, black-box it so that you can later swap it for the "same" ASIC library primitive.
As far as "variables" go, the discussion is like "VHDL" vs. "Verilog" or "synchronous" vs. "asynchronous" resets: mostly just opinions, and mine is, "I am not a fan of variables in synthesizable RTL". They are legal for synthesis, but they "disappear" during synthesis, so that if you ever want to look at a netlist and compare it to your RTL, you have to trace the connections manually. There is usually not a good reason to have variables, as they represent nothing in terms of hardware and obfuscate the design vs. the netlist. I like to see logic types of wire/net/reg, so that it is clear what you are creating in HW. But, as you please; I just tend to cringe when I see them.
Along the same lines, in terms of arrays, I am not a big fan of "bundling signals into arrays"... folks will argue it is "faster" and "easier" to deal with, but to me, it further obfuscates the design. Again, not illegal, but when it comes to OPC (Other people's code) it can be very annoying trying to trace signals, not only within a module, but, arrays across ports too... and then, if they slice those arrays, or decimate them otherwise, it gets even more annoying. Kind of like this rant :)
Ultimately, you can do whatever you want, and, especially in an FPGA, some folks tend to pay less attention to the details of what will be created than they would for an ASIC. If you are designing an ASIC, I would say you should err on the side of being more pedantic: be able to look at your RTL and know (to some extent) what is going to be created, and as such be able to estimate the gate count if you need to. To this end, I highly recommend taking the time to draw your design in a drawing program (e.g. Visio), including gates, FFs, decoders, muxes, FSMs, pseudo-code where appropriate, details of your clock and reset trees, all CDC (clock-domain crossing) logic, etc., with signal names. Once you have that, it is just a matter of translating it to RTL... and, as a bonus for those who share my opinion on variables, you will likely find no variables in your drawing and thus none in your RTL. :)