I am writing a VHDL module to convert an incoming stream via axi stream (tdata, tvalid, tready and tlast) with tdata's with 8 bits such that the fist 4 bytes are registered in the output port A of 32 bits and the consecutive 4 bytes out to port B also of 32 bits. Not that, but the goal is to have valid and ready for A and B such that valid is set when all 4 bytes have been registered and it holds their values until ready is 1 (indicating that has been read, by the following module or an HLS IP).
For that, I wrote a state machine with two states that go back and forth. I also wrote a vhdl module to read hex numbers from a .txt file and converts it to the axi stream.
The issue is that the post-synthesis simulation behaves as expected (sort of). I would have expected, as tdata is registered, that would imply having a one-clock cycle delay, which is not in the post synthesis simulation but it is in the behavioral simulation. There is an issue there, because in the behavioral simulation, the output of A and B is not as in the post-synthesis simulation (which is correct. A=ddccbbaa and B=44332211).
Here is the post-synthesis simulation:
Here is the behavioral simulation:
The code is:
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity a2m is
port (
-- AXI Stream Interface
clk : in std_logic;
reset : in std_logic;
tdata : in std_logic_vector(7 downto 0);
tvalid : in std_logic;
tready : out std_logic;
tlast : in std_logic;
test_port : out std_logic_vector(7 downto 0);
-- Port A and B
A : out std_logic_vector(31 downto 0);
A_valid : out std_logic;
A_ready : in std_logic;
B : out std_logic_vector(31 downto 0);
B_valid : out std_logic;
B_ready : in std_logic
);
end a2m;
architecture fsm of a2m is
-- State Machine States
type state_type is (RECEIVE_A, RECEIVE_B);
signal state : state_type := RECEIVE_A;
signal test_port_reg : std_logic_vector(7 downto 0) := (others => '0');
-- Internal data storage
signal A_reg : std_logic_vector(31 downto 0) := (others => '0');
signal B_reg : std_logic_vector(31 downto 0) := (others => '0');
signal a_count : integer range 0 to 3 := 0;
signal a_count_en : std_logic := '1';
signal b_count : integer range 0 to 3 := 0;
-- Valid signals
signal A_valid_reg : std_logic := '0';
signal A_ready_reg : std_logic := '0';
signal B_valid_reg : std_logic := '0';
signal B_ready_reg : std_logic := '0';
begin
-- State Machine Process
process(clk, reset)
begin
if rising_edge(clk) then
if reset = '1' then
-- Reset all registers and state machine
state <= RECEIVE_A;
A_reg <= (others => '0');
B_reg <= (others => '0');
A_valid_reg <= '0';
B_valid_reg <= '0';
a_count <= 0;
b_count <= 0;
else
test_port_reg <= tdata;
case state is
when RECEIVE_A =>
if tvalid = '1' then
case a_count is
when 0 =>
A_reg(7 downto 0) <= tdata;
when 1 =>
A_reg(15 downto 8) <= tdata;
when 2 =>
A_reg(23 downto 16) <= tdata;
when 3 =>
A_reg(31 downto 24) <= tdata;
A_valid_reg <= '1'; -- All 4 bytes for A received
when others =>
null;
end case;
if(a_count<3) then
a_count <= a_count + 1;
end if;
end if;
-- Hold until A_ready is asserted
if A_valid_reg = '1' and A_ready = '1' then
A_valid_reg <= '0'; -- Clear A_valid when ready is high
a_count <= 0; -- Reset a_count for B reception
state <= RECEIVE_B; -- Move to receive B
end if;
when RECEIVE_B =>
if tvalid = '1' then
case b_count is
when 0 =>
B_reg(7 downto 0) <= tdata;
when 1 =>
B_reg(15 downto 8) <= tdata;
when 2 =>
B_reg(23 downto 16) <= tdata;
when 3 =>
B_reg(31 downto 24) <= tdata;
B_valid_reg <= '1'; -- All 4 bytes for B received
when others =>
null;
end case;
if(b_count<3) then
b_count <= b_count + 1;
end if;
end if;
-- Hold until B_ready is asserted
if B_valid_reg = '1' and B_ready = '1' then
B_valid_reg <= '0'; -- Clear B_valid when ready is high
b_count <= 0;
state <= RECEIVE_A; -- Return to RECEIVE_A for the next frame
end if;
end case;
end if;
end if;
end process;
-- Output assignments
A <= A_reg;
A_valid <= A_valid_reg;
B <= B_reg;
B_valid <= B_valid_reg;
test_port <= test_port_reg;
-- tready logic: Assert when the state machine is ready to accept data
tready <= '1' when ((state = RECEIVE_A and a_count < 4 and A_valid_reg='0') or (state = RECEIVE_B and b_count < 4 and B_valid_reg='0')) else
'0';
end architecture;
file reader:
library IEEE;
use IEEE.STD_LOGIC_1164.all;
use IEEE.NUMERIC_STD.all;
use STD.TEXTIO.all;
use IEEE.std_logic_textio.all;
entity file_to_axi_stream is
port (
clk : in std_logic;
rst : in std_logic;
-- AXI Stream output interface
m_axis_tdata : out std_logic_vector(7 downto 0);
m_axis_tvalid : out std_logic;
m_axis_tready : in std_logic;
m_axis_tlast : out std_logic
);
end file_to_axi_stream;
architecture Behavioral of file_to_axi_stream is
signal valid : std_logic := '0';
signal last : std_logic := '0';
signal s_tdata : std_logic_vector(7 downto 0) := (others=>'0');
signal file_opened : boolean := true;
signal total_length : std_logic_vector(31 downto 0) := (others=>'0');
file input_file : text;
-- TODO read same file N times
begin
file_open(input_file, "/tmp/stimulus_input.txt", read_mode);
process (clk, rst)
variable row_input : line;
variable v_number : std_logic_vector(7 downto 0);
variable count : integer := 0; -- Used variable to count the first 4 bytes (for total length)
begin
if (rising_edge(clk)) then
if (rst = '1') then
valid <= '0';
last <= '0';
else
last <= '0';
valid <= '1';
if(file_opened=true) then
if(count=(to_integer(unsigned(total_length))+4)) then
last <= '1';
file_opened <= false;
else
if(m_axis_tready='1') then
readline(input_file, row_input);
hread(row_input, v_number);
s_tdata <= v_number;
if(count<4) then
total_length(count*8+7 downto count*8) <= v_number;
end if;
count := count + 1;
end if;
end if;
else
valid <= '0';
last <= '0';
s_tdata <= (others=>'0');
end if;
end if;
end if;
end process;
m_axis_tvalid <= valid;
m_axis_tdata <= s_tdata;
m_axis_tlast <= last;
end Behavioral;
testbench:
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
entity tb_a2m is
end tb_a2m;
architecture sim of tb_a2m is
-- Signals to connect to the DUT
signal clk : std_logic := '0';
signal reset : std_logic := '0';
signal tdata : std_logic_vector(7 downto 0) := (others => '0');
signal tvalid : std_logic := '0';
signal tready : std_logic;
signal tlast : std_logic := '0';
signal test_port : std_logic_vector(7 downto 0) := (others => '0');
signal A : std_logic_vector(31 downto 0);
signal A_valid : std_logic;
signal A_ready : std_logic := '0';
signal B : std_logic_vector(31 downto 0);
signal B_valid : std_logic;
signal B_ready : std_logic := '0';
-- Clock period definition
constant clk_period : time := 10 ns;
component file_to_axi_stream
port (
clk : in std_logic;
rst : in std_logic;
-- AXI Stream output interface
m_axis_tdata : out std_logic_vector(7 downto 0);
m_axis_tvalid : out std_logic;
m_axis_tready : in std_logic;
m_axis_tlast : out std_logic
);
end component;
begin
-- DUT Instantiation
uut: entity work.a2m
port map (
clk => clk,
reset => reset,
tdata => tdata,
tvalid => tvalid,
tready => tready,
tlast => tlast,
test_port => test_port,
A => A,
A_valid => A_valid,
A_ready => A_ready,
B => B,
B_valid => B_valid,
B_ready => B_ready
);
-- Clock generation
clk_process : process
begin
clk <= '1';
wait for clk_period/2;
clk <= '0';
wait for clk_period/2;
end process;
file_reader : file_to_axi_stream port map(
clk => clk,
rst => reset,
m_axis_tdata => tdata,
m_axis_tvalid => tvalid,
m_axis_tready => tready,
m_axis_tlast => tlast
);
-- Stimulus process
stim_proc: process
begin
-- Initialize
reset <= '1';
wait for 2*clk_period;
reset <= '0';
wait for 6*clk_period;
A_ready <= '1';
wait for 1*clk_period;
A_ready <= '0';
wait for 10*clk_period;
B_ready <= '1';
wait for 1*clk_period;
B_ready <= '0';
wait;
end process;
end architecture;
Ignore that the file to read and convert to axi stream reads the first 4 bytes to get how many bytes are in total in the file.
The file to test simply looks like this:
AA
BB
CC
DD
11
22
33
44
AA
BB
CC
DD
11
22
33
44
I am testing under ubuntu 22.04 with vivado 2024.1.
Any help will be much appreciated.
The problem is in the simulation. All inputs are set on the rising edge of the clock. I shifted the clock process to:
-- Clock generation
clk_process : process
begin
clk <= '0';
wait for clk_period/2;
clk <= '1';
wait for clk_period/2;
end process;
Also, the process in the file_reader
is also changed from rising_edge to falling_edge.
Now, at every rising edge of the clock, the module being tested has all inputs (stimulus) already set when its process is triggered.