FPGA – How to Convert Verilog Code to For-Loop Syntax

fpga, synthesis, verilog

I have built a block that computes a simple moving average over "factor" numbers in the vector, and it works well for my needs.

My problem with it is that I think my syntax is a bit dumb.
I have an array and I push my data into the array; using the factor input (which can be 2, 4, 8, 16 or 32) I accumulate the data signals and then do a right shift to divide by the factor value.

I read that using for loops is not recommended, but maybe that is just nonsense and for loops in synthesis are perfectly fine.

Can someone confirm that using a for loop for my need is fine, and show how it can be done here so that it can be synthesized?

my code:

// Moving-average block.
//
// Samples from `din` are pushed into a 32-entry delay line (`din_dly`). The
// newest `factor` entries are summed into `dout_sum`, and `dout_shift` holds
// the previously registered sum shifted right by log2(factor), i.e. divided
// by `factor`. Only factor = 2, 4, 8, 16 or 32 is acted on; for any other
// value the delay line and both result registers hold their state.
//
// All per-tap work is written as `for` loops with a constant bound
// (MAX_TAPS), so synthesis can unroll them; `factor` only gates what each
// unrolled iteration does.
module average #
(
    --parameters--
)
(
    input                           clk,
    input                           rst_n,
    input  [FACTOR_WIDTH-1 : 0]     factor,  // Average (2, 4, 8, 16, 32)
    input  [INPUT_WIDTH-1  : 0]     din,

    --more inputs outputs--

);

    // Depth of the delay line, which is also the largest supported factor.
    localparam MAX_TAPS = 32;

    reg         [INPUT_WIDTH-1 :0]  din_dly [0:MAX_TAPS-1];
    reg         [OUTPUT_WIDTH-1:0]  dout_sum;
    reg         [OUTPUT_WIDTH-1:0]  dout_shift;

    // Loop variable for the initial block (it was previously used undeclared).
    integer                         index;

    // High only for the factor values this block acts on.
    wire factor_valid = (factor == 2)  || (factor == 4) || (factor == 8)
                     || (factor == 16) || (factor == 32);

    // log2(factor): the right-shift amount that divides the sum by factor.
    reg [2:0] shift_amt;
    always @* begin
        case (factor)
            2:       shift_amt = 3'd1;
            4:       shift_amt = 3'd2;
            8:       shift_amt = 3'd3;
            16:      shift_amt = 3'd4;
            32:      shift_amt = 3'd5;
            default: shift_amt = 3'd0;   // unused: gated by factor_valid
        endcase
    end


    initial begin
        dout_sum    = {OUTPUT_WIDTH{1'b0}};
        dout_shift  = {OUTPUT_WIDTH{1'b0}};

        for (index = 0; index < MAX_TAPS; index = index + 1) begin
            din_dly[index]  = {INPUT_WIDTH{1'b0}};
        end
    end


    always @(posedge clk or negedge rst_n) begin : average_logic
        // Block-local scratch: always written before it is read, so it is
        // pure combinational logic and does not infer a register.
        integer                 tap;
        reg [OUTPUT_WIDTH-1:0]  window_sum;

        if (~rst_n) begin
            dout_sum     <= {OUTPUT_WIDTH{1'b0}};
            dout_shift   <= {OUTPUT_WIDTH{1'b0}};
            --flags=0--
        end else begin
            if (--flags--) begin
                if (factor_valid) begin
                    // Constant-bound loop; taps at or beyond `factor`
                    // contribute nothing. This unrolls to an adder chain
                    // with per-tap gating, which may limit f-max.
                    window_sum = {OUTPUT_WIDTH{1'b0}};
                    for (tap = 0; tap < MAX_TAPS; tap = tap + 1) begin
                        if (tap < factor) begin
                            window_sum = window_sum + din_dly[tap];
                        end
                    end
                    dout_sum     <= window_sum;
                    // Uses the sum registered on the previous cycle.
                    dout_shift   <= dout_sum >> shift_amt;    // dout_sum / factor
                end

                --logic--

            end else begin
                --logic--
            end
        end
    end

    always @(posedge clk or negedge rst_n) begin : delay_line
        integer k;

        if (~rst_n) begin
            for (k = 0; k < MAX_TAPS; k = k + 1) begin
                din_dly[k] <= {INPUT_WIDTH{1'b0}};
            end
        end else begin
            if (--flag--) begin
                if (factor_valid) begin
                    // Shift only the first `factor` entries; the rest hold.
                    din_dly[0] <= din;
                    for (k = 1; k < MAX_TAPS; k = k + 1) begin
                        if (k < factor) begin
                            din_dly[k] <= din_dly[k-1];
                        end
                    end
                end
            end
            // Placed after the shift so that, when both conditions are true,
            // the clear is the last nonblocking assignment and wins.
            if (--some flags--) begin
                for (k = 0; k < MAX_TAPS; k = k + 1) begin
                    din_dly[k] <= {INPUT_WIDTH{1'b0}};
                end
            end
        end
    end
endmodule

thanks!

Best Answer

For loops in Verilog can be used for synthesis. The caveat is that the number of loop iterations must be known at synthesis time, because the tools will unroll the loop into hardware. This means your loop limit must be a constant, a parameter/localparam, or a genvar.

As such if you want to use for loops in your code, you'll need to find a way to factor it such that you have a constant number of loops. One such example might be:

// Loop index. An integer is used so the constant bound (i < 32) is always
// reachable; a sized reg that is too narrow to hold 32 would wrap and the
// loop would never terminate.
integer i;
// Scratch accumulator: written with blocking assignments and always assigned
// before it is read, so it is combinational and infers no register.
reg [OUTPUT_WIDTH-1:0] sum_next;

always @ (posedge clk ... ) begin
    ...
    sum_next = {OUTPUT_WIDTH{1'b0}};
    // Constant 32 iterations so the loop can be unrolled; iterations at or
    // beyond `factor` add zero instead of a sample.
    for (i = 0; i < 32; i = i + 1) begin
         sum_next = sum_next + ( (i < factor) ? din_dly[i] : {OUTPUT_WIDTH{1'b0}} );
    end
    // Register the result with a nonblocking assignment so anything else
    // reading dout_sum on this clock edge still sees the previous value.
    dout_sum <= sum_next;
    ...
end

That way the loop always runs the same number of iterations, but in (32 - factor) of them you simply add zero instead of a sample. This will result in a chain of adders and multiplexers, which may not give a high f-max.

You would have to reconcile how to do dout_shift. This could be done with a simple lookup table to convert factor into how many bits to shift.


An alternate solution would be a generate for block which makes one set of logic for each different factor.

// factorLog2 carries log2(factor) as a value 0-5; the elided logic below is
// expected to derive it from factor, e.g. with a simple case statement.
reg [5:0] factorLog2;
...

// One candidate result per window size: entry k is the registered sum of the
// first 2**k delay-line entries, shifted right by k.
reg [OUTPUT_WIDTH-1:0] dout_shift_all [5:0];
genvar i;
generate for (i = 0; i < 6; i = i + 1) begin : factor_loop
    reg [OUTPUT_WIDTH-1:0] window_sum;     // combinational sum for this window size
    reg [OUTPUT_WIDTH-1:0] window_sum_q;   // pipelined copy of window_sum
    integer tap;

    // Combinational: add up the first (1 << i) delay-line entries. The bound
    // is constant for each generated instance, so the loop unrolls.
    always @ * begin
        window_sum = {OUTPUT_WIDTH{1'b0}};
        for (tap = 0; tap < (1 << i); tap = tap + 1) begin
             window_sum = din_dly[tap] + window_sum;
        end
    end

    // Pipeline stage for the sum.
    always @ (posedge clk) begin
        window_sum_q <= window_sum;
    end

    // Divide by this instance's window size. This could be registered too,
    // at the cost of one more clock cycle of latency than the original code.
    always @ * begin
       dout_shift_all[i] = window_sum_q >> i;
    end
end endgenerate

// Existing clocked logic, tweaked to pick the pre-computed average for the
// currently selected factor out of dout_shift_all.
always @ (posedge clk ... ) begin
    ...
    // Target is the module's dout_shift register (no dout_sum_shift signal is
    // declared), written with a nonblocking assignment as in the rest of the
    // clocked logic.
    dout_shift <= dout_shift_all[factorLog2];
    ...
end

This would produce more logic but would be faster as it's more parallel and pipelineable.