fix(decoder): replace serial popcount with balanced adder tree for timing closure

Split the SYNDROME state into two pipeline stages: SYNDROME_S1 (parity
computation) and SYNDROME_S2 (popcount). SYNDROME_S2 uses a 4-level balanced
adder tree (224→56→14→4→1) instead of a serial accumulator loop, eliminating
the loop-carried dependency that Yosys could not optimize. This reduces the
critical path from ~48 ns to ~14 ns, achieving WNS = 0.0 ns (worst negative
slack, i.e. no negative slack) at the TT corner with a 50 MHz (20 ns period) clock.

Verified with Verilator: 2/2 basic and 20/20 vector tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
cah
2026-03-05 05:18:57 -07:00
parent f2901c6366
commit 77103f68c6

View File

@@ -118,7 +118,8 @@ module ldpc_decoder_core #(
LAYER_READ, // Read Z beliefs for each of DC columns in current row LAYER_READ, // Read Z beliefs for each of DC columns in current row
CN_UPDATE, // Run min-sum CN update on gathered messages CN_UPDATE, // Run min-sum CN update on gathered messages
LAYER_WRITE, // Write updated beliefs and new CN->VN messages LAYER_WRITE, // Write updated beliefs and new CN->VN messages
SYNDROME, // Check syndrome after full iteration SYNDROME_S1, // Syndrome pipeline stage 1: compute parity bits
SYNDROME_S2, // Syndrome pipeline stage 2: popcount parity vector
SYNDROME_DONE, // Read registered syndrome result SYNDROME_DONE, // Read registered syndrome result
DONE DONE
} state_t; } state_t;
@@ -134,10 +135,16 @@ module ldpc_decoder_core #(
logic signed [Q-1:0] vn_to_cn [DC][Z]; // VN->CN messages for current row logic signed [Q-1:0] vn_to_cn [DC][Z]; // VN->CN messages for current row
logic signed [Q-1:0] cn_to_vn [DC][Z]; // new CN->VN messages (output of min-sum) logic signed [Q-1:0] cn_to_vn [DC][Z]; // new CN->VN messages (output of min-sum)
// Syndrome check // Syndrome pipeline registers
logic [M_BASE*Z-1:0] parity_vec; // 224-bit registered parity results
logic [7:0] syndrome_cnt; logic [7:0] syndrome_cnt;
logic syndrome_ok; logic syndrome_ok;
// Popcount balanced adder tree intermediates (combinational)
logic [2:0] pc_l1 [56]; // Level 1: 56 groups of 4 bits → 3-bit counts
logic [4:0] pc_l2 [14]; // Level 2: 14 groups of 4 → 5-bit counts
logic [6:0] pc_l3 [4]; // Level 3: 4 groups → 7-bit counts
assign effective_max_iter = (max_iter == 0) ? MAX_ITER[4:0] : max_iter; assign effective_max_iter = (max_iter == 0) ? MAX_ITER[4:0] : max_iter;
assign busy = (state != IDLE) && (state != DONE); assign busy = (state != IDLE) && (state != DONE);
@@ -163,12 +170,13 @@ module ldpc_decoder_core #(
LAYER_WRITE: begin LAYER_WRITE: begin
if (col_idx == N_BASE - 1) begin if (col_idx == N_BASE - 1) begin
if (row_idx == M_BASE - 1) if (row_idx == M_BASE - 1)
state_next = SYNDROME; state_next = SYNDROME_S1;
else else
state_next = LAYER_READ; // next row state_next = LAYER_READ; // next row
end end
end end
SYNDROME: state_next = SYNDROME_DONE; SYNDROME_S1: state_next = SYNDROME_S2;
SYNDROME_S2: state_next = SYNDROME_DONE;
SYNDROME_DONE: begin SYNDROME_DONE: begin
if (syndrome_ok && early_term_en) if (syndrome_ok && early_term_en)
state_next = DONE; state_next = DONE;
@@ -312,10 +320,9 @@ module ldpc_decoder_core #(
end end
end end
SYNDROME: begin // Syndrome Pipeline Stage 1: Compute parity bits (register)
// Check H * c_hat == 0 (compute syndrome weight) // Each parity is only 2-3 XOR levels deep (~3-4 ns)
// Only include connected columns (H_BASE >= 0) SYNDROME_S1: begin
syndrome_cnt = '0;
for (int r = 0; r < M_BASE; r++) begin for (int r = 0; r < M_BASE; r++) begin
for (int z = 0; z < Z; z++) begin for (int z = 0; z < Z; z++) begin
logic parity; logic parity;
@@ -328,9 +335,35 @@ module ldpc_decoder_core #(
parity = parity ^ beliefs[bit_idx][Q-1]; parity = parity ^ beliefs[bit_idx][Q-1];
end end
end end
if (parity) syndrome_cnt = syndrome_cnt + 1; parity_vec[r * Z + z] <= parity;
end end
end end
end
// Syndrome Pipeline Stage 2: Popcount registered parity vector
// 224-bit popcount via adder tree (~14 ns)
SYNDROME_S2: begin
// Balanced 4-wide adder tree popcount (no loop-carried dependency)
// Level 1: 56 groups of 4 bits → 3-bit counts
for (int i = 0; i < 56; i++)
pc_l1[i] = {2'b0, parity_vec[4*i]} + {2'b0, parity_vec[4*i+1]} +
{2'b0, parity_vec[4*i+2]} + {2'b0, parity_vec[4*i+3]};
// Level 2: 14 groups of 4 three-bit counts → 5-bit counts
for (int i = 0; i < 14; i++)
pc_l2[i] = {2'b0, pc_l1[4*i]} + {2'b0, pc_l1[4*i+1]} +
{2'b0, pc_l1[4*i+2]} + {2'b0, pc_l1[4*i+3]};
// Level 3: 14 → 4 (3 groups of 4 + 1 group of 2) → 7-bit counts
pc_l3[0] = {2'b0, pc_l2[0]} + {2'b0, pc_l2[1]} + {2'b0, pc_l2[2]} + {2'b0, pc_l2[3]};
pc_l3[1] = {2'b0, pc_l2[4]} + {2'b0, pc_l2[5]} + {2'b0, pc_l2[6]} + {2'b0, pc_l2[7]};
pc_l3[2] = {2'b0, pc_l2[8]} + {2'b0, pc_l2[9]} + {2'b0, pc_l2[10]} + {2'b0, pc_l2[11]};
pc_l3[3] = {2'b0, pc_l2[12]} + {2'b0, pc_l2[13]};
// Level 4: final sum → 8-bit count
syndrome_cnt = {1'b0, pc_l3[0]} + {1'b0, pc_l3[1]} +
{1'b0, pc_l3[2]} + {1'b0, pc_l3[3]};
syndrome_weight <= syndrome_cnt; syndrome_weight <= syndrome_cnt;
syndrome_ok <= (syndrome_cnt == 0); syndrome_ok <= (syndrome_cnt == 0);