fix(decoder): split CN_UPDATE into pipelined CN_STAGE1/CN_STAGE2

Split the monolithic CN_UPDATE state into two registered pipeline stages:
- CN_STAGE1: sign/magnitude extract and min-find (registered)
- CN_STAGE2: extrinsic output generation
This roughly halves the critical path through the CN update logic, at the cost
of one extra cycle per layer (17 → 18).

Also updates FSM comments to reflect actual cycle counts:
18 cycles/layer (8 read + 1 CN_STAGE1 + 1 CN_STAGE2 + 8 write) × 7 layers + 3 (syndrome) = 129 cycles/iteration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
cah
2026-03-10 19:42:09 -06:00
parent 77103f68c6
commit 10ddb70fa0

View File

@@ -2,14 +2,11 @@
//
// Layered scheduling processes one base-matrix row at a time.
// For each row, we:
// 1. Read VN beliefs for all Z columns connected to this row
// 2. Subtract old CN->VN messages to get VN->CN messages
// 3. Run CN min-sum update
// 4. Add new CN->VN messages back to VN beliefs
// 5. Write updated beliefs back
//
// This converges ~2x faster than flooding and needs only one message memory
// (CN->VN messages for current layer, overwritten each layer).
// 1. LAYER_READ (8 cycles): Read beliefs, subtract old messages → vn_to_cn
// 2. CN_STAGE1 (1 cycle): Sign/mag extract, min-find (registered)
// 3. CN_STAGE2 (1 cycle): Extrinsic output generation
// 4. LAYER_WRITE (8 cycles): Write beliefs + update CN->VN messages
// Total: 18 cycles/layer × 7 layers + 3 (syndrome) = 129 cycles/iteration
module ldpc_decoder_core #(
parameter N_BASE = 8,
@@ -116,8 +113,9 @@ module ldpc_decoder_core #(
IDLE,
INIT, // Initialize beliefs from channel LLRs, zero messages
LAYER_READ, // Read Z beliefs for each of DC columns in current row
CN_UPDATE, // Run min-sum CN update on gathered messages
LAYER_WRITE, // Write updated beliefs and new CN->VN messages
CN_STAGE1, // Pipeline stage 1: sign/mag extract, min-find
CN_STAGE2, // Pipeline stage 2: extrinsic output generation
LAYER_WRITE, // Write beliefs + update CN->VN messages
SYNDROME_S1, // Syndrome pipeline stage 1: compute parity bits
SYNDROME_S2, // Syndrome pipeline stage 2: popcount parity vector
SYNDROME_DONE, // Read registered syndrome result
@@ -131,9 +129,16 @@ module ldpc_decoder_core #(
logic [2:0] col_idx; // current column being read/written (0..N_BASE-1)
logic [4:0] effective_max_iter;
// Working registers for current layer CN update
logic signed [Q-1:0] vn_to_cn [DC][Z]; // VN->CN messages for current row
logic signed [Q-1:0] cn_to_vn [DC][Z]; // new CN->VN messages (output of min-sum)
// Working registers for current layer
logic signed [Q-1:0] vn_to_cn [DC][Z];
logic signed [Q-1:0] cn_to_vn [DC][Z];
// CN pipeline stage 1 intermediate registers
logic [DC-1:0] s1_signs [Z];
logic s1_sign_xor [Z];
logic [Q-2:0] s1_min1 [Z];
logic [Q-2:0] s1_min2 [Z];
logic [2:0] s1_min1_idx [Z];
// Syndrome pipeline registers
logic [M_BASE*Z-1:0] parity_vec; // 224-bit registered parity results
@@ -165,14 +170,15 @@ module ldpc_decoder_core #(
case (state)
IDLE: if (start) state_next = INIT;
INIT: state_next = LAYER_READ;
LAYER_READ: if (col_idx == N_BASE - 1) state_next = CN_UPDATE;
CN_UPDATE: state_next = LAYER_WRITE;
LAYER_READ: if (col_idx == N_BASE - 1) state_next = CN_STAGE1;
CN_STAGE1: state_next = CN_STAGE2;
CN_STAGE2: state_next = LAYER_WRITE;
LAYER_WRITE: begin
if (col_idx == N_BASE - 1) begin
if (row_idx == M_BASE - 1)
state_next = SYNDROME_S1;
else
state_next = LAYER_READ; // next row
state_next = LAYER_READ;
end
end
SYNDROME_S1: state_next = SYNDROME_S2;
@@ -183,7 +189,7 @@ module ldpc_decoder_core #(
else if (iter_cnt >= effective_max_iter)
state_next = DONE;
else
state_next = LAYER_READ; // next iteration
state_next = LAYER_READ;
end
DONE: if (!start) state_next = IDLE;
default: state_next = IDLE;
@@ -269,43 +275,86 @@ module ldpc_decoder_core #(
col_idx <= col_idx + 1;
end
CN_UPDATE: begin
// Min-sum update for all Z check nodes in current row
// Each CN has DC=8 incoming messages (one per column)
// =============================================================
// CN Pipeline Stage 1: Extract signs/mags, find min1/min2
// =============================================================
CN_STAGE1: begin
for (int z = 0; z < Z; z++) begin
// Min-sum: pass individual VN->CN messages directly
cn_min_sum(vn_to_cn[0][z], vn_to_cn[1][z],
vn_to_cn[2][z], vn_to_cn[3][z],
vn_to_cn[4][z], vn_to_cn[5][z],
vn_to_cn[6][z], vn_to_cn[7][z],
cn_to_vn[0][z], cn_to_vn[1][z],
cn_to_vn[2][z], cn_to_vn[3][z],
cn_to_vn[4][z], cn_to_vn[5][z],
cn_to_vn[6][z], cn_to_vn[7][z]);
logic [DC-1:0] signs_w;
logic sign_xor_w;
logic [Q-2:0] mags_w [DC];
logic [Q-2:0] min1_w, min2_w;
int min1_idx_w;
sign_xor_w = 1'b0;
for (int i = 0; i < DC; i++) begin
logic [Q-1:0] abs_val;
signs_w[i] = vn_to_cn[i][z][Q-1];
if (vn_to_cn[i][z][Q-1]) begin
abs_val = ~vn_to_cn[i][z] + 1'b1;
mags_w[i] = (abs_val[Q-1]) ? {(Q-1){1'b1}} : abs_val[Q-2:0];
end else begin
mags_w[i] = vn_to_cn[i][z][Q-2:0];
end
col_idx <= '0; // prepare for LAYER_WRITE
sign_xor_w = sign_xor_w ^ signs_w[i];
end
min1_w = {(Q-1){1'b1}};
min2_w = {(Q-1){1'b1}};
min1_idx_w = 0;
for (int i = 0; i < DC; i++) begin
if (mags_w[i] < min1_w) begin
min2_w = min1_w;
min1_w = mags_w[i];
min1_idx_w = i;
end else if (mags_w[i] < min2_w) begin
min2_w = mags_w[i];
end
end
s1_signs[z] = signs_w;
s1_sign_xor[z] = sign_xor_w;
s1_min1[z] = min1_w;
s1_min2[z] = min2_w;
s1_min1_idx[z] = min1_idx_w[2:0];
end
end
// =============================================================
// CN Pipeline Stage 2: Compute offset min-sum extrinsic outputs
// from the stage-1 results (s1_*) and reset col_idx so that
// LAYER_WRITE starts at column 0
// =============================================================
CN_STAGE2: begin
    for (int z = 0; z < Z; z++) begin
        for (int j = 0; j < DC; j++) begin
            logic [Q-2:0] mag_out;
            logic sign_out;
            // Extrinsic magnitude: the edge that supplied min1 receives min2,
            // every other edge receives min1. s1_min1_idx is 3 bits wide,
            // hence the j[2:0] slice.
            mag_out = (j[2:0] == s1_min1_idx[z]) ? s1_min2[z] : s1_min1[z];
            // Offset correction: subtract 1, flooring at 0. Constants are
            // sized from Q rather than hard-coded to 5 bits so the expression
            // stays width-correct if Q changes.
            mag_out = (mag_out > 1) ? (mag_out - 1) : {(Q-1){1'b0}};
            // Extrinsic sign: XOR of all signs with this edge's own sign
            // removed (x ^ x = 0), i.e. the XOR of the other edges' signs.
            sign_out = s1_sign_xor[z] ^ s1_signs[z][j];
            // Re-encode sign/magnitude as Q-bit two's complement.
            cn_to_vn[j][z] <= sign_out ? (~{1'b0, mag_out} + 1'b1) : {1'b0, mag_out};
        end
    end
    // Next state is LAYER_WRITE, which iterates col_idx from 0.
    col_idx <= '0;
end
// =============================================================
// LAYER_WRITE: Write beliefs and update CN->VN messages
// =============================================================
LAYER_WRITE: begin
// Write back: update beliefs and store new CN->VN messages
// Skip unconnected columns (H_BASE == -1)
if (H_BASE[row_idx][col_idx] >= 0) begin
for (int z = 0; z < Z; z++) begin
int bit_idx;
int shifted_z;
logic signed [Q-1:0] new_msg;
logic signed [Q-1:0] old_extrinsic;
int bit_idx;
shifted_z = (z + H_BASE[row_idx][col_idx]) % Z;
bit_idx = int'(col_idx) * Z + shifted_z;
new_msg = cn_to_vn[col_idx][z];
old_extrinsic = vn_to_cn[col_idx][z];
// belief = extrinsic (VN->CN) + new CN->VN message
beliefs[bit_idx] <= sat_add(old_extrinsic, new_msg);
// Store new message for next iteration
msg_cn2vn[row_idx][col_idx][z] <= new_msg;
beliefs[bit_idx] <= sat_add(vn_to_cn[col_idx][z],
cn_to_vn[col_idx][z]);
msg_cn2vn[row_idx][col_idx][z] <= cn_to_vn[col_idx][z];
end
end
@@ -386,78 +435,7 @@ module ldpc_decoder_core #(
end
// =========================================================================
// Min-sum CN update function
// =========================================================================
// Offset min-sum for DC=8 inputs (individual ports for iverilog compatibility)
// For each output j: sign = XOR of all other signs, magnitude = min of all other magnitudes - offset
// cn_min_sum: offset min-sum update for a single check node.
//   in0..in7   - incoming VN->CN messages, Q-bit signed (two's complement)
//   out0..out7 - outgoing CN->VN messages, Q-bit signed; out<j> is computed
//                from every input except in<j>
// The port list is fixed at eight inputs/outputs while the internal arrays are
// sized by DC. NOTE(review): this presumes DC == 8 - confirm before changing DC.
task automatic cn_min_sum(
input logic signed [Q-1:0] in0, in1, in2, in3,
in4, in5, in6, in7,
output logic signed [Q-1:0] out0, out1, out2, out3,
out4, out5, out6, out7
);
logic signed [Q-1:0] ins [DC];
logic [DC-1:0] signs;
logic [Q-2:0] mags [DC];
logic sign_xor;
logic [Q-2:0] min1, min2;
int min1_idx;
logic signed [Q-1:0] outs [DC];
// Gather the individual ports into an array so the loops below can index them.
ins[0] = in0; ins[1] = in1; ins[2] = in2; ins[3] = in3;
ins[4] = in4; ins[5] = in5; ins[6] = in6; ins[7] = in7;
// Extract signs and magnitudes
// Note: -32 (100000) has magnitude 32 which overflows 5-bit field to 0.
// Clamp to 31 (max representable magnitude) to avoid corruption.
sign_xor = 1'b0;
for (int i = 0; i < DC; i++) begin
logic [Q-1:0] abs_val;
// Sign is the MSB of the two's-complement input.
signs[i] = ins[i][Q-1];
if (ins[i][Q-1]) begin
// Two's-complement negate to obtain the magnitude of a negative input.
abs_val = ~ins[i] + 1'b1;
// If abs_val overflowed (input was most negative), clamp
mags[i] = (abs_val[Q-1]) ? {(Q-1){1'b1}} : abs_val[Q-2:0];
end else begin
mags[i] = ins[i][Q-2:0];
end
// Running XOR of all input signs; each output later removes its own sign.
sign_xor = sign_xor ^ signs[i];
end
// Find two smallest magnitudes
// Both minima start at the all-ones (maximum) magnitude. The strict '<' means
// that on a tie the first index keeps min1_idx and the later equal value
// lands in min2.
min1 = {(Q-1){1'b1}};
min2 = {(Q-1){1'b1}};
min1_idx = 0;
for (int i = 0; i < DC; i++) begin
if (mags[i] < min1) begin
min2 = min1;
min1 = mags[i];
min1_idx = i;
end else if (mags[i] < min2) begin
min2 = mags[i];
end
end
// Compute extrinsic outputs with offset correction
for (int j = 0; j < DC; j++) begin
logic [Q-2:0] mag_out;
logic sign_out;
// The input that supplied min1 gets min2 (the minimum over the others);
// every other input gets min1.
mag_out = (j == min1_idx) ? min2 : min1;
// Offset correction (subtract 1 in integer representation)
// Magnitudes 0 and 1 both map to 0, so the result never underflows.
mag_out = (mag_out > 1) ? (mag_out - 1) : {(Q-1){1'b0}};
// XOR-ing the total with this input's own sign leaves the XOR of the others.
sign_out = sign_xor ^ signs[j];
// Re-encode sign/magnitude as Q-bit two's complement.
outs[j] = sign_out ? (~{1'b0, mag_out} + 1) : {1'b0, mag_out};
end
// Scatter the result array back onto the individual output ports.
out0 = outs[0]; out1 = outs[1]; out2 = outs[2]; out3 = outs[3];
out4 = outs[4]; out5 = outs[5]; out6 = outs[6]; out7 = outs[7];
endtask
// =========================================================================
// Saturating arithmetic helpers (Yosys-compatible: no return, no complex concat)
// Saturating arithmetic (Yosys-compatible)
// =========================================================================
function automatic logic signed [Q-1:0] sat_add(