长指令写回仲裁与OITF
E203具有两级写回仲裁模块,第二个就是长指令写回仲裁模块(Long-Pipes Instructions Write-Back Arbitration,LPIWBA)
OITF和长指令写回仲裁模块协同合作完成所有长指令的写回操作,长指令写回仲裁主要用于仲裁不同长指令之间的写回,因为这些指令来自不同执行单元、执行的周期数不同、执行的顺序不同、写回的地方不一样,就需要记录这些指令的先后关系,这就用到了OITF
OITF在之前的执行部分已经介绍过,它本质上是一个记录还未写回但是已经在执行的长指令的FIFO。每个被派遣的长指令都会在OITF中分配一个表项(Entry),这个表项的FIFO指针就作为这个长指令的ITAG,长指令不管被派遣到任何运算单元都会携带这个ITAG,同时写回时也要带着相同的ITAG
OITF的深度就决定了能够派遣的滞外(Outstanding,也就是OITF中的“O”)长指令的个数。为了硬件实现的简洁,蜂鸟E203采用严格按照OITF的顺序写回的方法——OITF的读指针会指向最先进入此FIFO的表项,通过使用此读指针作为长指令写回仲裁的选择参考,就可以保证不同长指令的写回顺序和派遣顺序严格一致。
每次长指令写回仲裁模块成功写回一个长指令后,对应的OITF表项就会从FIFO中退出。
由于有些长指令可能发生执行错误,因此需要产生异常——长指令写回仲裁模块会和交付模块产生接口触发异常,如果长指令产生异常,则不会真正写回,而是在接口部分就被丢弃。
有关FWBA的代码部分见上面的源码
长指令写回仲裁和OITF部分的源码位于rtl/e203/core/e203_exu_disp.v、rtl/e203/core/e203_exu_oitf.v、rtl/e203/core/e203_exu_longpwbck.v三个文件
这里直接复制粘贴了全部源码,推荐系统地看一下三个文件的代码来更好地理解实现思路
/* e203_exu_disp */
// Dispatch unit of the EXU.
//
// Every decoded instruction is forwarded to the ALU interface. Dispatch is
// stalled (both disp_o_alu_valid and disp_i_ready are gated off) when:
//   * a CSR or FENCE/FENCE.I instruction is presented while the OITF is not empty,
//   * a WFI halt request is pending,
//   * the OITF reports a RAW or WAW register match with an outstanding entry,
//   * an AGU-group instruction is presented while the OITF cannot accept an entry.
// An OITF entry is allocated when the ALU handshake completes and the ALU
// reports the instruction as a real long-pipe one (disp_o_alu_longpipe).
module e203_exu_disp(
  // WFI halt request/acknowledge pair
  input  wfi_halt_exu_req,
  output wfi_halt_exu_ack,

  // OITF-empty and AMO-pending flags; both feed wfi_halt_exu_ack
  input  oitf_empty,
  input  amo_wait,
  //
  // The operands and decode info from dispatch
  input  disp_i_valid, // Handshake valid
  output disp_i_ready, // Handshake ready
  // The operand 1/2 read-enable signals and indexes
  input  disp_i_rs1x0, // When set, the rs1 value forwarded to the ALU is forced to zero
  input  disp_i_rs2x0, // When set, the rs2 value forwarded to the ALU is forced to zero
  input  disp_i_rs1en,
  input  disp_i_rs2en,
  input  [`E203_RFIDX_WIDTH-1:0] disp_i_rs1idx,
  input  [`E203_RFIDX_WIDTH-1:0] disp_i_rs2idx,
  input  [`E203_XLEN-1:0] disp_i_rs1,
  input  [`E203_XLEN-1:0] disp_i_rs2,
  input  disp_i_rdwen,
  input  [`E203_RFIDX_WIDTH-1:0] disp_i_rdidx,
  input  [`E203_DECINFO_WIDTH-1:0] disp_i_info,
  input  [`E203_XLEN-1:0] disp_i_imm,
  input  [`E203_PC_SIZE-1:0] disp_i_pc,
  input  disp_i_misalgn,
  input  disp_i_buserr ,
  input  disp_i_ilegl ,
  //
  // Dispatch to ALU
  output disp_o_alu_valid,
  input  disp_o_alu_ready,
  input  disp_o_alu_longpipe, // Precise "really dispatched as long-pipe" indication
  output [`E203_XLEN-1:0] disp_o_alu_rs1,
  output [`E203_XLEN-1:0] disp_o_alu_rs2,
  output disp_o_alu_rdwen,
  output [`E203_RFIDX_WIDTH-1:0] disp_o_alu_rdidx,
  output [`E203_DECINFO_WIDTH-1:0] disp_o_alu_info,
  output [`E203_XLEN-1:0] disp_o_alu_imm,
  output [`E203_PC_SIZE-1:0] disp_o_alu_pc,
  output [`E203_ITAG_WIDTH-1:0] disp_o_alu_itag,
  output disp_o_alu_misalgn,
  output disp_o_alu_buserr ,
  output disp_o_alu_ilegl ,
  //
  // Dispatch to OITF
  input  oitfrd_match_disprs1,
  input  oitfrd_match_disprs2,
  input  oitfrd_match_disprs3,
  input  oitfrd_match_disprd,
  input  [`E203_ITAG_WIDTH-1:0] disp_oitf_ptr ,
  output disp_oitf_ena,
  input  disp_oitf_ready,
  output disp_oitf_rs1fpu,
  output disp_oitf_rs2fpu,
  output disp_oitf_rs3fpu,
  output disp_oitf_rdfpu ,
  output disp_oitf_rs1en ,
  output disp_oitf_rs2en ,
  output disp_oitf_rs3en ,
  output disp_oitf_rdwen ,
  output [`E203_RFIDX_WIDTH-1:0] disp_oitf_rs1idx,
  output [`E203_RFIDX_WIDTH-1:0] disp_oitf_rs2idx,
  output [`E203_RFIDX_WIDTH-1:0] disp_oitf_rs3idx,
  output [`E203_RFIDX_WIDTH-1:0] disp_oitf_rdidx ,
  output [`E203_PC_SIZE-1:0] disp_oitf_pc ,
  input  clk,
  input  rst_n
  );

  // Instruction group field extracted from the decode info bus
  wire [`E203_DECINFO_GRP_WIDTH-1:0] disp_i_info_grp = disp_i_info [`E203_DECINFO_GRP];

  // Based on current 2 pipe stage implementation, the 2nd stage need to have all instruction
  // to be commited via ALU interface, so every instruction need to be dispatched to ALU,
  // regardless it is long pipe or not, and inside ALU it will issue instructions to different
  // other longpipes
  //wire disp_alu = (disp_i_info_grp == `E203_DECINFO_GRP_ALU)
  //              | (disp_i_info_grp == `E203_DECINFO_GRP_BJP)
  //              | (disp_i_info_grp == `E203_DECINFO_GRP_CSR)
  //             `ifdef E203_SUPPORT_SHARE_MULDIV //{
  //              | (disp_i_info_grp == `E203_DECINFO_GRP_MULDIV)
  //             `endif//E203_SUPPORT_SHARE_MULDIV}
  //              | (disp_i_info_grp == `E203_DECINFO_GRP_AGU);

  wire disp_csr = (disp_i_info_grp == `E203_DECINFO_GRP_CSR);

  // Predicted (imprecise) long-pipe indication: any AGU-group instruction
  wire disp_alu_longp_prdt = (disp_i_info_grp == `E203_DECINFO_GRP_AGU);

  // Precise long-pipe indication, as reported on the ALU interface
  wire disp_alu_longp_real = disp_o_alu_longpipe;

  // Both fence and fencei need to make sure all outstanding instruction have been completed
  wire disp_fence_fencei = (disp_i_info_grp == `E203_DECINFO_GRP_BJP) &
                           ( disp_i_info [`E203_DECINFO_BJP_FENCE] | disp_i_info [`E203_DECINFO_BJP_FENCEI]);

  // Since any instruction will need to be dispatched to ALU, we dont need the gate here
  // wire disp_i_ready_pos = disp_alu & disp_o_alu_ready;
  // assign disp_o_alu_valid = disp_alu & disp_i_valid_pos;
  wire disp_i_valid_pos;
  wire disp_i_ready_pos = disp_o_alu_ready;
  assign disp_o_alu_valid = disp_i_valid_pos;

  //
  // The Dispatch Scheme Introduction for two-pipeline stage
  // #1: The instruction after dispatched must have already have operand fetched, so
  //     there is no any WAR dependency happened.
  // #2: The ALU-instruction are dispatched and executed in-order inside ALU, so
  //     there is no any WAW dependency happened among ALU instructions.
  //     Note: LSU since its AGU is handled inside ALU, so it is treated as a ALU instruction
  // #3: The non-ALU-instruction are all tracked by OITF, and must be write-back in-order, so
  //     it is like ALU in-ordered. So there is no any WAW dependency happened among
  //     non-ALU instructions.
  // Then what dependency will we have?
  //   * RAW: This is the real dependency
  //   * WAW: The WAW between ALU an non-ALU instructions
  //
  // So #1, The dispatching ALU instruction can not proceed and must be stalled when
  //   ** RAW: The ALU reading operands have data dependency with OITF entries
  //      *** Note: since it is 2 pipeline stage, any last ALU instruction have already
  //          write-back into the regfile. So there is no chance for ALU instr to depend
  //          on last ALU instructions as RAW.
  //          Note: if it is 3 pipeline stages, then we also need to consider the ALU-to-ALU
  //          RAW dependency.
  //   ** WAW: The ALU writing result have no any data dependency with OITF entries
  //      Note: Since the ALU instruction handled by ALU may surpass non-ALU OITF instructions
  //            so we must check this.
  // And #2, The dispatching non-ALU instruction can not proceed and must be stalled when
  //   ** RAW: The non-ALU reading operands have data dependency with OITF entries
  //      *** Note: since it is 2 pipeline stage, any last ALU instruction have already
  //          write-back into the regfile. So there is no chance for non-ALU instr to depend
  //          on last ALU instructions as RAW.
  //          Note: if it is 3 pipeline stages, then we also need to consider the non-ALU-to-ALU
  //          RAW dependency.
  wire raw_dep = ((oitfrd_match_disprs1) |
                  (oitfrd_match_disprs2) |
                  (oitfrd_match_disprs3));

  // Only check the longp instructions (non-ALU) for WAW, here if we
  // use the precise version (disp_alu_longp_real), it will hurt timing very much, but
  // if we use imprecise version of disp_alu_longp_prdt, it is kind of tricky and in
  // some corner case. For example, the AGU (treated as longp) will actually not dispatch
  // to longp but just directly commited, then it become a normal ALU instruction, and should
  // check the WAW dependency, but this only happened when it is AMO or unaligned-uop, so
  // ideally we dont need to worry about it, because
  //   * We dont support AMO in 2 stage CPU here
  //   * We dont support Unalign load-store in 2 stage CPU here, which
  //     will be triggered as exception, so will not really write-back
  //     into regfile
  //   * But it depends on some assumption, so it is still risky if in the future something changed.
  // Nevertheless: using this condition only waiver the longpipe WAW case, that is, two
  // longp instruction write-back same reg back2back. Is it possible or is it common?
  // after we checking the benmark result we found if we remove this complexity here
  // it just does not change any benchmark number, so just remove that condition out. Means
  // all of the instructions will check waw_dep
  //wire alu_waw_dep = (~disp_alu_longp_prdt) & (oitfrd_match_disprd & disp_i_rdwen);
  wire waw_dep = (oitfrd_match_disprd);

  wire dep = raw_dep | waw_dep;

  // The WFI halt exu ack will be asserted when the OITF is empty
  // and also there is no AMO oustanding uops
  assign wfi_halt_exu_ack = oitf_empty & (~amo_wait);

  wire disp_condition =
      // To be more conservtive, any accessing CSR instruction need to wait the oitf to be empty.
      // Theoretically speaking, it should also flush pipeline after the CSR have been updated
      // to make sure the subsequent instruction get correct CSR values, but in our 2-pipeline stage
      // implementation, CSR is updated after EXU stage, and subsequent are all executed at EXU stage,
      // no chance to got wrong CSR values, so we dont need to worry about this.
        (disp_csr ? oitf_empty : 1'b1)
      // To handle the Fence: just stall dispatch until the OITF is empty
      & (disp_fence_fencei ? oitf_empty : 1'b1)
      // If it was a WFI instruction commited halt req, then it will stall the disaptch
      & (~wfi_halt_exu_req)
      // No dependency
      & (~dep)
      // If dispatch to ALU as long pipeline, then must check
      //   the OITF is ready
      // & ((disp_alu & disp_o_alu_longpipe) ? disp_oitf_ready : 1'b1);
      //   NOTE: the term above must stay commented out. `disp_alu` is not
      //   declared (its definition is commented out earlier in this module) and
      //   the trailing ';' would end the expression before the term below.
      // To cut the critical timing path from longpipe signal
      // we always assume the LSU will need oitf ready
      & (disp_alu_longp_prdt ? disp_oitf_ready : 1'b1);

  // The same stall condition gates both directions of the handshake
  assign disp_i_valid_pos = disp_condition & disp_i_valid;
  assign disp_i_ready     = disp_condition & disp_i_ready_pos;

  // Force the operand to all-zero when the corresponding x0 flag is set
  wire [`E203_XLEN-1:0] disp_i_rs1_msked = disp_i_rs1 & {`E203_XLEN{~disp_i_rs1x0}};
  wire [`E203_XLEN-1:0] disp_i_rs2_msked = disp_i_rs2 & {`E203_XLEN{~disp_i_rs2x0}};

  // Since we always dispatch any instructions into ALU, so we dont need to gate ops here
  //assign disp_o_alu_rs1   = {`E203_XLEN{disp_alu}} & disp_i_rs1_msked;
  //assign disp_o_alu_rs2   = {`E203_XLEN{disp_alu}} & disp_i_rs2_msked;
  //assign disp_o_alu_rdwen = disp_alu & disp_i_rdwen;
  //assign disp_o_alu_rdidx = {`E203_RFIDX_WIDTH{disp_alu}} & disp_i_rdidx;
  //assign disp_o_alu_info  = {`E203_DECINFO_WIDTH{disp_alu}} & disp_i_info;
  assign disp_o_alu_rs1   = disp_i_rs1_msked;
  assign disp_o_alu_rs2   = disp_i_rs2_msked;
  assign disp_o_alu_rdwen = disp_i_rdwen;
  assign disp_o_alu_rdidx = disp_i_rdidx;
  assign disp_o_alu_info  = disp_i_info;

  // Why we use precise version of disp_longp here, because
  // only when it is really dispatched as long pipe then allocate the OITF
  assign disp_oitf_ena = disp_o_alu_valid & disp_o_alu_ready & disp_alu_longp_real;

  assign disp_o_alu_imm  = disp_i_imm;
  assign disp_o_alu_pc   = disp_i_pc;
  // The OITF allocation pointer travels with the instruction as its ITAG
  assign disp_o_alu_itag = disp_oitf_ptr;
  assign disp_o_alu_misalgn= disp_i_misalgn;
  assign disp_o_alu_buserr = disp_i_buserr ;
  assign disp_o_alu_ilegl  = disp_i_ilegl  ;

  // Without an FPU all FPU-side dispatch information is tied to zero, so the
  // muxes below always select the integer-side values.
  // NOTE(review): no disp_i_fpu* declarations are visible for the
  // E203_HAS_FPU-defined case in this module - confirm before enabling it.
  `ifndef E203_HAS_FPU//{
  wire disp_i_fpu       = 1'b0;
  wire disp_i_fpu_rs1en = 1'b0;
  wire disp_i_fpu_rs2en = 1'b0;
  wire disp_i_fpu_rs3en = 1'b0;
  wire disp_i_fpu_rdwen = 1'b0;
  wire [`E203_RFIDX_WIDTH-1:0] disp_i_fpu_rs1idx = `E203_RFIDX_WIDTH'b0;
  wire [`E203_RFIDX_WIDTH-1:0] disp_i_fpu_rs2idx = `E203_RFIDX_WIDTH'b0;
  wire [`E203_RFIDX_WIDTH-1:0] disp_i_fpu_rs3idx = `E203_RFIDX_WIDTH'b0;
  wire [`E203_RFIDX_WIDTH-1:0] disp_i_fpu_rdidx  = `E203_RFIDX_WIDTH'b0;
  wire disp_i_fpu_rs1fpu = 1'b0;
  wire disp_i_fpu_rs2fpu = 1'b0;
  wire disp_i_fpu_rs3fpu = 1'b0;
  wire disp_i_fpu_rdfpu  = 1'b0;
  `endif//}

  // Register information recorded in / compared against the OITF
  assign disp_oitf_rs1fpu = disp_i_fpu ? (disp_i_fpu_rs1en & disp_i_fpu_rs1fpu) : 1'b0;
  assign disp_oitf_rs2fpu = disp_i_fpu ? (disp_i_fpu_rs2en & disp_i_fpu_rs2fpu) : 1'b0;
  assign disp_oitf_rs3fpu = disp_i_fpu ? (disp_i_fpu_rs3en & disp_i_fpu_rs3fpu) : 1'b0;
  assign disp_oitf_rdfpu  = disp_i_fpu ? (disp_i_fpu_rdwen & disp_i_fpu_rdfpu ) : 1'b0;

  assign disp_oitf_rs1en  = disp_i_fpu ? disp_i_fpu_rs1en : disp_i_rs1en;
  assign disp_oitf_rs2en  = disp_i_fpu ? disp_i_fpu_rs2en : disp_i_rs2en;
  assign disp_oitf_rs3en  = disp_i_fpu ? disp_i_fpu_rs3en : 1'b0;
  assign disp_oitf_rdwen  = disp_i_fpu ? disp_i_fpu_rdwen : disp_i_rdwen;

  assign disp_oitf_rs1idx = disp_i_fpu ? disp_i_fpu_rs1idx : disp_i_rs1idx;
  assign disp_oitf_rs2idx = disp_i_fpu ? disp_i_fpu_rs2idx : disp_i_rs2idx;
  assign disp_oitf_rs3idx = disp_i_fpu ? disp_i_fpu_rs3idx : `E203_RFIDX_WIDTH'b0;
  assign disp_oitf_rdidx  = disp_i_fpu ? disp_i_fpu_rdidx  : disp_i_rdidx;

  assign disp_oitf_pc     = disp_i_pc;

endmodule
/* e203_exu_oitf */
// OITF: a FIFO of outstanding long-pipe instructions.
//
// An entry is allocated at the allocation pointer when dis_ena is asserted and
// released at the retire pointer when ret_ena is asserted. Each valid entry
// stores the destination register index, its write-enable, its FPU flag and
// the instruction PC. The stored destination registers are compared against
// the source/destination registers of the instruction being dispatched to
// produce the oitfrd_match_* dependency flags.
module e203_exu_oitf (
  output dis_ready,   // High when the FIFO is not full
  input  dis_ena,     // Allocate one entry at dis_ptr
  input  ret_ena,     // Release the entry at ret_ptr

  output [`E203_ITAG_WIDTH-1:0] dis_ptr,   // Current allocation pointer
  output [`E203_ITAG_WIDTH-1:0] ret_ptr,   // Current retire pointer

  // Payload of the entry addressed by ret_ptr
  output [`E203_RFIDX_WIDTH-1:0] ret_rdidx,
  output ret_rdwen,
  output ret_rdfpu,
  output [`E203_PC_SIZE-1:0] ret_pc,

  // Register usage of the instruction currently being dispatched
  input  disp_i_rs1en,
  input  disp_i_rs2en,
  input  disp_i_rs3en,
  input  disp_i_rdwen,
  input  disp_i_rs1fpu,
  input  disp_i_rs2fpu,
  input  disp_i_rs3fpu,
  input  disp_i_rdfpu,
  input  [`E203_RFIDX_WIDTH-1:0] disp_i_rs1idx,
  input  [`E203_RFIDX_WIDTH-1:0] disp_i_rs2idx,
  input  [`E203_RFIDX_WIDTH-1:0] disp_i_rs3idx,
  input  [`E203_RFIDX_WIDTH-1:0] disp_i_rdidx,
  input  [`E203_PC_SIZE -1:0] disp_i_pc,

  // Set when any valid entry's destination register matches the given operand
  output oitfrd_match_disprs1,
  output oitfrd_match_disprs2,
  output oitfrd_match_disprs3,
  output oitfrd_match_disprd,

  output oitf_empty,
  input  clk,
  input  rst_n
  );

  // Per-entry valid bit control and state
  wire [`E203_OITF_DEPTH-1:0] vld_set;
  wire [`E203_OITF_DEPTH-1:0] vld_clr;
  wire [`E203_OITF_DEPTH-1:0] vld_ena;
  wire [`E203_OITF_DEPTH-1:0] vld_nxt;
  wire [`E203_OITF_DEPTH-1:0] vld_r;
  // Per-entry payload
  wire [`E203_OITF_DEPTH-1:0] rdwen_r;
  wire [`E203_OITF_DEPTH-1:0] rdfpu_r;
  wire [`E203_RFIDX_WIDTH-1:0] rdidx_r[`E203_OITF_DEPTH-1:0];
  // The PC here is to be used at wback stage to track out the
  // PC of exception of long-pipe instruction
  wire [`E203_PC_SIZE-1:0] pc_r[`E203_OITF_DEPTH-1:0];

  wire alc_ptr_ena = dis_ena;
  wire ret_ptr_ena = ret_ena;

  wire oitf_full ;

  wire [`E203_ITAG_WIDTH-1:0] alc_ptr_r;
  wire [`E203_ITAG_WIDTH-1:0] ret_ptr_r;

  // NOTE(review): sirv_gnrl_dfflr / sirv_gnrl_dffl are defined outside this
  // file; from their use here they are presumably load-enable flops with
  // ports (enable, next, q, clk[, rst_n]) - confirm against the library.
  generate
  if(`E203_OITF_DEPTH > 1) begin: depth_gt1//{
      // Allocation pointer plus a one-bit flag that toggles each time the
      // pointer wraps from DEPTH-1 back to 0.
      wire alc_ptr_flg_r;
      wire alc_ptr_flg_nxt = ~alc_ptr_flg_r;
      wire alc_ptr_flg_ena = (alc_ptr_r == ($unsigned(`E203_OITF_DEPTH-1))) & alc_ptr_ena;

      sirv_gnrl_dfflr #(1) alc_ptr_flg_dfflrs(alc_ptr_flg_ena, alc_ptr_flg_nxt, alc_ptr_flg_r, clk, rst_n);

      wire [`E203_ITAG_WIDTH-1:0] alc_ptr_nxt;

      assign alc_ptr_nxt = alc_ptr_flg_ena ? `E203_ITAG_WIDTH'b0 : (alc_ptr_r + 1'b1);

      sirv_gnrl_dfflr #(`E203_ITAG_WIDTH) alc_ptr_dfflrs(alc_ptr_ena, alc_ptr_nxt, alc_ptr_r, clk, rst_n);

      // Retire pointer with the same wrap-flag scheme
      wire ret_ptr_flg_r;
      wire ret_ptr_flg_nxt = ~ret_ptr_flg_r;
      wire ret_ptr_flg_ena = (ret_ptr_r == ($unsigned(`E203_OITF_DEPTH-1))) & ret_ptr_ena;

      sirv_gnrl_dfflr #(1) ret_ptr_flg_dfflrs(ret_ptr_flg_ena, ret_ptr_flg_nxt, ret_ptr_flg_r, clk, rst_n);

      wire [`E203_ITAG_WIDTH-1:0] ret_ptr_nxt;

      assign ret_ptr_nxt = ret_ptr_flg_ena ? `E203_ITAG_WIDTH'b0 : (ret_ptr_r + 1'b1);

      sirv_gnrl_dfflr #(`E203_ITAG_WIDTH) ret_ptr_dfflrs(ret_ptr_ena, ret_ptr_nxt, ret_ptr_r, clk, rst_n);

      // Equal pointers with equal wrap flags mean empty; equal pointers with
      // different wrap flags mean full.
      assign oitf_empty = (ret_ptr_r == alc_ptr_r) &   (ret_ptr_flg_r == alc_ptr_flg_r);
      assign oitf_full  = (ret_ptr_r == alc_ptr_r) & (~(ret_ptr_flg_r == alc_ptr_flg_r));
  end//}
  else begin: depth_eq1//}{
      // Single-entry case: pointers are constant and the one valid bit
      // directly gives empty/full.
      assign alc_ptr_r =1'b0;
      assign ret_ptr_r =1'b0;
      assign oitf_empty = ~vld_r[0];
      assign oitf_full  = vld_r[0];
  end//}
  endgenerate//}

  assign ret_ptr = ret_ptr_r;
  assign dis_ptr = alc_ptr_r;

  //// If the OITF is not full, or it is under retiring, then it is ready to accept new dispatch
  //assign dis_ready = (~oitf_full) | ret_ena;
  // To cut down the loop between ALU write-back valid --> oitf_ret_ena --> oitf_ready ---> dispatch_ready --- > alu_i_valid
  //   we exclude the ret_ena from the ready signal
  // (dis_ready must have exactly one driver, so the variant above stays disabled.)
  assign dis_ready = (~oitf_full);

  wire [`E203_OITF_DEPTH-1:0] rd_match_rs1idx;
  wire [`E203_OITF_DEPTH-1:0] rd_match_rs2idx;
  wire [`E203_OITF_DEPTH-1:0] rd_match_rs3idx;
  wire [`E203_OITF_DEPTH-1:0] rd_match_rdidx;

  genvar i;
  generate //{
      for (i=0; i<`E203_OITF_DEPTH; i=i+1) begin:oitf_entries//{

        // Entry i is set when allocated and cleared when retired; a set in
        // the same cycle as a clear wins (vld_nxt evaluates to 1).
        assign vld_set[i] = alc_ptr_ena & (alc_ptr_r == i);
        assign vld_clr[i] = ret_ptr_ena & (ret_ptr_r == i);
        assign vld_ena[i] = vld_set[i] |   vld_clr[i];
        assign vld_nxt[i] = vld_set[i] | (~vld_clr[i]);

        sirv_gnrl_dfflr #(1) vld_dfflrs(vld_ena[i], vld_nxt[i], vld_r[i], clk, rst_n);

        //Payload only set, no need to clear
        sirv_gnrl_dffl #(`E203_RFIDX_WIDTH) rdidx_dfflrs(vld_set[i], disp_i_rdidx, rdidx_r[i], clk);
        sirv_gnrl_dffl #(`E203_PC_SIZE    ) pc_dfflrs   (vld_set[i], disp_i_pc   , pc_r[i]   , clk);
        sirv_gnrl_dffl #(1)                 rdwen_dfflrs(vld_set[i], disp_i_rdwen, rdwen_r[i], clk);
        sirv_gnrl_dffl #(1)                 rdfpu_dfflrs(vld_set[i], disp_i_rdfpu, rdfpu_r[i], clk);

        // A match needs: entry valid, entry writes a register, the dispatched
        // operand is enabled, same register file (FPU flag) and same index.
        assign rd_match_rs1idx[i] = vld_r[i] & rdwen_r[i] & disp_i_rs1en & (rdfpu_r[i] == disp_i_rs1fpu) & (rdidx_r[i] == disp_i_rs1idx);
        assign rd_match_rs2idx[i] = vld_r[i] & rdwen_r[i] & disp_i_rs2en & (rdfpu_r[i] == disp_i_rs2fpu) & (rdidx_r[i] == disp_i_rs2idx);
        assign rd_match_rs3idx[i] = vld_r[i] & rdwen_r[i] & disp_i_rs3en & (rdfpu_r[i] == disp_i_rs3fpu) & (rdidx_r[i] == disp_i_rs3idx);
        assign rd_match_rdidx [i] = vld_r[i] & rdwen_r[i] & disp_i_rdwen & (rdfpu_r[i] == disp_i_rdfpu ) & (rdidx_r[i] == disp_i_rdidx );

      end//}
  endgenerate//}

  // Any-entry reduction of the per-entry match vectors
  assign oitfrd_match_disprs1 = |rd_match_rs1idx;
  assign oitfrd_match_disprs2 = |rd_match_rs2idx;
  assign oitfrd_match_disprs3 = |rd_match_rs3idx;
  assign oitfrd_match_disprd  = |rd_match_rdidx ;

  // Payload of the entry at the retire pointer
  assign ret_rdidx = rdidx_r[ret_ptr];
  assign ret_pc    = pc_r   [ret_ptr];
  assign ret_rdwen = rdwen_r[ret_ptr];
  assign ret_rdfpu = rdfpu_r[ret_ptr];

endmodule
/* e203_exu_longpwbck */
// Long-pipe write-back arbiter.
//
// A long-pipe result source (LSU, and NICE when E203_HAS_NICE is defined) is
// accepted only when its ITAG equals the OITF retire pointer and the OITF is
// not empty. The accepted result is either forwarded to the final write-back
// interface (destination write enabled, no error) or reported on the
// exception interface (error), and the OITF entry is retired on the handshake.
module e203_exu_longpwbck(
  //
  // The LSU Write-Back Interface
  input  lsu_wbck_i_valid, // Handshake valid
  output lsu_wbck_i_ready, // Handshake ready
  input  [`E203_XLEN-1:0] lsu_wbck_i_wdat,
  input  [`E203_ITAG_WIDTH -1:0] lsu_wbck_i_itag,
  input  lsu_wbck_i_err , // The error exception generated
  input  lsu_cmt_i_buserr ,
  input  [`E203_ADDR_SIZE -1:0] lsu_cmt_i_badaddr,
  input  lsu_cmt_i_ld,
  input  lsu_cmt_i_st,
  //
  // The Long pipe instruction Wback interface to final wbck module
  output longp_wbck_o_valid, // Handshake valid
  input  longp_wbck_o_ready, // Handshake ready
  output [`E203_FLEN-1:0] longp_wbck_o_wdat,
  output [5-1:0] longp_wbck_o_flags,
  output [`E203_RFIDX_WIDTH -1:0] longp_wbck_o_rdidx,
  output longp_wbck_o_rdfpu,
  //
  // The Long pipe instruction Exception interface to commit stage
  output longp_excp_o_valid,
  input  longp_excp_o_ready,
  output longp_excp_o_insterr,
  output longp_excp_o_ld,
  output longp_excp_o_st,
  output longp_excp_o_buserr , // The load/store bus-error exception generated
  output [`E203_ADDR_SIZE-1:0] longp_excp_o_badaddr,
  output [`E203_PC_SIZE -1:0] longp_excp_o_pc,
  //
  // The entry at the OITF retire pointer
  input  oitf_empty,
  input  [`E203_ITAG_WIDTH -1:0] oitf_ret_ptr,
  input  [`E203_RFIDX_WIDTH-1:0] oitf_ret_rdidx,
  input  [`E203_PC_SIZE-1:0] oitf_ret_pc,
  input  oitf_ret_rdwen,
  input  oitf_ret_rdfpu,
  output oitf_ret_ena,
  `ifdef E203_HAS_NICE//{
  input  nice_longp_wbck_i_valid ,
  output nice_longp_wbck_i_ready ,
  input  [`E203_XLEN-1:0] nice_longp_wbck_i_wdat ,
  input  [`E203_ITAG_WIDTH-1:0] nice_longp_wbck_i_itag ,
  input  nice_longp_wbck_i_err,
  `endif//}
  input  clk,
  input  rst_n
  );

  // ---------------------------------------------------------------------
  // Source selection: a source is eligible only while the OITF holds at
  // least one entry and the source's ITAG equals the retire pointer.
  wire oitf_has_entry = ~oitf_empty;

  wire lsu_itag_hit = (oitf_ret_ptr == lsu_wbck_i_itag) & oitf_has_entry;
  wire lsu_granted  = lsu_itag_hit & lsu_wbck_i_valid;

  `ifdef E203_HAS_NICE//{
  wire nice_itag_hit = (oitf_ret_ptr == nice_longp_wbck_i_itag) & oitf_has_entry;
  wire nice_granted  = nice_itag_hit & nice_longp_wbck_i_valid;
  wire nice_err      = nice_longp_wbck_i_err;
  `endif//}

  // ---------------------------------------------------------------------
  // Exception payload: only the LSU contributes, masked by its grant.
  // The instruction-error output is constantly zero.
  assign longp_excp_o_insterr = 1'b0;
  assign longp_excp_o_ld      = lsu_granted & lsu_cmt_i_ld;
  assign longp_excp_o_st      = lsu_granted & lsu_cmt_i_st;
  assign longp_excp_o_buserr  = lsu_granted & lsu_cmt_i_buserr;
  assign longp_excp_o_badaddr = {`E203_ADDR_SIZE{lsu_granted}} & lsu_cmt_i_badaddr;

  // ---------------------------------------------------------------------
  // Write data of each source, widened to FLEN bits
  `ifdef E203_FLEN_IS_32 //{
  wire [`E203_FLEN-1:0] lsu_wdat_flen = lsu_wbck_i_wdat;
  `else//}{
  wire [`E203_FLEN-1:0] lsu_wdat_flen = {{`E203_FLEN-`E203_XLEN{1'b0}},lsu_wbck_i_wdat};
  `endif//}

  `ifdef E203_HAS_NICE//{
  wire [`E203_FLEN-1:0] nice_wdat_flen = {{`E203_FLEN-`E203_XLEN{1'b0}},nice_longp_wbck_i_wdat};
  `endif//}

  // ---------------------------------------------------------------------
  // Arbitrated valid / data (AND-OR selection over the granted sources)
  `ifdef E203_HAS_NICE//{
  wire arb_valid = (lsu_granted  & lsu_wbck_i_valid)
                 | (nice_granted & nice_longp_wbck_i_valid);
  wire [`E203_FLEN-1:0] arb_wdat =
                   ({`E203_FLEN{lsu_granted }} & lsu_wdat_flen )
                 | ({`E203_FLEN{nice_granted}} & nice_wdat_flen);
  `else//}{
  wire arb_valid = (lsu_granted & lsu_wbck_i_valid);
  wire [`E203_FLEN-1:0] arb_wdat = ({`E203_FLEN{lsu_granted}} & lsu_wdat_flen);
  `endif//}

  // Only the LSU error feeds the arbitrated error flag
  wire arb_err = lsu_granted & lsu_wbck_i_err;

  // ---------------------------------------------------------------------
  // Destination of the retiring instruction
  // Register-file write: destination write enabled and no error
  wire do_regwr = oitf_ret_rdwen & (~arb_err);

  // Exception report: error present (and, with NICE, not a granted NICE error)
  `ifdef E203_HAS_NICE//{
  wire do_excp = arb_err & (~(nice_granted & nice_err));
  `else//}{
  wire do_excp = arb_err;
  `endif//}

  // A path is "clear" when it is either not needed or its consumer is ready
  wire regwr_path_clear = (~do_regwr) | longp_wbck_o_ready;
  wire excp_path_clear  = (~do_excp ) | longp_excp_o_ready;

  wire arb_ready = regwr_path_clear & excp_path_clear;

  // Each output valid also waits for the other path, so that both
  // handshakes complete in the same cycle.
  assign longp_wbck_o_valid = do_regwr & arb_valid & excp_path_clear;
  assign longp_excp_o_valid = do_excp  & arb_valid & regwr_path_clear;

  // ---------------------------------------------------------------------
  // Write-back payload; register index, FPU flag and PC come from the OITF
  assign longp_wbck_o_wdat  = arb_wdat;
  assign longp_wbck_o_flags = 5'b0;
  assign longp_wbck_o_rdfpu = oitf_ret_rdfpu;
  assign longp_wbck_o_rdidx = oitf_ret_rdidx;

  assign longp_excp_o_pc    = oitf_ret_pc;

  // ---------------------------------------------------------------------
  // Back-pressure to the sources and OITF retire strobe
  assign lsu_wbck_i_ready = lsu_itag_hit & arb_ready;

  `ifdef E203_HAS_NICE//{
  assign nice_longp_wbck_i_ready = nice_itag_hit & arb_ready;
  `endif//}

  assign oitf_ret_ena = arb_valid & arb_ready;

endmodule
综上所述,蜂鸟E203的执行结构是一种混合的策略:
单周期指令:顺序发射、顺序执行、顺序写回
长指令:顺序发射、乱序执行、顺序写回
所有指令混杂:顺序发射、乱序执行、乱序写回
在其中最核心的思想就是取得“更高的性能-面积比”,这套解决思路还是比较巧妙的
|