该项目为笔者这学期计算机组成原理实验整体内容,按照从最简单的各个模块到简单单周期CPU再到流水线的顺序,供初学者参考,如有问题欢迎与我交流,最后流水线CPU以及对应比特流已上传github:Zhuyh1139/Pipeline-CPU: Implementing Pipelined CPU Based on Verilog and Running on FPGA Platform (github.com)使用方法见下面说明。
这里我默认了有一定的verilog基础,如果有不懂的可以参考这篇文章来大致学习:
verilog菜鸟教程https://www.runoob.com/w3cnote_genre/verilog
简单单周期CPU
好了,废话不多说,我们先来看下要实现一个简单单周期CPU我们需要哪些模块,这个我们在计算机组成原理这门课上都会提到,如下所示:
其中的方框部分是我们要实现的模块,我们先来分析一下流程:最左边为PC相关内容,把当前PC值传入IM模块与MUX模块使用,其中IM表示指令存储器,我们可以通过PC计算出对应的指令地址,然后取出对应的指令送进我们的译码器中,译码器会得到如imm(立即数)、寄存器读写地址、使能信号等相关量,再通过两个二选一数据选择器选出操作数传入我们的计算模块ALU当中,最后将结果写回寄存器堆当中。可以看出,这个简单的CPU不涉及内存及跳转的相关指令,只实现了最基本的运算相关的指令,下面我们一起来一一看一下各个模块:
PC寄存器
首先是PC部分,PC32b很简单,就是把32位的PC由pc更新为npc,故我们可以采用带同步复位功能的寄存器单元来实现,这里要注意的是,复位后pc的值并非0而是代码段的起始地址0x00400000,如下所示:
// Program counter register.
// Holds the current instruction address. On reset it returns to the start of
// the code segment (0x00400000) rather than to zero.
//   clk      : clock, pc changes on the rising edge
//   rst      : synchronous reset, highest priority
//   en       : update enable, pc takes npc only when set
//   stall_pc : hold request, takes priority over en
//   npc      : next program counter value
//   pc       : current program counter value
module PC (
    input      [ 0 : 0] clk,
    input      [ 0 : 0] rst,
    input      [ 0 : 0] en,
    input      [ 0 : 0] stall_pc,
    input      [31 : 0] npc,
    output reg [31 : 0] pc
);
    // First address of the code segment.
    localparam [31:0] RESET_PC = 32'h00400000;

    // Power-up value so simulation starts from a defined address.
    initial begin
        pc <= RESET_PC;
    end

    // Priority: reset > stall > enabled update > hold.
    always @(posedge clk) begin
        if (rst)
            pc <= RESET_PC;
        else if (stall_pc)
            pc <= pc;
        else if (en)
            pc <= npc;
    end
endmodule
这里有一个信号量可能会比较迷惑:stall_pc,这个是在流水线CPU中用到的信号,这里不用管,直接删除与之相关的内容即可。
PC-ADD4
这个模块正如其名,就是对PC加4的操作,表示取下一条指令,故非常简单,如下所示:
// Sequential next-PC generator: npc = pc + 4 (one 32-bit instruction ahead).
module pc_adder(
    input      [31:0] pc,
    output reg [31:0] npc
);
    // Size of one instruction in bytes.
    localparam [31:0] INSTR_BYTES = 32'h4;

    always @(*) begin
        npc = pc + INSTR_BYTES;
    end
endmodule
DECODER
译码器,也是我们CPU中最为重要的模块之一,通过输入的指令,解码出各种信息,该模块的编写可参考RISC-V指令集架构来写。比如RISC-V指令的读写寄存器位置都是固定的,可以直接截取获得,其余的如下所示,我们来一一讲解:
// Instruction decoder for the simple single-cycle CPU (arithmetic/logic subset
// of RV32I plus lui, auipc and ebreak).
//
//   inst         : 32-bit instruction word
//   alu_op       : operation code for the ALU
//   imm          : decoded immediate
//   rf_ra0/ra1   : register-file read addresses (rs1 / rs2 fields)
//   rf_wa        : register-file write address (rd field)
//   rf_we        : register-file write enable
//   alu_src0_sel : 0 = register operand, 1 = PC
//   alu_src1_sel : 0 = register operand, 1 = immediate
//
// Every output is assigned on every path, so the block is purely
// combinational (no inferred latches). Any encoding that is not implemented,
// as well as ebreak, decodes to a NOP: rf_we = 0, all fields zero and
// alu_op = 5'b11111.
module DECODE (
    input      [31 : 0] inst,
    output reg [ 4 : 0] alu_op,
    output reg [31 : 0] imm,
    output reg [ 4 : 0] rf_ra0,
    output reg [ 4 : 0] rf_ra1,
    output reg [ 4 : 0] rf_wa,
    output reg [ 0 : 0] rf_we,
    output reg [ 0 : 0] alu_src0_sel,
    output reg [ 0 : 0] alu_src1_sel
);
    // Major opcodes handled by this decoder.
    localparam [6:0] OPC_OP     = 7'b0110011,
                     OPC_OP_IMM = 7'b0010011,
                     OPC_LUI    = 7'b0110111,
                     OPC_AUIPC  = 7'b0010111;

    // ALU operation codes (must match the ALU module).
    localparam [4:0] ALU_ADD  = 5'b00000,
                     ALU_SUB  = 5'b00010,
                     ALU_SLT  = 5'b00100,
                     ALU_SLTU = 5'b00101,
                     ALU_AND  = 5'b01001,
                     ALU_OR   = 5'b01010,
                     ALU_XOR  = 5'b01011,
                     ALU_SLL  = 5'b01110,
                     ALU_SRL  = 5'b01111,
                     ALU_SRA  = 5'b10000,
                     ALU_NONE = 5'b11111;

    localparam [6:0] F7_BASE = 7'b0000000,
                     F7_ALT  = 7'b0100000;

    // Fixed-position instruction fields.
    wire [6:0]  opcode = inst[6:0];
    wire [2:0]  funct3 = inst[14:12];
    wire [6:0]  funct7 = inst[31:25];

    // Immediate formats used by this subset.
    wire [31:0] imm_i     = {{20{inst[31]}}, inst[31:20]};
    wire [31:0] imm_shamt = {27'b0, inst[24:20]};
    wire [31:0] imm_u     = {inst[31:12], 12'b0};

    // Cleared when the encoding is not one of the implemented instructions.
    reg known;

    always @(*) begin
        // Start from "register-writing ALU instruction with standard fields";
        // each opcode only overrides what differs.
        known        = 1'b1;
        alu_op       = ALU_ADD;
        imm          = 32'b0;
        rf_ra0       = inst[19:15];
        rf_ra1       = inst[24:20];
        rf_wa        = inst[11:7];
        rf_we        = 1'b1;
        alu_src0_sel = 1'b0;
        alu_src1_sel = 1'b0;

        case (opcode)
            // Register-register operations.
            OPC_OP: begin
                case ({funct7, funct3})
                    {F7_BASE, 3'b000}: alu_op = ALU_ADD;
                    {F7_ALT,  3'b000}: alu_op = ALU_SUB;
                    {F7_BASE, 3'b010}: alu_op = ALU_SLT;
                    {F7_BASE, 3'b011}: alu_op = ALU_SLTU;
                    {F7_BASE, 3'b111}: alu_op = ALU_AND;
                    {F7_BASE, 3'b110}: alu_op = ALU_OR;
                    {F7_BASE, 3'b100}: alu_op = ALU_XOR;
                    {F7_BASE, 3'b001}: alu_op = ALU_SLL;
                    {F7_BASE, 3'b101}: alu_op = ALU_SRL;
                    {F7_ALT,  3'b101}: alu_op = ALU_SRA;
                    default:           known  = 1'b0;
                endcase
            end

            // Register-immediate operations.
            OPC_OP_IMM: begin
                alu_src1_sel = 1'b1;
                imm          = imm_i;
                case (funct3)
                    3'b000: alu_op = ALU_ADD;   // addi
                    3'b010: alu_op = ALU_SLT;   // slti
                    3'b011: alu_op = ALU_SLTU;  // sltiu
                    3'b100: alu_op = ALU_XOR;   // xori
                    3'b110: alu_op = ALU_OR;    // ori
                    3'b111: alu_op = ALU_AND;   // andi
                    3'b001: begin               // slli
                        alu_op = ALU_SLL;
                        imm    = imm_shamt;
                        if (funct7 != F7_BASE) known = 1'b0;
                    end
                    3'b101: begin               // srli / srai
                        imm = imm_shamt;
                        if (funct7 == F7_BASE)     alu_op = ALU_SRL;
                        else if (funct7 == F7_ALT) alu_op = ALU_SRA;
                        else                       known  = 1'b0;
                    end
                endcase
            end

            // lui: rd = x0 + imm_u, so read port 0 must address x0.
            OPC_LUI: begin
                imm          = imm_u;
                rf_ra0       = 5'b0;
                rf_ra1       = 5'b0;
                alu_src1_sel = 1'b1;
            end

            // auipc: rd = pc + imm_u.
            OPC_AUIPC: begin
                imm          = imm_u;
                alu_src0_sel = 1'b1;
                alu_src1_sel = 1'b1;
            end

            // ebreak (32'h00100073) and everything unimplemented end up here.
            default: known = 1'b0;
        endcase

        // NOP: nothing is written, nothing is selected.
        if (!known) begin
            alu_op       = ALU_NONE;
            imm          = 32'b0;
            rf_ra0       = 5'b0;
            rf_ra1       = 5'b0;
            rf_wa        = 5'b0;
            rf_we        = 1'b0;
            alu_src0_sel = 1'b0;
            alu_src1_sel = 1'b0;
        end
    end
endmodule
下面我们来说明一下:alu_op是我们传入ALU的信号,表示运算类型,如加法,比大小之类的,具体的在ALU模块有所说明;imm是立即数,表示传到ALU的数据;rf_ra0,rf_ra1表示读寄存器的地址;rf_wa表示写寄存器地址;rf_we表示写使能信号;alu_src0_sel与alu_src1_sel分别表示两个数据选择器的选择信号。同时注意到我们在最后加上了一个ebreak指令,这个什么也不做的指令表示程序的终止。
MUX2
mux2表示二选一数据选择器,这里使用两个是因为有的指令操作是对寄存器操作,如add,有的是对寄存器与立即数进行操作,如addi,还有的是对当前PC进行操作,如auipc。MUX2的实现原理十分简单,只需一个三目运算符即可:
// Parameterised 2-to-1 multiplexer.
//   sel = 0 -> res = src0
//   sel = 1 -> res = src1
module MUX1 # (
    parameter WIDTH = 32
)(
    input  [WIDTH-1 : 0] src0, src1,
    input  [      0 : 0] sel,
    output [WIDTH-1 : 0] res
);
    assign res = (sel == 1'b0) ? src0 : src1;
endmodule
REGFILES
寄存器堆,是我们CPU当中核心的数据储存部件,这里简单单周期CPU不需要考虑读写优先问题,后面流水线需要用到的时候会加以说明。寄存器堆的实现想必对学过verilog的人来说都不算难,也应该都写过,这里不做赘述,直接上代码:
// 32 x 32-bit register file.
//   - two combinational read ports (rf_ra0 -> rf_rd0, rf_ra1 -> rf_rd1)
//   - one extra combinational read port for debugging (debug_reg_ra -> debug_reg_rd)
//   - one write port, committed on the rising clock edge when rf_we is set
// Register 0 is never written, so it keeps the zero it is initialised with.
module REG_FILE (
    input  [ 0 : 0] clk,
    input  [ 4 : 0] rf_ra0,
    input  [ 4 : 0] rf_ra1,
    input  [ 4 : 0] rf_wa,
    input  [ 0 : 0] rf_we,
    input  [31 : 0] rf_wd,
    input  [ 4 : 0] debug_reg_ra,
    output [31 : 0] debug_reg_rd,
    output [31 : 0] rf_rd0,
    output [31 : 0] rf_rd1
);
    reg [31 : 0] reg_file [0 : 31];

    // Clear every register at start-up.
    integer idx;
    initial begin
        for (idx = 0; idx < 32; idx = idx + 1)
            reg_file[idx] = 0;
    end

    // A write happens only when enabled and the target is not register 0.
    wire write_hit = rf_we && (rf_wa != 0);

    always @(posedge clk) begin
        if (write_hit)
            reg_file[rf_wa] <= rf_wd;
    end

    // Asynchronous reads.
    assign debug_reg_rd = reg_file[debug_reg_ra];
    assign rf_rd1       = reg_file[rf_ra1];
    assign rf_rd0       = reg_file[rf_ra0];
endmodule
ALU
我们的运算模块,要实现如add,srl等功能,如下所示,直接使用verilog自带的运算符即可,当然也可以自己通过组合逻辑实现(不推荐,门电路太多增加能耗且极其费时):
// Combinational arithmetic/logic unit.
//   alu_src0, alu_src1 : 32-bit operands
//   alu_op             : operation selector (encodings below)
//   alu_res            : 32-bit result; 0 for any unlisted alu_op
// Shifts use only the low five bits of alu_src1 as the shift amount.
module ALU (
    input      [31 : 0] alu_src0,
    input      [31 : 0] alu_src1,
    input      [ 4 : 0] alu_op,
    output reg [31 : 0] alu_res
);

// Operation encodings (shared with the decoder).
`define ADD  5'b00000
`define SUB  5'b00010
`define SLT  5'b00100
`define SLTU 5'b00101
`define AND  5'b01001
`define OR   5'b01010
`define XOR  5'b01011
`define SLL  5'b01110
`define SRL  5'b01111
`define SRA  5'b10000
`define SRC0 5'b10001
`define SRC1 5'b10010

    always @(*) begin
        case (alu_op)
            // arithmetic
            `ADD  : begin alu_res = alu_src0 + alu_src1; end
            `SUB  : begin alu_res = alu_src0 - alu_src1; end
            // comparisons produce 1 or 0
            `SLT  : begin alu_res = ($signed(alu_src0) < $signed(alu_src1)); end
            `SLTU : begin alu_res = (alu_src0 < alu_src1); end
            // bitwise logic
            `AND  : begin alu_res = alu_src0 & alu_src1; end
            `OR   : begin alu_res = alu_src0 | alu_src1; end
            `XOR  : begin alu_res = alu_src0 ^ alu_src1; end
            // shifts; SRA replicates the sign bit of alu_src0
            `SLL  : begin alu_res = alu_src0 << alu_src1[4:0]; end
            `SRL  : begin alu_res = alu_src0 >> alu_src1[4:0]; end
            `SRA  : begin alu_res = $signed(alu_src0) >>> alu_src1[4:0]; end
            // pass-through of a single operand
            `SRC0 : begin alu_res = alu_src0; end
            `SRC1 : begin alu_res = alu_src1; end
            default: begin alu_res = 32'h0; end
        endcase
    end
endmodule
这里就把各个运算与前面的译码器译码出的alu_op对应起来了。
IM
这个模块比较特殊,叫做指令存储器,这里我们采用例化ip核的方式实现,因为这样可以直接通过导入coe文件实现指令的传入。但也要注意,代码上板运行框架(我们助教给的)中用到了数据存储器,所以最好也把数据存储器通过ip核提前例化,如下图所示:
这里要注意名字不要弄错。
上面的工作完成后,我们就可以通过数据通路来进行连接了,按照上图连接各个模块即可,如下所示:
`include "./include/config.v"
// Top level of the simple single-cycle CPU (register/immediate ALU
// instructions only; no data-memory access and no control transfer).
//
// Datapath: PC -> IM -> DECODE -> {REG_FILE, MUX1 x2} -> ALU -> REG_FILE.
//
//   clk, rst    : clock and synchronous reset
//   global_en   : when high the PC advances and the commit record is updated
//   imem_*      : instruction-memory port of the framework; this version
//                 fetches through its own IM instance and only mirrors the
//                 fetch address on imem_raddr
//   dmem_*      : data-memory port, unused here and tied to inactive values
//   commit_*    : one-cycle-delayed record of the executed instruction for
//                 the simulation/debug framework
//   debug_reg_* : extra register-file read port for the debugger
module CPU (
    input  [ 0 : 0] clk,
    input  [ 0 : 0] rst,
    input  [ 0 : 0] global_en,
/* ------------------------------ Memory (inst) ----------------------------- */
    output [31 : 0] imem_raddr,
    input  [31 : 0] imem_rdata,
/* ------------------------------ Memory (data) ----------------------------- */
    input  [31 : 0] dmem_rdata, // Unused
    output [ 0 : 0] dmem_we,    // Unused
    output [31 : 0] dmem_addr,  // Unused
    output [31 : 0] dmem_wdata, // Unused
/* ---------------------------------- Debug --------------------------------- */
    output [ 0 : 0] commit,
    output [31 : 0] commit_pc,
    output [31 : 0] commit_inst,
    output [ 0 : 0] commit_halt,
    output [ 0 : 0] commit_reg_we,
    output [ 4 : 0] commit_reg_wa,
    output [31 : 0] commit_reg_wd,
    output [ 0 : 0] commit_dmem_we,
    output [31 : 0] commit_dmem_wa,
    output [31 : 0] commit_dmem_wd,
    input  [ 4 : 0] debug_reg_ra,
    output [31 : 0] debug_reg_rd
);
    wire [31:0] cur_npc, cur_pc;
    wire [31:0] cur_inst;
    wire [4:0]  cur_op;
    wire [31:0] cur_imm;
    // Register addresses are 5 bits wide, matching the DECODE/REG_FILE ports.
    wire [4:0]  rf_ra0, rf_ra1;
    wire [4:0]  rf_wa;
    wire [0:0]  rf_we;
    wire [0:0]  alu_src0_sel, alu_src1_sel;
    wire [31:0] data_r0, data_r1, res0, res1, alu_res;

    // Framework memory ports: never leave an output floating.
    assign imem_raddr = cur_pc;
    assign dmem_we    = 1'b0;
    assign dmem_addr  = 32'h0;
    assign dmem_wdata = 32'h0;

    PC my_pc (
        .clk      (clk),
        .rst      (rst),
        .en       (global_en), // PC only advances while global_en is high
        .stall_pc (1'b0),      // stalling is a pipeline feature, never used here
        .npc      (cur_npc),
        .pc       (cur_pc)
    );

    pc_adder my_adder (
        .pc  (cur_pc),
        .npc (cur_npc)
    );

    IM cpu_im (
        .clk   (clk),
        .we    (global_en),
        .ins_d (cur_pc),
        .inst  (cur_inst)
    );

    DECODE my_decoder (
        .inst         (cur_inst),
        .alu_op       (cur_op),
        .imm          (cur_imm),
        .rf_ra0       (rf_ra0),
        .rf_ra1       (rf_ra1),
        .rf_wa        (rf_wa),
        .rf_we        (rf_we),
        .alu_src0_sel (alu_src0_sel),
        .alu_src1_sel (alu_src1_sel)
    );

    // ALU operand 0: register value or current PC (auipc).
    MUX1 my_mux0 (
        .src0 (data_r0),
        .src1 (cur_pc),
        .sel  (alu_src0_sel),
        .res  (res0)
    );

    // ALU operand 1: register value or immediate.
    MUX1 my_mux1 (
        .src0 (data_r1),
        .src1 (cur_imm),
        .sel  (alu_src1_sel),
        .res  (res1)
    );

    ALU my_alu (
        .alu_src0 (res0),
        .alu_src1 (res1),
        .alu_op   (cur_op),
        .alu_res  (alu_res)
    );

    REG_FILE my_reg (
        .clk          (clk),
        .rf_ra0       (rf_ra0),
        .rf_ra1       (rf_ra1),
        .rf_wa        (rf_wa),
        .rf_we        (rf_we),
        .rf_wd        (alu_res),
        .debug_reg_ra (debug_reg_ra),
        .debug_reg_rd (debug_reg_rd),
        .rf_rd0       (data_r0),
        .rf_rd1       (data_r1)
    );

/* -------------------------------------------------------------------------- */
/*                                   Commit                                   */
/* -------------------------------------------------------------------------- */
    // A single-cycle CPU retires one instruction every enabled cycle.
    wire [ 0 : 0] commit_if;
    assign commit_if = 1'H1;

    reg [ 0 : 0] commit_reg;
    reg [31 : 0] commit_pc_reg;
    reg [31 : 0] commit_inst_reg;
    reg [ 0 : 0] commit_halt_reg;
    reg [ 0 : 0] commit_reg_we_reg;
    reg [ 4 : 0] commit_reg_wa_reg;
    reg [31 : 0] commit_reg_wd_reg;
    reg [ 0 : 0] commit_dmem_we_reg;
    reg [31 : 0] commit_dmem_wa_reg;
    reg [31 : 0] commit_dmem_wd_reg;

    always @(posedge clk) begin
        if (rst) begin
            commit_reg         <= 1'H0;
            commit_pc_reg      <= 32'H0;
            commit_inst_reg    <= 32'H0;
            commit_halt_reg    <= 1'H0;
            commit_reg_we_reg  <= 1'H0;
            commit_reg_wa_reg  <= 5'H0;
            commit_reg_wd_reg  <= 32'H0;
            commit_dmem_we_reg <= 1'H0;
            commit_dmem_wa_reg <= 32'H0;
            commit_dmem_wd_reg <= 32'H0;
        end
        else if (global_en) begin
            commit_reg         <= commit_if;
            // Record the address of the instruction being retired, so that
            // commit_pc and commit_inst describe the same instruction.
            commit_pc_reg      <= cur_pc;
            commit_inst_reg    <= cur_inst;
            commit_halt_reg    <= (cur_inst == 32'h00100073); // ebreak
            commit_reg_we_reg  <= rf_we;
            commit_reg_wa_reg  <= rf_wa;
            commit_reg_wd_reg  <= alu_res;
            // No data-memory traffic in this version.
            commit_dmem_we_reg <= 0;
            commit_dmem_wa_reg <= 0;
            commit_dmem_wd_reg <= 0;
        end
    end

    assign commit         = commit_reg;
    assign commit_pc      = commit_pc_reg;
    assign commit_inst    = commit_inst_reg;
    assign commit_halt    = commit_halt_reg;
    assign commit_reg_we  = commit_reg_we_reg;
    assign commit_reg_wa  = commit_reg_wa_reg;
    assign commit_reg_wd  = commit_reg_wd_reg;
    assign commit_dmem_we = commit_dmem_we_reg;
    assign commit_dmem_wa = commit_dmem_wa_reg;
    assign commit_dmem_wd = commit_dmem_wd_reg;
endmodule
诶,这里可能就要问了,怎么突然冒出来这么多信号,不用慌,这些commit信号都是给仿真框架使用的,如果我们直接使用仿真,只需初始化一个时钟信号和使能信号传到PC模块即可使用(注意:这里的IM模块是我自己写的一个模块,并非例化的ip核,其实直接使用就可以了,我是自己又写了个例化IP核的模块,实际完全没必要,如下所示)
// Instruction-memory wrapper around the INST_MEM IP core.
// Converts the byte address of an instruction into a word index relative to
// the start of the code segment (0x00400000) and reads that word.
//   clk   : clock forwarded to the memory IP
//   we    : not used inside this wrapper
//   ins_d : byte address of the instruction (the current PC)
//   inst  : instruction word read from the memory
module IM(
    input         clk,
    input         we,
    input  [31:0] ins_d,
    output [31:0] inst
);
    // Byte distance from the first instruction.
    wire [31:0] code_offset = ins_d - 32'h00400000;

    // Divide by four (word index) and keep the nine address bits of the IP.
    wire [8:0] addr = code_offset[10:2];

    // The memory is read-only here: write enable is held at zero, so the
    // value on the data input is irrelevant.
    INST_MEM my_im(
        .clk (clk),
        .a   (addr),
        .d   (ins_d),
        .we  (0),
        .spo (inst)
    );
endmodule
至此,我们就完全实现了简单单周期CPU,确认仿真无误后烧写比特流,其中的top文件由助教提供,可参考前面github链接中内容,在FPGAOL(我们学校线上平台)上运行如下:
其中使用的是助教提供的测试文件,和RARS中结果比较是一致的,由此,我们就正式初步上路啦!
哦对了,关于上板后如何运行,参考这个:
PDU使用教程https://soc.ustc.edu.cn/COD/lab3/PDU_intro/
完整单周期CPU
下面让我们来实现一个具有完整功能的单周期CPU,我们前面已经实现的CPU不能够进行访存和指令跳转功能,我们下面来完善它,如下所示,是完整单周期CPU的数据通路:
好了让我们来看下和之前的简单单周期CPU有何区别,可以看出,在PC部分加上了PCMUX,这是因为下一次更新PC就有可能不是+4了,而是指令跳转的地址了,同时我们增加了branch跳转模块,这个我们后面详细讲,还有访存单元SL_UNIT和四选一数据选择器,那他们都是用来做什么的呢,下面我们来一一介绍。
PCMUX
如前面所讲,我们需要一个选择器来选择下次更新PC的值,这里要注意对应关系,而选择信号npc_sel则由我们的跳转模块branch来产生,下面是具体代码:
// Next-PC selector.
//   npc_sel = 2'b01 -> pc_offset (taken branch / jal target)
//   npc_sel = 2'b10 -> pc_j      (jalr target)
//   otherwise       -> pc_add4   (sequential execution)
module npc_mux(
    input  [31:0] pc_add4,
    input  [31:0] pc_offset,
    input  [31:0] pc_j,
    input  [1:0]  npc_sel,
    output [31:0] npc
);
    reg [31:0] chosen;

    always @(*) begin
        if (npc_sel == 2'b01)
            chosen = pc_offset;
        else if (npc_sel == 2'b10)
            chosen = pc_j;
        else
            chosen = pc_add4;
    end

    assign npc = chosen;
endmodule
BRANCH
这个branch模块是我们实现跳转的核心所在,想要实现跳转,且是不同类型的跳转,我们首先肯定要从译码器中得到这条指令,然后传入对应的信号,这里是br_type,表示不同的跳转类型,然后进行判断是否要进行跳转,那如何判断呢?注意到我们的数据通路中,将传入alu的两个操作数也同时传到了branch当中,因此,我们直接在这里面进行判断即可,如下所示,和前面的PCMUX结合起来即可:
// Branch/jump decision unit.
//   br_type : kind of control transfer (encodings below); any other value
//             means "not a branch"
//   br_src0 : first comparison operand
//   br_src1 : second comparison operand
//   npc_sel : next-PC select
//               2'b00 sequential (pc + 4)
//               2'b01 branch taken / jal
//               2'b10 jalr
module BRANCH(
    input        [ 3 : 0] br_type,
    input signed [31 : 0] br_src0,
    input signed [31 : 0] br_src1,
    output reg   [ 1 : 0] npc_sel
);
`define BEQ  4'B0000
`define BNE  4'B0001
`define BLT  4'B0010
`define BGE  4'B0100
`define BLTU 4'B0011
`define BGEU 4'B0110
`define JAL  4'B1000
`define JALR 4'B1001

    // The ports are declared signed, so a plain comparison is a signed one.
    // BLTU/BGEU need the same bits interpreted as unsigned values.
    wire [31:0] src0_u = $unsigned(br_src0);
    wire [31:0] src1_u = $unsigned(br_src1);

    always @(*) begin
        case (br_type)
            `BEQ:    npc_sel = (br_src0 == br_src1) ? 2'b01 : 2'b00;
            `BNE:    npc_sel = (br_src0 != br_src1) ? 2'b01 : 2'b00;
            `BLT:    npc_sel = (br_src0 <  br_src1) ? 2'b01 : 2'b00; // signed
            `BGE:    npc_sel = (br_src0 >= br_src1) ? 2'b01 : 2'b00; // signed
            `BLTU:   npc_sel = (src0_u  <  src1_u)  ? 2'b01 : 2'b00; // unsigned
            `BGEU:   npc_sel = (src0_u  >= src1_u)  ? 2'b01 : 2'b00; // unsigned
            `JAL:    npc_sel = 2'b01;
            `JALR:   npc_sel = 2'b10;
            default: npc_sel = 2'b00;
        endcase
    end
endmodule
SL_UNIT
这个模块是访存控制单元,即实现对数据储存器的管理,是读还是写之类的问题,要注意的是读的数据可能是半字之类的情况,我们需要对相应数据进行处理,我们直接上代码来说:
// Store/load unit: adapts byte and halfword accesses to a word-wide memory.
//   addr        : byte address of the access; only addr[1:0] (the byte offset
//                 inside the word) is used here
//   dmem_access : access type (encodings below); any other value = no access
//   rd_in       : word currently stored at the addressed memory location
//   wd_in       : register value to be stored (low bits are used for sh/sb)
//   rd_out      : load result, sign- or zero-extended to 32 bits
//   wd_out      : word to write back (rd_in with the stored bytes replaced)
//   wd_we       : memory write enable
//
// All outputs are assigned on every path, so no latches are inferred.
// A halfword at byte offset 3 would span two memory words, which this unit
// cannot access: such a load returns 0 and such a store is suppressed
// (wd_we = 0, wd_out = rd_in).
module SLU (
    input      [31 : 0] addr,
    input      [ 3 : 0] dmem_access,
    input      [31 : 0] rd_in,
    input      [31 : 0] wd_in,
    output reg [31 : 0] rd_out,
    output reg [31 : 0] wd_out,
    output reg [ 0 : 0] wd_we
);
`define LW  4'B0000
`define LH  4'B0001
`define LB  4'B0010
`define LHU 4'B0100
`define LBU 4'B0011
`define SW  4'B1000
`define SH  4'B1001
`define SB  4'B1011

    // Byte offset inside the 32-bit word (equivalent to addr % 4).
    wire [1:0] offset = addr[1:0];

    reg [ 7:0] load_byte;  // byte selected by offset
    reg [15:0] load_half;  // halfword starting at offset
    reg        half_ok;    // halfword fits inside this word
    reg [31:0] merged_byte; // rd_in with wd_in[7:0] inserted at offset
    reg [31:0] merged_half; // rd_in with wd_in[15:0] inserted at offset

    // Lane selection / merging, independent of the access type.
    always @(*) begin
        half_ok = 1'b1;
        case (offset)
            2'd0: begin
                load_byte   = rd_in[7:0];
                load_half   = rd_in[15:0];
                merged_byte = {rd_in[31:8], wd_in[7:0]};
                merged_half = {rd_in[31:16], wd_in[15:0]};
            end
            2'd1: begin
                load_byte   = rd_in[15:8];
                load_half   = rd_in[23:8];
                merged_byte = {rd_in[31:16], wd_in[7:0], rd_in[7:0]};
                merged_half = {rd_in[31:24], wd_in[15:0], rd_in[7:0]};
            end
            2'd2: begin
                load_byte   = rd_in[23:16];
                load_half   = rd_in[31:16];
                merged_byte = {rd_in[31:24], wd_in[7:0], rd_in[15:0]};
                merged_half = {wd_in[15:0], rd_in[15:0]};
            end
            default: begin // offset 3
                load_byte   = rd_in[31:24];
                load_half   = 16'h0;   // would cross the word boundary
                merged_byte = {wd_in[7:0], rd_in[23:0]};
                merged_half = rd_in;   // leave memory unchanged
                half_ok     = 1'b0;
            end
        endcase
    end

    // Access-type handling.
    always @(*) begin
        // Idle values; each access overrides only what it needs.
        rd_out = 32'h0;
        wd_out = 32'h0;
        wd_we  = 1'b0;
        case (dmem_access)
            `LW:  rd_out = rd_in;
            `LH:  rd_out = {{16{load_half[15]}}, load_half};
            `LB:  rd_out = {{24{load_byte[7]}},  load_byte};
            `LHU: rd_out = {16'h0, load_half};
            `LBU: rd_out = {24'h0, load_byte};
            `SW: begin
                wd_out = wd_in;
                wd_we  = 1'b1;
            end
            `SH: begin
                wd_out = merged_half;
                wd_we  = half_ok;
            end
            `SB: begin
                wd_out = merged_byte;
                wd_we  = 1'b1;
            end
            default: ; // no memory access
        endcase
    end
endmodule
这里的addr是访存地址,来自ALU的计算结果,也就是我们预期要访问的内存位置;dmem_access表示访存类型,即是lw还是lb之类的。这里通过对地址模四得到字内的字节偏移量,然后对相应的输入输出进行操作即可。
寄存器写回选择器
这里所谓写回选择器,其实无非就是一个简单的四选一数据选择器罢了,根据实际需要选择数据写入我们的寄存器堆即可,代码如下:
// Parameterised 4-to-1 multiplexer, used as the register write-back selector.
//
//   sel : 00       01       10         11
//   res : src0     src1     src2       src3
//   use : pc_add4  alu_res  dmem_data  0
module MUX2 # (
    parameter WIDTH = 32
)(
    input  [WIDTH-1 : 0] src0, src1, src2, src3,
    input  [      1 : 0] sel,
    output [WIDTH-1 : 0] res
);
    // First level picks inside each pair with sel[0] ...
    wire [WIDTH-1 : 0] low_pair  = sel[0] ? src1 : src0;
    wire [WIDTH-1 : 0] high_pair = sel[0] ? src3 : src2;

    // ... second level picks the pair with sel[1].
    assign res = sel[1] ? high_pair : low_pair;
endmodule
修改后的DECODER
诶,注意到没,我们前面说的如branch等模块的控制信号都是根据我们输入的指令来决定的,由此,我们需要更新decoder,让它能生成我们想要的br_type,dmem_access和mux2的sel等,同时我们也要增加新的指令了,如下所示:
// Instruction decoder for the complete single-cycle CPU (RV32I integer
// computation, loads/stores, branches, jal/jalr and ebreak).
//
//   inst         : 32-bit instruction word
//   alu_op       : ALU operation code
//   dmem_access  : access type for the store/load unit, 4'b1111 = none
//   imm          : decoded, sign/zero-extended immediate
//   rf_ra0/ra1   : register-file read addresses (rs1 / rs2 fields)
//   rf_wa        : register-file write address (rd field)
//   rf_we        : register-file write enable
//   rf_wd_sel    : write-back source: 00 pc+4, 01 ALU result, 10 load data, 11 zero
//   alu_src0_sel : 0 = register operand, 1 = PC
//   alu_src1_sel : 0 = register operand, 1 = immediate
//   br_type      : control-transfer type for the branch unit, 4'b1111 = none
//
// Every output is assigned on every path, so the block is purely
// combinational (no inferred latches). Any encoding that is not implemented,
// as well as ebreak, decodes to a NOP: no register write, no memory access,
// no branch, alu_op = 5'b11111 and all address/immediate fields zero.
module DECODER (
    input      [31 : 0] inst,
    output reg [ 4 : 0] alu_op,
    output     [ 3 : 0] dmem_access,
    output reg [31 : 0] imm,
    output     [ 4 : 0] rf_ra0,
    output     [ 4 : 0] rf_ra1,
    output     [ 4 : 0] rf_wa,
    output     [ 0 : 0] rf_we,
    output reg [ 1 : 0] rf_wd_sel,
    output     [ 0 : 0] alu_src0_sel,
    output     [ 0 : 0] alu_src1_sel,
    output     [ 3 : 0] br_type
);
    // Major opcodes.
    localparam [6:0] OPC_OP     = 7'b0110011,
                     OPC_OP_IMM = 7'b0010011,
                     OPC_LUI    = 7'b0110111,
                     OPC_AUIPC  = 7'b0010111,
                     OPC_LOAD   = 7'b0000011,
                     OPC_STORE  = 7'b0100011,
                     OPC_JALR   = 7'b1100111,
                     OPC_JAL    = 7'b1101111,
                     OPC_BRANCH = 7'b1100011;

    // ALU operation codes (must match the ALU module).
    localparam [4:0] ALU_ADD  = 5'b00000,
                     ALU_SUB  = 5'b00010,
                     ALU_SLT  = 5'b00100,
                     ALU_SLTU = 5'b00101,
                     ALU_AND  = 5'b01001,
                     ALU_OR   = 5'b01010,
                     ALU_XOR  = 5'b01011,
                     ALU_SLL  = 5'b01110,
                     ALU_SRL  = 5'b01111,
                     ALU_SRA  = 5'b10000,
                     ALU_NONE = 5'b11111;

    // Idle codes for the store/load unit and the branch unit.
    localparam [3:0] MEM_NONE = 4'b1111,
                     BR_NONE  = 4'b1111;

    // Write-back selector values.
    localparam [1:0] WB_PC4  = 2'b00,
                     WB_ALU  = 2'b01,
                     WB_MEM  = 2'b10,
                     WB_ZERO = 2'b11;

    localparam [6:0] F7_BASE = 7'b0000000,
                     F7_ALT  = 7'b0100000;

    // Fixed-position instruction fields.
    wire [6:0] opcode = inst[6:0];
    wire [2:0] funct3 = inst[14:12];
    wire [6:0] funct7 = inst[31:25];

    // Immediate formats, each exactly 32 bits wide.
    wire [31:0] imm_i     = {{20{inst[31]}}, inst[31:20]};
    wire [31:0] imm_shamt = {27'b0, inst[24:20]};
    wire [31:0] imm_s     = {{20{inst[31]}}, inst[31:25], inst[11:7]};
    wire [31:0] imm_b     = {{19{inst[31]}}, inst[31], inst[7], inst[30:25], inst[11:8], 1'b0};
    wire [31:0] imm_u     = {inst[31:12], 12'b0};
    wire [31:0] imm_j     = {{11{inst[31]}}, inst[31], inst[19:12], inst[20], inst[30:21], 1'b0};

    reg [3:0] dmem_access_reg, br_type_reg;
    reg [4:0] rf_ra0_reg, rf_ra1_reg, rf_wa_reg;
    reg [0:0] rf_we_reg, alu_src0_sel_reg, alu_src1_sel_reg;

    // Cleared when the encoding is not one of the implemented instructions.
    reg known;

    always @(*) begin
        // Common starting point: standard register fields, ALU add, nothing
        // written, no memory access, no branch. Each opcode overrides only
        // what differs.
        known            = 1'b1;
        alu_op           = ALU_ADD;
        dmem_access_reg  = MEM_NONE;
        imm              = 32'b0;
        rf_ra0_reg       = inst[19:15];
        rf_ra1_reg       = inst[24:20];
        rf_wa_reg        = inst[11:7];
        rf_we_reg        = 1'b0;
        rf_wd_sel        = WB_PC4;
        alu_src0_sel_reg = 1'b0;
        alu_src1_sel_reg = 1'b0;
        br_type_reg      = BR_NONE;

        case (opcode)
            // Register-register operations.
            OPC_OP: begin
                rf_we_reg = 1'b1;
                rf_wd_sel = WB_ALU;
                case ({funct7, funct3})
                    {F7_BASE, 3'b000}: alu_op = ALU_ADD;
                    {F7_ALT,  3'b000}: alu_op = ALU_SUB;
                    {F7_BASE, 3'b010}: alu_op = ALU_SLT;
                    {F7_BASE, 3'b011}: alu_op = ALU_SLTU;
                    {F7_BASE, 3'b111}: alu_op = ALU_AND;
                    {F7_BASE, 3'b110}: alu_op = ALU_OR;
                    {F7_BASE, 3'b100}: alu_op = ALU_XOR;
                    {F7_BASE, 3'b001}: alu_op = ALU_SLL;
                    {F7_BASE, 3'b101}: alu_op = ALU_SRL;
                    {F7_ALT,  3'b101}: alu_op = ALU_SRA;
                    default:           known  = 1'b0;
                endcase
            end

            // Register-immediate operations.
            OPC_OP_IMM: begin
                rf_we_reg        = 1'b1;
                rf_wd_sel        = WB_ALU;
                alu_src1_sel_reg = 1'b1;
                imm              = imm_i;
                case (funct3)
                    3'b000: alu_op = ALU_ADD;   // addi
                    3'b010: alu_op = ALU_SLT;   // slti
                    3'b011: alu_op = ALU_SLTU;  // sltiu
                    3'b100: alu_op = ALU_XOR;   // xori
                    3'b110: alu_op = ALU_OR;    // ori
                    3'b111: alu_op = ALU_AND;   // andi
                    3'b001: begin               // slli
                        alu_op = ALU_SLL;
                        imm    = imm_shamt;
                        if (funct7 != F7_BASE) known = 1'b0;
                    end
                    3'b101: begin               // srli / srai
                        imm = imm_shamt;
                        if (funct7 == F7_BASE)     alu_op = ALU_SRL;
                        else if (funct7 == F7_ALT) alu_op = ALU_SRA;
                        else                       known  = 1'b0;
                    end
                endcase
            end

            // lui: rd = x0 + imm_u, so read port 0 must address x0.
            OPC_LUI: begin
                rf_we_reg        = 1'b1;
                rf_wd_sel        = WB_ALU;
                imm              = imm_u;
                rf_ra0_reg       = 5'b0;
                rf_ra1_reg       = 5'b0;
                alu_src1_sel_reg = 1'b1;
            end

            // auipc: rd = pc + imm_u.
            OPC_AUIPC: begin
                rf_we_reg        = 1'b1;
                rf_wd_sel        = WB_ALU;
                imm              = imm_u;
                alu_src0_sel_reg = 1'b1;
                alu_src1_sel_reg = 1'b1;
            end

            // Loads: address = rs1 + imm_i, rd = memory data.
            OPC_LOAD: begin
                rf_we_reg        = 1'b1;
                rf_wd_sel        = WB_MEM;
                imm              = imm_i;
                alu_src1_sel_reg = 1'b1;
                case (funct3)
                    3'b010:  dmem_access_reg = 4'b0000; // lw
                    3'b001:  dmem_access_reg = 4'b0001; // lh
                    3'b000:  dmem_access_reg = 4'b0010; // lb
                    3'b101:  dmem_access_reg = 4'b0100; // lhu
                    3'b100:  dmem_access_reg = 4'b0011; // lbu
                    default: known = 1'b0;
                endcase
            end

            // Stores: address = rs1 + imm_s, no register write.
            OPC_STORE: begin
                rf_wd_sel        = WB_ZERO;
                imm              = imm_s;
                alu_src1_sel_reg = 1'b1;
                case (funct3)
                    3'b010:  dmem_access_reg = 4'b1000; // sw
                    3'b001:  dmem_access_reg = 4'b1001; // sh
                    3'b000:  dmem_access_reg = 4'b1011; // sb
                    default: known = 1'b0;
                endcase
            end

            // jalr: target = rs1 + imm_i, rd = pc + 4.
            OPC_JALR: begin
                if (funct3 == 3'b000) begin
                    rf_we_reg        = 1'b1;
                    rf_wd_sel        = WB_PC4;
                    imm              = imm_i;
                    alu_src1_sel_reg = 1'b1;
                    br_type_reg      = 4'b1001;
                end
                else begin
                    known = 1'b0;
                end
            end

            // jal (and the j pseudo-instruction): target = pc + imm_j, rd = pc + 4.
            OPC_JAL: begin
                rf_we_reg        = 1'b1;
                rf_wd_sel        = WB_PC4;
                imm              = imm_j;
                alu_src0_sel_reg = 1'b1;
                alu_src1_sel_reg = 1'b1;
                br_type_reg      = 4'b1000;
            end

            // Conditional branches: the ALU computes the target pc + imm_b.
            OPC_BRANCH: begin
                imm              = imm_b;
                alu_src0_sel_reg = 1'b1;
                alu_src1_sel_reg = 1'b1;
                case (funct3)
                    3'b000:  br_type_reg = 4'b0000; // beq
                    3'b001:  br_type_reg = 4'b0001; // bne
                    3'b100:  br_type_reg = 4'b0010; // blt
                    3'b101:  br_type_reg = 4'b0100; // bge
                    3'b110:  br_type_reg = 4'b0011; // bltu
                    3'b111:  br_type_reg = 4'b0110; // bgeu
                    default: known = 1'b0;
                endcase
            end

            // ebreak (32'h00100073) and everything unimplemented end up here.
            default: known = 1'b0;
        endcase

        // NOP: must not write a register, touch memory or redirect the PC.
        if (!known) begin
            alu_op           = ALU_NONE;
            dmem_access_reg  = MEM_NONE;
            imm              = 32'b0;
            rf_ra0_reg       = 5'b0;
            rf_ra1_reg       = 5'b0;
            rf_wa_reg        = 5'b0;
            rf_we_reg        = 1'b0;
            rf_wd_sel        = WB_PC4;
            alu_src0_sel_reg = 1'b0;
            alu_src1_sel_reg = 1'b0;
            br_type_reg      = BR_NONE;
        end
    end

    assign dmem_access  = dmem_access_reg;
    assign rf_ra0       = rf_ra0_reg;
    assign rf_ra1       = rf_ra1_reg;
    assign rf_wa        = rf_wa_reg;
    assign rf_we        = rf_we_reg;
    assign alu_src0_sel = alu_src0_sel_reg;
    assign alu_src1_sel = alu_src1_sel_reg;
    assign br_type      = br_type_reg;
endmodule
这里注意,当一条指令用到了我们不需要的模块时,我采用了闲置信号,比如对dmem_access来说,我将其定义为4'b1111,表示什么也不做的意思。
这样一来,我们就实现了完整单周期CPU的所有模块编写,将他们连在一起,就不用我多加赘述了吧:
// Single-cycle RISC-V CPU top level.
//
// Wires together the PC register, PC+4 adder, next-PC mux, instruction and
// data memories, decoder, operand muxes, ALU, branch unit, store/load unit
// (SLU) and register file, and registers the commit_* debug outputs once per
// enabled clock cycle.
//
// Ports:
//   clk, rst, global_en : clock, synchronous reset, and global enable (the PC
//                         and the commit registers only update when it is 1).
//   commit_*            : registered snapshot of the instruction just executed.
//   debug_reg_ra/rd     : extra read port into the register file.
//
// NOTE(review): imem_raddr, imem_rdata, dmem_rdata, dmem_we, dmem_addr and
// dmem_wdata are declared but neither read nor driven in this module; the
// memories are instantiated internally (my_im / my_dm). Confirm whether the
// surrounding framework expects these ports to be connected.
module CPU (
    input  [ 0 : 0] clk,
    input  [ 0 : 0] rst,
    input  [ 0 : 0] global_en,
    /* ------------------------------ Memory (inst) ----------------------------- */
    output [31 : 0] imem_raddr,
    input  [31 : 0] imem_rdata,
    /* ------------------------------ Memory (data) ----------------------------- */
    input  [31 : 0] dmem_rdata,
    output [ 0 : 0] dmem_we,
    output [31 : 0] dmem_addr,
    output [31 : 0] dmem_wdata,
    /* ---------------------------------- Debug --------------------------------- */
    output [ 0 : 0] commit,
    output [31 : 0] commit_pc,
    output [31 : 0] commit_inst,
    output [ 0 : 0] commit_halt,
    output [ 0 : 0] commit_reg_we,
    output [ 4 : 0] commit_reg_wa,
    output [31 : 0] commit_reg_wd,
    output [ 0 : 0] commit_dmem_we,
    output [31 : 0] commit_dmem_wa,
    output [31 : 0] commit_dmem_wd,
    input  [ 4 : 0] debug_reg_ra,
    output [31 : 0] debug_reg_rd
);
    wire [31:0] cur_npc, cur_pc, pc_add4;
    wire [31:0] cur_inst;
    wire [4:0]  cur_op;
    wire [31:0] cur_imm;
    wire [4:0]  rf_ra0, rf_ra1;
    wire [4:0]  rf_wa;
    wire [0:0]  rf_we, dmem_en;
    wire [0:0]  alu_src0_sel, alu_src1_sel;
    wire [31:0] data_r0, data_r1, res0, res1, alu_res, wd_res;
    wire [31:0] dmem_rd_out, dmem_rd_in, dmem_wd_out;
    wire [1:0]  npc_sel, rf_wd_sel;
    wire [3:0]  dmem_access, br_type;

    PC my_pc (
        .clk (clk),
        .rst (rst),
        .en  (global_en),   // The PC only advances (the CPU only executes) while global_en is high.
        .npc (cur_npc),
        .pc  (cur_pc)
    );

    pc_adder my_adder (
        .pc  (cur_pc),
        .npc (pc_add4)
    );

    npc_mux my_npc_mux (
        .pc_add4   (pc_add4),
        .pc_offset (alu_res),
        // Clear bit 0 of the computed jump target. The original "& -1" masked
        // with all ones and therefore did nothing.
        .pc_j      (alu_res & ~32'h1),
        .npc_sel   (npc_sel),
        .npc       (cur_npc)
    );

    // Instruction memory is word-addressed; the code segment starts at 0x00400000.
    INST_MEM my_im (
        .clk (clk),
        .a   ((cur_pc - 32'h00400000) / 4),
        .d   (0),
        .we  (0),
        .spo (cur_inst)
    );

    // Data memory is word-addressed; the data segment starts at 0x80000000.
    DATA_MEM my_dm (
        .clk (clk),
        .a   ((alu_res - 32'h80000000) / 4),
        .d   (dmem_wd_out),
        .we  (dmem_en),
        .spo (dmem_rd_in)
    );

    DECODER my_decoder (
        .inst         (cur_inst),
        .alu_op       (cur_op),
        .dmem_access  (dmem_access),
        .imm          (cur_imm),
        .rf_ra0       (rf_ra0),
        .rf_ra1       (rf_ra1),
        .rf_wa        (rf_wa),
        .rf_we        (rf_we),
        .rf_wd_sel    (rf_wd_sel),
        .alu_src0_sel (alu_src0_sel),
        .alu_src1_sel (alu_src1_sel),
        .br_type      (br_type)
    );

    // ALU operand 0: register rs1 data or the current PC.
    MUX1 my_mux0 (
        .src0 (data_r0),
        .src1 (cur_pc),
        .sel  (alu_src0_sel),
        .res  (res0)
    );

    // ALU operand 1: register rs2 data or the immediate.
    MUX1 my_mux1 (
        .src0 (data_r1),
        .src1 (cur_imm),
        .sel  (alu_src1_sel),
        .res  (res1)
    );

    // Register write-back data: PC+4, ALU result, or loaded data.
    MUX2 my_mux (
        .src0 (pc_add4),
        .src1 (alu_res),
        .src2 (dmem_rd_out),
        .src3 (0),
        .sel  (rf_wd_sel),
        .res  (wd_res)
    );

    BRANCH my_branch (
        .br_type (br_type),
        .br_src0 (data_r0),
        .br_src1 (data_r1),
        .npc_sel (npc_sel)
    );

    SLU my_slu (
        .addr        (alu_res),
        .dmem_access (dmem_access),
        .rd_in       (dmem_rd_in),
        .wd_in       (data_r1),
        .rd_out      (dmem_rd_out),
        .wd_out      (dmem_wd_out),
        .wd_we       (dmem_en)
    );

    ALU my_alu (
        .alu_src0 (res0),
        .alu_src1 (res1),
        .alu_op   (cur_op),
        .alu_res  (alu_res)
    );

    REG_FILE my_reg (
        .clk          (clk),
        .rf_ra0       (rf_ra0),
        .rf_ra1       (rf_ra1),
        .rf_wa        (rf_wa),
        .rf_we        (rf_we),
        .rf_wd        (wd_res),
        .debug_reg_ra (debug_reg_ra),
        .debug_reg_rd (debug_reg_rd),
        .rf_rd0       (data_r0),
        .rf_rd1       (data_r1)
    );

    /* -------------------------------------------------------------------------- */
    /*                                   Commit                                   */
    /* -------------------------------------------------------------------------- */
    wire [ 0 : 0] commit_if;
    assign commit_if = 1'H1;    // every enabled cycle retires one instruction

    reg  [ 0 : 0] commit_reg;
    reg  [31 : 0] commit_pc_reg;
    reg  [31 : 0] commit_inst_reg;
    reg  [ 0 : 0] commit_halt_reg;
    reg  [ 0 : 0] commit_reg_we_reg;
    reg  [ 4 : 0] commit_reg_wa_reg;
    reg  [31 : 0] commit_reg_wd_reg;
    reg  [ 0 : 0] commit_dmem_we_reg;
    reg  [31 : 0] commit_dmem_wa_reg;
    reg  [31 : 0] commit_dmem_wd_reg;

    always @(posedge clk) begin
        if (rst) begin
            commit_reg         <= 1'H0;
            commit_pc_reg      <= 32'H0;
            commit_inst_reg    <= 32'H0;
            commit_halt_reg    <= 1'H0;
            commit_reg_we_reg  <= 1'H0;
            commit_reg_wa_reg  <= 5'H0;
            commit_reg_wd_reg  <= 32'H0;
            commit_dmem_we_reg <= 1'H0;
            commit_dmem_wa_reg <= 32'H0;
            commit_dmem_wd_reg <= 32'H0;
        end
        else if (global_en) begin
            commit_reg         <= commit_if;
            commit_pc_reg      <= cur_pc;
            commit_inst_reg    <= cur_inst;
            commit_halt_reg    <= (cur_inst == 32'h00100073);   // ebreak
            commit_reg_we_reg  <= rf_we;
            commit_reg_wa_reg  <= rf_wa;
            // Report the value actually written to the register file (the
            // output of the write-back mux), not the raw ALU result, so that
            // PC+4 and load write-backs are reported correctly.
            commit_reg_wd_reg  <= wd_res;
            commit_dmem_we_reg <= dmem_en;
            commit_dmem_wa_reg <= alu_res;
            commit_dmem_wd_reg <= dmem_wd_out;
        end
    end

    assign commit         = commit_reg;
    assign commit_pc      = commit_pc_reg;
    assign commit_inst    = commit_inst_reg;
    assign commit_halt    = commit_halt_reg;
    assign commit_reg_we  = commit_reg_we_reg;
    assign commit_reg_wa  = commit_reg_wa_reg;
    assign commit_reg_wd  = commit_reg_wd_reg;
    assign commit_dmem_we = commit_dmem_we_reg;
    assign commit_dmem_wa = commit_dmem_wa_reg;
    assign commit_dmem_wd = commit_dmem_wd_reg;
endmodule
同样的烧写比特流上板运行这里就不做展示了,最后再展示。
流水线CPU
好了,有了上面我们写过的单周期CPU的基础,我们就可以将其改造为流水线CPU了,我们期望的流水线数据通路如下所示:
这张图已经让人很头大了,但其实还没画完,这个图为了简洁,略去了控制信号以及两个模块,分别是前递模块与段间寄存器控制模块,那我们与前面的单周期CPU做对比,会发现其实只是多了四个长框罢了,分别是IF/ID,ID/EX,EX/MEM与MEM/WB,这四个都是段间寄存器,用来进行不同时钟周期的数据传输功能,而我们把PCMUX放在了EX阶段,这也很好理解,它要的数据就是这个阶段才能完全产生的嘛 。
有了数据通路以及上面的解释,我们先来着手编写段间寄存器:
段间寄存器
结合数据通路,把对应的信号输入输出即可,这里注意,四个段间寄存器可以合并成一个,只需在例化时例化四个不同的即可,这里我为了方便区分,写了四个不同的。
IF/ID:
// IF/ID pipeline register.
//
// On reset, or on flush while enabled, the stage is loaded with a bubble:
// PC = 0x00400000, PC+4 = 0x00400004, inst = 0x00000013 (addi x0, x0, 0) and
// commit = 0. While enabled and stalled the contents are held; otherwise the
// IF-stage inputs are captured. Nothing changes while en is low.
module IF_ID(
    input [0:0] clk,
    input [0:0] en,
    input [0:0] rst,
    input [31:0] pcadd4_if,
    input [31:0] pc_if,
    input [31:0] inst_if,
    input [0:0] stall,
    input [0:0] flush,
    input [0:0] commit_if,
    output reg [31:0] pcadd4_id,
    output reg [31:0] pc_id,
    output reg [31:0] inst_id,
    output reg [0:0] commit_id
);
    localparam [31:0] BUBBLE_PC   = 32'h00400000;
    localparam [31:0] BUBBLE_PC4  = 32'h00400004;
    localparam [31:0] BUBBLE_INST = 32'h00000013;

    always @(posedge clk) begin
        // Reset and flush load exactly the same bubble values.
        if (rst || (en && flush)) begin
            {pcadd4_id, pc_id, inst_id, commit_id} <=
                {BUBBLE_PC4, BUBBLE_PC, BUBBLE_INST, 1'b0};
        end
        else if (en) begin
            if (stall) begin
                // Stalled: keep the current contents.
            end
            else begin
                {pcadd4_id, pc_id, inst_id, commit_id} <=
                    {pcadd4_if, pc_if, inst_if, commit_if};
            end
        end
    end
endmodule
这里展示了IF/ID所用的所有信号量,其实不难看出,就是一个带暂停和清空功能的D触发器而已,这里的flush和stall信号是我们后面为了解决冒险问题而使用的:flush为1时把所有信号"清零"(不是物理上的清零,而是恢复到最初的状态,即什么也不做的状态,相当于一条nop指令);而stall为1时所有数据保持原值不变,即该段暂停一个周期。其余的三个段间寄存器类似,这里不再说明,直接给出代码:
ID/EX:
// ID/EX pipeline register.
//
// Carries the decoded instruction (operands, register addresses, immediate
// and control signals) from ID to EX. On reset, or on flush while enabled,
// the stage is loaded with a bubble: inst = 0x00000013 (addi x0, x0, 0), no
// register write, dmem_access = 4'b1111 and br_type = 4'b1111. While enabled
// and stalled the contents are held; otherwise the ID-stage inputs are
// captured. Nothing changes while en is low.
module ID_EX(
    input [0:0] clk,
    input [0:0] en,
    input [0:0] rst,
    input [31:0] pcadd4_id,
    input [31:0] pc_id,
    input [31:0] inst_id,
    input [31:0] rf_rd0_id,
    input [31:0] rf_rd1_id,
    input [4:0] rf_ra0_id,
    input [4:0] rf_ra1_id,
    input [31:0] imm_id,
    input [4:0] rf_wa_id,
    input [0:0] rf_we_id,
    input [0:0] stall,
    input [0:0] flush,
    input [0:0] commit_id,
    input [4:0] alu_op_id,
    input [3:0] dmem_access_id,
    input [3:0] br_type_id,
    input [1:0] rf_wd_sel_id,
    input [0:0] alu_src0_sel_id,
    input [0:0] alu_src1_sel_id,
    output reg [31:0] pcadd4_ex,
    output reg [31:0] pc_ex,
    output reg [31:0] inst_ex,
    output reg [31:0] rf_rd0_ex,
    output reg [31:0] rf_rd1_ex,
    output reg [4:0] rf_ra0_ex,
    output reg [4:0] rf_ra1_ex,
    output reg [31:0] imm_ex,
    output reg [4:0] rf_wa_ex,
    output reg [0:0] commit_ex,
    output reg [0:0] rf_we_ex,
    output reg [4:0] alu_op_ex,
    output reg [3:0] dmem_access_ex,
    output reg [3:0] br_type_ex,
    output reg [1:0] rf_wd_sel_ex,
    output reg [0:0] alu_src0_sel_ex,
    output reg [0:0] alu_src1_sel_ex
);
    always @(posedge clk) begin
        // Reset and flush load exactly the same bubble values.
        if (rst || (en && flush)) begin
            // Instruction identity
            pc_ex           <= 32'h00400000;
            pcadd4_ex       <= 32'h00400004;
            inst_ex         <= 32'h00000013;
            commit_ex       <= 1'b0;
            // Operands
            rf_ra0_ex       <= 5'd0;
            rf_ra1_ex       <= 5'd0;
            rf_rd0_ex       <= 32'd0;
            rf_rd1_ex       <= 32'd0;
            imm_ex          <= 32'd0;
            // Write-back control
            rf_wa_ex        <= 5'd0;
            rf_we_ex        <= 1'b0;
            rf_wd_sel_ex    <= 2'd0;
            // Execute / memory control
            alu_op_ex       <= 5'd0;
            alu_src0_sel_ex <= 1'b0;
            alu_src1_sel_ex <= 1'b0;
            dmem_access_ex  <= 4'b1111;
            br_type_ex      <= 4'b1111;
        end
        else if (en) begin
            if (stall) begin
                // Stalled: keep the current contents.
            end
            else begin
                // Instruction identity
                pc_ex           <= pc_id;
                pcadd4_ex       <= pcadd4_id;
                inst_ex         <= inst_id;
                commit_ex       <= commit_id;
                // Operands
                rf_ra0_ex       <= rf_ra0_id;
                rf_ra1_ex       <= rf_ra1_id;
                rf_rd0_ex       <= rf_rd0_id;
                rf_rd1_ex       <= rf_rd1_id;
                imm_ex          <= imm_id;
                // Write-back control
                rf_wa_ex        <= rf_wa_id;
                rf_we_ex        <= rf_we_id;
                rf_wd_sel_ex    <= rf_wd_sel_id;
                // Execute / memory control
                alu_op_ex       <= alu_op_id;
                alu_src0_sel_ex <= alu_src0_sel_id;
                alu_src1_sel_ex <= alu_src1_sel_id;
                dmem_access_ex  <= dmem_access_id;
                br_type_ex      <= br_type_id;
            end
        end
    end
endmodule
EX/MEM:
// EX/MEM pipeline register.
//
// Carries the ALU result, the store data (rf_rd1), and the write-back and
// memory-access controls from EX to MEM. On reset, or on flush while enabled,
// the stage is loaded with a bubble: inst = 0x00000013 (addi x0, x0, 0), no
// register write and dmem_access = 4'b1111. While enabled and stalled the
// contents are held; otherwise the EX-stage inputs are captured. Nothing
// changes while en is low.
module EX_MEM(
    input [0:0] clk,
    input [0:0] en,
    input [0:0] rst,
    input [31:0] pcadd4_ex,
    input [31:0] pc_ex,
    input [31:0] inst_ex,
    input [31:0] alu_res_ex,
    input [31:0] rf_rd1_ex,
    input [4:0] rf_wa_ex,
    input [0:0] stall,
    input [0:0] flush,
    input [0:0] commit_ex,
    input [0:0] rf_we_ex,
    input [1:0] rf_wd_sel_ex,
    input [3:0] dmem_access_ex,
    output reg [31:0] pcadd4_mem,
    output reg [31:0] alu_res_mem,
    output reg [31:0] rf_rd1_mem,
    output reg [4:0] rf_wa_mem,
    output reg [0:0] commit_mem,
    output reg [0:0] rf_we_mem,
    output reg [1:0] rf_wd_sel_mem,
    output reg [3:0] dmem_access_mem,
    output reg [31:0] pc_mem,
    output reg [31:0] inst_mem
);
    always @(posedge clk) begin
        // Reset and flush load exactly the same bubble values.
        if (rst || (en && flush)) begin
            // Instruction identity
            pc_mem          <= 32'h00400000;
            pcadd4_mem      <= 32'h00400004;
            inst_mem        <= 32'h00000013;
            commit_mem      <= 1'b0;
            // Data
            alu_res_mem     <= 32'd0;
            rf_rd1_mem      <= 32'd0;
            // Control
            rf_wa_mem       <= 5'd0;
            rf_we_mem       <= 1'b0;
            rf_wd_sel_mem   <= 2'd0;
            dmem_access_mem <= 4'b1111;
        end
        else if (en) begin
            if (stall) begin
                // Stalled: keep the current contents.
            end
            else begin
                // Instruction identity
                pc_mem          <= pc_ex;
                pcadd4_mem      <= pcadd4_ex;
                inst_mem        <= inst_ex;
                commit_mem      <= commit_ex;
                // Data
                alu_res_mem     <= alu_res_ex;
                rf_rd1_mem      <= rf_rd1_ex;
                // Control
                rf_wa_mem       <= rf_wa_ex;
                rf_we_mem       <= rf_we_ex;
                rf_wd_sel_mem   <= rf_wd_sel_ex;
                dmem_access_mem <= dmem_access_ex;
            end
        end
    end
endmodule
MEM/WB:
// MEM/WB pipeline register.
//
// Carries the three write-back candidates (PC+4, ALU result, loaded data) and
// the register write controls from MEM to WB. On reset, or on flush while
// enabled, the stage is loaded with a bubble: inst = 0x00000013
// (addi x0, x0, 0) and no register write. While enabled and stalled the
// contents are held; otherwise the MEM-stage inputs are captured. Nothing
// changes while en is low.
module MEM_WB(
    input [0:0] clk,
    input [0:0] en,
    input [0:0] rst,
    input [31:0] pcadd4_mem,
    input [31:0] pc_mem,
    input [31:0] inst_mem,
    input [31:0] alu_res_mem,
    input [31:0] dmem_rd_out_mem,
    input [4:0] rf_wa_mem,
    input [0:0] stall,
    input [0:0] flush,
    input [0:0] commit_mem,
    input [0:0] rf_we_mem,
    input [1:0] rf_wd_sel_mem,
    output reg [31:0] pcadd4_wb,
    output reg [31:0] alu_res_wb,
    output reg [31:0] dmem_rd_out_wb,
    output reg [4:0] rf_wa_wb,
    output reg [0:0] commit_wb,
    output reg [0:0] rf_we_wb,
    output reg [1:0] rf_wd_sel_wb,
    output reg [31:0] pc_wb,
    output reg [31:0] inst_wb
);
    always @(posedge clk) begin
        // Reset and flush load exactly the same bubble values.
        if (rst || (en && flush)) begin
            // Instruction identity
            pc_wb          <= 32'h00400000;
            pcadd4_wb      <= 32'h00400004;
            inst_wb        <= 32'h00000013;
            commit_wb      <= 1'b0;
            // Data
            alu_res_wb     <= 32'd0;
            dmem_rd_out_wb <= 32'd0;
            // Control
            rf_wa_wb       <= 5'd0;
            rf_we_wb       <= 1'b0;
            rf_wd_sel_wb   <= 2'd0;
        end
        else if (en) begin
            if (stall) begin
                // Stalled: keep the current contents.
            end
            else begin
                // Instruction identity
                pc_wb          <= pc_mem;
                pcadd4_wb      <= pcadd4_mem;
                inst_wb        <= inst_mem;
                commit_wb      <= commit_mem;
                // Data
                alu_res_wb     <= alu_res_mem;
                dmem_rd_out_wb <= dmem_rd_out_mem;
                // Control
                rf_wa_wb       <= rf_wa_mem;
                rf_we_wb       <= rf_we_mem;
                rf_wd_sel_wb   <= rf_wd_sel_mem;
            end
        end
    end
endmodule
前递模块
为了解决数据冒险,我们采用前递模块。那么首先,什么叫数据冒险?顾名思义,就是数据出现了冒险(不是),假如我们有两条连续的add指令,前一个把计算结果储存到了x1当中,而后一个要把x1的值加上另一个寄存器的值。显然我们期望的结果是后一个x1当中的值是我们前面计算过后的值,但在流水线中,我们知道,写回在WB阶段,这就导致了当下一个指令执行到EX阶段也就是要执行加法操作时,上一条指令还在MEM阶段,这就不能保证我们使用的数据是最新的,所以我们需要在mem阶段把上一个阶段即ex段产生的结果送回ex段来使用,这就是数据前递。那该如何判断前递呢?也很简单,只需判断MEM/WB段写使能为1,且写入非x0;EX段某读寄存器地址等于MEM/WB段的写地址即可。想一想,是不是这个道理,我们是把写使能一直传递下去的,到这里就发挥了它的用途,代码如下:
// Forwarding unit for the two EX-stage register operands.
//
// For each operand (index 0 and 1) the EX-stage read address is compared
// against the destination register of the instruction in MEM and in WB. A
// match is only considered when that instruction writes a register
// (rf_we_* = 1) and the destination is not x0. MEM has priority over WB.
//
// Outputs:
//   rf_rdN_fe : 1 when operand N must be replaced by forwarded data.
//   rf_rdN_fd : the forwarded data; driven to 0 when rf_rdN_fe is 0.
//
// Every output is given a default at the top of the combinational block so
// that no latch is inferred for the forwarded-data outputs on the
// "no forwarding" path.
module Fowarding (
    input [0:0] rf_we_mem,
    input [0:0] rf_we_wb,
    input [4:0] rf_wa_mem,
    input [4:0] rf_wa_wb,
    input [31:0] rf_wd_mem,
    input [31:0] rf_wd_wb,
    input [4:0] rf_ra0_ex,
    input [4:0] rf_ra1_ex,
    output reg [0:0] rf_rd0_fe,
    output reg [0:0] rf_rd1_fe,
    output reg [31:0] rf_rd0_fd,
    output reg [31:0] rf_rd1_fd
);
    // Simulation start-up values, before the combinational block first runs.
    initial begin
        rf_rd0_fe = 0;
        rf_rd1_fe = 0;
    end

    always @(*) begin
        // Defaults: no forwarding.
        rf_rd0_fe = 1'b0;
        rf_rd0_fd = 32'b0;
        rf_rd1_fe = 1'b0;
        rf_rd1_fd = 32'b0;

        // Operand 0: prefer the MEM-stage result over the WB-stage one.
        if (rf_we_mem && rf_wa_mem != 0 && rf_wa_mem == rf_ra0_ex) begin
            rf_rd0_fe = 1'b1;
            rf_rd0_fd = rf_wd_mem;
        end
        else if (rf_we_wb && rf_wa_wb != 0 && rf_wa_wb == rf_ra0_ex) begin
            rf_rd0_fe = 1'b1;
            rf_rd0_fd = rf_wd_wb;
        end

        // Operand 1: same rule.
        if (rf_we_mem && rf_wa_mem != 0 && rf_wa_mem == rf_ra1_ex) begin
            rf_rd1_fe = 1'b1;
            rf_rd1_fd = rf_wd_mem;
        end
        else if (rf_we_wb && rf_wa_wb != 0 && rf_wa_wb == rf_ra1_ex) begin
            rf_rd1_fe = 1'b1;
            rf_rd1_fd = rf_wd_wb;
        end
    end
endmodule
这里注意前递后会送入一个信号rf_rd_fe,这个用来选择ALU以及BRANCH的输入数据是原始数据还是前递数据的,所以我们需要在CPU中多添加两个MUX来选择数据。
为了前递模块正常工作,我们要更改寄存器,让它写优先,这样就能读出正在写的数据了:
// 32 x 32-bit register file with two read ports, one write port and one
// debug read port.
//
// Writes happen on the rising clock edge when rf_we is 1 and the destination
// is not register 0, so register 0 keeps the value 0 it is initialised with.
// The two main read ports are write-first: when the address being written
// this cycle equals the read address, the read returns rf_wd directly. The
// debug port reads the stored value without that bypass.
module REG_FILE (
    input [ 0 : 0] clk,
    input [ 4 : 0] rf_ra0,
    input [ 4 : 0] rf_ra1,
    input [ 4 : 0] rf_wa,
    input [ 0 : 0] rf_we,
    input [31 : 0] rf_wd,
    input [4 : 0] debug_reg_ra,
    output [31:0] debug_reg_rd,
    output [31 : 0] rf_rd0,
    output [31 : 0] rf_rd1
);
    reg [31 : 0] reg_file [0 : 31];

    // Clear every register at start-up.
    integer idx;
    initial begin
        for (idx = 0; idx < 32; idx = idx + 1)
            reg_file[idx] = 0;
    end

    // A write is effective only when enabled and not aimed at register 0.
    wire wr_valid = rf_we && rf_wa != 0;

    always @(posedge clk) begin
        if (wr_valid)
            reg_file[rf_wa] <= rf_wd;
    end

    // Write-first bypass for the two main read ports.
    wire hit0 = wr_valid && rf_wa == rf_ra0;
    wire hit1 = wr_valid && rf_wa == rf_ra1;

    assign rf_rd0       = hit0 ? rf_wd : reg_file[rf_ra0];
    assign rf_rd1       = hit1 ? rf_wd : reg_file[rf_ra1];
    assign debug_reg_rd = reg_file[debug_reg_ra];
endmodule
段间寄存器控制模块
还有一种数据冒险,如果我们上一条指令是load型指令,而下一条假设为add型,那么我们就无法直接前递,因为这个要等mem阶段完成才行,而mem阶段的延迟又太高,这就导致了EX段的延迟也增加了,这是我们不能接受的,所以我们采用插入气泡的方式,即模拟两个指令间有一个nop操作,来实现mem取数据的同时也不增加ex的延迟,判断方法如下:EX 段的指令为读取内存的指令;EX 段指令写入寄存器地址非零;ID 段某读寄存器地址等于 EX 段的写地址。
同时还有一种叫做控制冒险的东西,这个是在进行跳转时,由于跳转是在EX段才确定的,此时已经进入IF段与ID段的指令如果要跳转就不能再使用了,故我们需要进行清空操作来消除这两条已经进入流水线的指令,具体判断及信号实现方法如下所示:
// Pipeline-register control: generates stall and flush signals.
//
// Two cases are handled, in priority order:
//   1. Load-use hazard: the EX-stage instruction writes a non-zero register
//      with rf_wd_sel_ex == 2'b10 (write-back from data memory) and that
//      register is read by the ID-stage instruction. The PC and IF/ID are
//      stalled and ID/EX is flushed, inserting one bubble.
//   2. Redirected PC: npc_sel_ex is 2'b01 or 2'b10. IF/ID and ID/EX are both
//      flushed, discarding the two instructions fetched after the jump.
// Otherwise all four outputs are 0.
//
// Every output is given a default at the top of the combinational block so
// that each branch fully defines all outputs and no latch is inferred.
module SegCtrl (
    input [0:0] rf_we_ex,
    input [1:0] rf_wd_sel_ex,
    input [4:0] rf_wa_ex,
    input [4:0] rf_ra0_id,
    input [4:0] rf_ra1_id,
    input [1:0] npc_sel_ex,
    output reg [0:0] stall_pc,
    output reg [0:0] stall_if_id,
    output reg [0:0] flush_if_id,
    output reg [0:0] flush_id_ex
);
    // Simulation start-up values, before the combinational block first runs.
    initial begin
        stall_pc = 0;
        stall_if_id = 0;
        flush_if_id = 0;
        flush_id_ex = 0;
    end

    always @(*) begin
        // Defaults: pipeline flows normally.
        stall_pc    = 1'b0;
        stall_if_id = 1'b0;
        flush_if_id = 1'b0;
        flush_id_ex = 1'b0;

        if (rf_wd_sel_ex == 2'b10 && rf_we_ex && rf_wa_ex != 0 &&
            (rf_wa_ex == rf_ra0_id || rf_wa_ex == rf_ra1_id)) begin
            // Load-use hazard: hold PC and IF/ID, bubble into EX.
            stall_pc    = 1'b1;
            stall_if_id = 1'b1;
            flush_id_ex = 1'b1;
        end
        else if (npc_sel_ex == 2'b01 || npc_sel_ex == 2'b10) begin
            // PC redirected in EX: squash the two younger instructions.
            flush_if_id = 1'b1;
            flush_id_ex = 1'b1;
        end
    end
endmodule
这里可以看出,只flush一个段间寄存器实现了插入一个气泡的功能,而flush两个实现了插入两个气泡的功能。
最后,我们也是要把他们连在一起,这个的接法有点复杂了,如下所示:
// Five-stage pipelined RISC-V CPU top level (IF / ID / EX / MEM / WB).
//
// Instantiates the four pipeline registers, the forwarding unit, the
// stall/flush controller (SegCtrl) and the datapath modules, and registers
// the commit_* debug outputs from the WB stage once per enabled clock cycle.
//
// Ports:
//   clk, rst, global_en : clock, synchronous reset, and global enable (the
//                         PC, the pipeline registers and the commit registers
//                         only update when it is 1).
//   commit_*            : registered snapshot of the instruction leaving WB.
//   debug_reg_ra/rd     : extra read port into the register file.
//
// NOTE(review): imem_raddr, imem_rdata, dmem_rdata, dmem_we, dmem_addr and
// dmem_wdata are declared but neither read nor driven in this module; the
// memories are instantiated internally (my_im / my_dm). Confirm whether the
// surrounding framework expects these ports to be connected.
module CPU (
    input  [ 0 : 0] clk,
    input  [ 0 : 0] rst,
    input  [ 0 : 0] global_en,
    /* ------------------------------ Memory (inst) ----------------------------- */
    output [31 : 0] imem_raddr,
    input  [31 : 0] imem_rdata,
    /* ------------------------------ Memory (data) ----------------------------- */
    input  [31 : 0] dmem_rdata,
    output [ 0 : 0] dmem_we,
    output [31 : 0] dmem_addr,
    output [31 : 0] dmem_wdata,
    /* ---------------------------------- Debug --------------------------------- */
    output [ 0 : 0] commit,
    output [31 : 0] commit_pc,
    output [31 : 0] commit_inst,
    output [ 0 : 0] commit_halt,
    output [ 0 : 0] commit_reg_we,
    output [ 4 : 0] commit_reg_wa,
    output [31 : 0] commit_reg_wd,
    output [ 0 : 0] commit_dmem_we,
    output [31 : 0] commit_dmem_wa,
    output [31 : 0] commit_dmem_wd,
    input  [ 4 : 0] debug_reg_ra,
    output [31 : 0] debug_reg_rd
);
    /* ------------------------------- IF stage --------------------------------- */
    wire [31:0] cur_npc, cur_pc, pc_add4;
    wire [31:0] cur_inst;
    // Declared before its first use in the IF_ID port list: every fetched
    // instruction is marked as a valid commit candidate.
    wire [ 0 : 0] commit_if;
    assign commit_if = 1'H1;

    /* ------------------------------- ID stage --------------------------------- */
    wire [31:0] pcadd4_id, pc_id, inst_id;
    wire [31:0] imm_id, rf_rd0_id, rf_rd1_id;
    wire [4:0]  cur_op;
    wire [4:0]  rf_ra0, rf_ra1, rf_wa_id;
    wire [0:0]  rf_we;
    wire [0:0]  alu_src0_sel, alu_src1_sel;
    wire [1:0]  rf_wd_sel;
    wire [3:0]  dmem_access, br_type;
    wire [0:0]  commit_id;

    /* ------------------------------- EX stage --------------------------------- */
    wire [31:0] pcadd4_ex, pc_ex, inst_ex, imm_ex;
    wire [31:0] rf_rd0_raw_ex, rf_rd1_raw_ex;   // register values as read in ID
    wire [31:0] rf_rd0_fd, rf_rd1_fd;           // forwarded values
    wire [31:0] rf_rd0_ex, rf_rd1_ex;           // operands after forwarding
    wire [0:0]  rf_rd0_fe, rf_rd1_fe;
    wire [31:0] res0, res1, alu_res_ex;
    wire [4:0]  rf_ra0_ex, rf_ra1_ex, rf_wa_ex, alu_op_ex;
    wire [0:0]  rf_we_ex;
    wire [0:0]  alu_src0_sel_ex, alu_src1_sel_ex;
    wire [1:0]  rf_wd_sel_ex, npc_sel;
    wire [3:0]  dmem_access_ex, br_type_ex;
    wire [0:0]  commit_ex;

    /* ------------------------------- MEM stage -------------------------------- */
    wire [31:0] pcadd4_mem, pc_mem, inst_mem;
    wire [31:0] alu_res_mem, rf_rd1_mem;
    wire [31:0] dmem_rd_in, dmem_rd_out_mem, dmem_wd_out;
    wire [4:0]  rf_wa_mem;
    wire [0:0]  rf_we_mem, dmem_en;
    wire [1:0]  rf_wd_sel_mem;
    wire [3:0]  dmem_access_mem;
    wire [0:0]  commit_mem;

    /* ------------------------------- WB stage --------------------------------- */
    wire [31:0] pcadd4_wb, pc_wb, inst_wb;
    wire [31:0] alu_res_wb, dmem_rd_out_wb, wd_res;
    wire [4:0]  rf_wa_wb;
    wire [0:0]  rf_we_wb;
    wire [1:0]  rf_wd_sel_wb;
    wire [0:0]  commit_wb;

    /* ----------------------------- Hazard control ----------------------------- */
    wire [0:0]  stall_pc, stall_if_id, flush_if_id, flush_id_ex;

    // Pipeline registers
    IF_ID cpu_if_id (
        .clk       (clk),
        .en        (global_en),
        .rst       (rst),
        .pcadd4_if (pc_add4),
        .pc_if     (cur_pc),
        .inst_if   (cur_inst),
        .stall     (stall_if_id),
        .flush     (flush_if_id),
        .commit_if (commit_if),
        .pcadd4_id (pcadd4_id),
        .pc_id     (pc_id),
        .inst_id   (inst_id),
        .commit_id (commit_id)
    );

    ID_EX cpu_id_ex (
        .clk             (clk),
        .en              (global_en),
        .rst             (rst),
        .pcadd4_id       (pcadd4_id),
        .pc_id           (pc_id),
        .inst_id         (inst_id),
        .rf_rd0_id       (rf_rd0_id),
        .rf_rd1_id       (rf_rd1_id),
        .rf_ra0_id       (rf_ra0),
        .rf_ra1_id       (rf_ra1),
        .imm_id          (imm_id),
        .rf_wa_id        (rf_wa_id),
        .rf_we_id        (rf_we),
        .stall           (1'b0),
        .flush           (flush_id_ex),
        .commit_id       (commit_id),
        .alu_op_id       (cur_op),
        .dmem_access_id  (dmem_access),
        .br_type_id      (br_type),
        .rf_wd_sel_id    (rf_wd_sel),
        .alu_src0_sel_id (alu_src0_sel),
        .alu_src1_sel_id (alu_src1_sel),
        .pcadd4_ex       (pcadd4_ex),
        .pc_ex           (pc_ex),
        .inst_ex         (inst_ex),
        .rf_rd0_ex       (rf_rd0_raw_ex),
        .rf_rd1_ex       (rf_rd1_raw_ex),
        .rf_ra0_ex       (rf_ra0_ex),
        .rf_ra1_ex       (rf_ra1_ex),
        .imm_ex          (imm_ex),
        .rf_wa_ex        (rf_wa_ex),
        .commit_ex       (commit_ex),
        .rf_we_ex        (rf_we_ex),
        .alu_op_ex       (alu_op_ex),
        .dmem_access_ex  (dmem_access_ex),
        .br_type_ex      (br_type_ex),
        .rf_wd_sel_ex    (rf_wd_sel_ex),
        .alu_src0_sel_ex (alu_src0_sel_ex),
        .alu_src1_sel_ex (alu_src1_sel_ex)
    );

    EX_MEM cpu_ex_mem (
        .clk             (clk),
        .en              (global_en),
        .rst             (rst),
        .pcadd4_ex       (pcadd4_ex),
        .pc_ex           (pc_ex),
        .inst_ex         (inst_ex),
        .alu_res_ex      (alu_res_ex),
        .rf_rd1_ex       (rf_rd1_ex),
        .rf_wa_ex        (rf_wa_ex),
        .rf_we_ex        (rf_we_ex),
        .rf_wd_sel_ex    (rf_wd_sel_ex),
        .dmem_access_ex  (dmem_access_ex),
        .stall           (1'b0),
        .flush           (1'b0),
        .commit_ex       (commit_ex),
        .pcadd4_mem      (pcadd4_mem),
        .pc_mem          (pc_mem),
        .inst_mem        (inst_mem),
        .alu_res_mem     (alu_res_mem),
        .rf_rd1_mem      (rf_rd1_mem),
        .rf_wa_mem       (rf_wa_mem),
        .commit_mem      (commit_mem),
        .rf_we_mem       (rf_we_mem),
        .rf_wd_sel_mem   (rf_wd_sel_mem),
        .dmem_access_mem (dmem_access_mem)
    );

    MEM_WB cpu_mem_wb (
        .clk             (clk),
        .en              (global_en),
        .rst             (rst),
        .pcadd4_mem      (pcadd4_mem),
        .pc_mem          (pc_mem),
        .inst_mem        (inst_mem),
        .alu_res_mem     (alu_res_mem),
        .dmem_rd_out_mem (dmem_rd_out_mem),
        .rf_wa_mem       (rf_wa_mem),
        .rf_we_mem       (rf_we_mem),
        .rf_wd_sel_mem   (rf_wd_sel_mem),
        .stall           (1'b0),
        .flush           (1'b0),
        .commit_mem      (commit_mem),
        .pcadd4_wb       (pcadd4_wb),
        .pc_wb           (pc_wb),
        .inst_wb         (inst_wb),
        .alu_res_wb      (alu_res_wb),
        .dmem_rd_out_wb  (dmem_rd_out_wb),
        .rf_wa_wb        (rf_wa_wb),
        .commit_wb       (commit_wb),
        .rf_we_wb        (rf_we_wb),
        .rf_wd_sel_wb    (rf_wd_sel_wb)
    );

    // Forwarding into EX. The MEM-stage candidate is the ALU result; a load
    // still in MEM is handled by the load-use stall in SegCtrl instead.
    Fowarding my_foward (
        .rf_we_mem (rf_we_mem),
        .rf_we_wb  (rf_we_wb),
        .rf_wa_mem (rf_wa_mem),
        .rf_wa_wb  (rf_wa_wb),
        .rf_wd_mem (alu_res_mem),
        .rf_wd_wb  (wd_res),
        .rf_ra0_ex (rf_ra0_ex),
        .rf_ra1_ex (rf_ra1_ex),
        .rf_rd0_fe (rf_rd0_fe),
        .rf_rd1_fe (rf_rd1_fe),
        .rf_rd0_fd (rf_rd0_fd),
        .rf_rd1_fd (rf_rd1_fd)
    );

    // Stall / flush control. The ID-stage read addresses come straight from
    // the decoder (rf_ra0 / rf_ra1); they were previously tied to wires that
    // nothing drove, so the load-use comparison could never match.
    SegCtrl my_seg (
        .rf_we_ex     (rf_we_ex),
        .rf_wd_sel_ex (rf_wd_sel_ex),
        .rf_wa_ex     (rf_wa_ex),
        .rf_ra0_id    (rf_ra0),
        .rf_ra1_id    (rf_ra1),
        .npc_sel_ex   (npc_sel),
        .stall_pc     (stall_pc),
        .stall_if_id  (stall_if_id),
        .flush_if_id  (flush_if_id),
        .flush_id_ex  (flush_id_ex)
    );

    PC my_pc (
        .clk      (clk),
        .rst      (rst),
        .en       (global_en),
        .stall_pc (stall_pc),
        .npc      (cur_npc),
        .pc       (cur_pc)
    );

    pc_adder my_adder (
        .pc  (cur_pc),
        .npc (pc_add4)
    );

    npc_mux my_npc_mux (
        .pc_add4   (pc_add4),
        .pc_offset (alu_res_ex),
        // Clear bit 0 of the computed jump target. The original "& -1" masked
        // with all ones and therefore did nothing.
        .pc_j      (alu_res_ex & ~32'h1),
        .npc_sel   (npc_sel),
        .npc       (cur_npc)
    );

    // Instruction memory is word-addressed; the code segment starts at 0x00400000.
    INST_MEM my_im (
        .clk (clk),
        .a   ((cur_pc - 32'h00400000) / 4),
        .d   (0),
        .we  (0),
        .spo (cur_inst)
    );

    // Data memory is word-addressed; the data segment starts at 0x80000000.
    DATA_MEM my_dm (
        .clk (clk),
        .a   ((alu_res_mem - 32'h80000000) / 4),
        .d   (dmem_wd_out),
        .we  (dmem_en),
        .spo (dmem_rd_in)
    );

    DECODER my_decoder (
        .inst         (inst_id),
        .alu_op       (cur_op),
        .dmem_access  (dmem_access),
        .imm          (imm_id),
        .rf_ra0       (rf_ra0),
        .rf_ra1       (rf_ra1),
        .rf_wa        (rf_wa_id),
        .rf_we        (rf_we),
        .rf_wd_sel    (rf_wd_sel),
        .alu_src0_sel (alu_src0_sel),
        .alu_src1_sel (alu_src1_sel),
        .br_type      (br_type)
    );

    // Forwarding muxes: raw register value vs. forwarded value.
    MUX1 mux_rd0 (
        .src0 (rf_rd0_raw_ex),
        .src1 (rf_rd0_fd),
        .sel  (rf_rd0_fe),
        .res  (rf_rd0_ex)
    );

    MUX1 mux_rd1 (
        .src0 (rf_rd1_raw_ex),
        .src1 (rf_rd1_fd),
        .sel  (rf_rd1_fe),
        .res  (rf_rd1_ex)
    );

    // ALU operand 0: (forwarded) rs1 data or the EX-stage PC.
    MUX1 my_mux0 (
        .src0 (rf_rd0_ex),
        .src1 (pc_ex),
        .sel  (alu_src0_sel_ex),
        .res  (res0)
    );

    // ALU operand 1: (forwarded) rs2 data or the immediate.
    MUX1 my_mux1 (
        .src0 (rf_rd1_ex),
        .src1 (imm_ex),
        .sel  (alu_src1_sel_ex),
        .res  (res1)
    );

    // Register write-back data: PC+4, ALU result, or loaded data.
    MUX2 my_mux (
        .src0 (pcadd4_wb),
        .src1 (alu_res_wb),
        .src2 (dmem_rd_out_wb),
        .src3 (0),
        .sel  (rf_wd_sel_wb),
        .res  (wd_res)
    );

    BRANCH my_branch (
        .br_type (br_type_ex),
        .br_src0 (rf_rd0_ex),
        .br_src1 (rf_rd1_ex),
        .npc_sel (npc_sel)
    );

    SLU my_slu (
        .addr        (alu_res_mem),
        .dmem_access (dmem_access_mem),
        .rd_in       (dmem_rd_in),
        .wd_in       (rf_rd1_mem),
        .rd_out      (dmem_rd_out_mem),
        .wd_out      (dmem_wd_out),
        .wd_we       (dmem_en)
    );

    ALU my_alu (
        .alu_src0 (res0),
        .alu_src1 (res1),
        .alu_op   (alu_op_ex),
        .alu_res  (alu_res_ex)
    );

    REG_FILE my_reg (
        .clk          (clk),
        .rf_ra0       (rf_ra0),
        .rf_ra1       (rf_ra1),
        .rf_wa        (rf_wa_wb),
        .rf_we        (rf_we_wb),
        .rf_wd        (wd_res),
        .debug_reg_ra (debug_reg_ra),
        .debug_reg_rd (debug_reg_rd),
        .rf_rd0       (rf_rd0_id),
        .rf_rd1       (rf_rd1_id)
    );

    /* -------------------------------------------------------------------------- */
    /*                                   Commit                                   */
    /* -------------------------------------------------------------------------- */
    // dmem_en and dmem_wd_out belong to the instruction in MEM. Delay them by
    // one stage (mirroring MEM_WB, which is never stalled or flushed here) so
    // that the commit record describes a single instruction: the one in WB.
    reg  [ 0 : 0] dmem_we_wb;
    reg  [31 : 0] dmem_wd_wb;
    always @(posedge clk) begin
        if (rst) begin
            dmem_we_wb <= 1'H0;
            dmem_wd_wb <= 32'H0;
        end
        else if (global_en) begin
            dmem_we_wb <= dmem_en;
            dmem_wd_wb <= dmem_wd_out;
        end
    end

    reg  [ 0 : 0] commit_reg;
    reg  [31 : 0] commit_pc_reg;
    reg  [31 : 0] commit_inst_reg;
    reg  [ 0 : 0] commit_halt_reg;
    reg  [ 0 : 0] commit_reg_we_reg;
    reg  [ 4 : 0] commit_reg_wa_reg;
    reg  [31 : 0] commit_reg_wd_reg;
    reg  [ 0 : 0] commit_dmem_we_reg;
    reg  [31 : 0] commit_dmem_wa_reg;
    reg  [31 : 0] commit_dmem_wd_reg;

    always @(posedge clk) begin
        if (rst) begin
            commit_reg         <= 1'H0;
            commit_pc_reg      <= 32'H0;
            commit_inst_reg    <= 32'H0;
            commit_halt_reg    <= 1'H0;
            commit_reg_we_reg  <= 1'H0;
            commit_reg_wa_reg  <= 5'H0;
            commit_reg_wd_reg  <= 32'H0;
            commit_dmem_we_reg <= 1'H0;
            commit_dmem_wa_reg <= 32'H0;
            commit_dmem_wd_reg <= 32'H0;
        end
        else if (global_en) begin
            commit_reg         <= commit_wb;
            commit_pc_reg      <= pc_wb;
            commit_inst_reg    <= inst_wb;
            commit_halt_reg    <= (inst_wb == 32'h00100073);    // ebreak
            commit_reg_we_reg  <= rf_we_wb;
            commit_reg_wa_reg  <= rf_wa_wb;
            commit_reg_wd_reg  <= wd_res;
            commit_dmem_we_reg <= dmem_we_wb;
            commit_dmem_wa_reg <= alu_res_wb;
            commit_dmem_wd_reg <= dmem_wd_wb;
        end
    end

    assign commit         = commit_reg;
    assign commit_pc      = commit_pc_reg;
    assign commit_inst    = commit_inst_reg;
    assign commit_halt    = commit_halt_reg;
    assign commit_reg_we  = commit_reg_we_reg;
    assign commit_reg_wa  = commit_reg_wa_reg;
    assign commit_reg_wd  = commit_reg_wd_reg;
    assign commit_dmem_we = commit_dmem_we_reg;
    assign commit_dmem_wa = commit_dmem_wa_reg;
    assign commit_dmem_wd = commit_dmem_wd_reg;
endmodule
这里的信号量有点乱,我在github上的版本中对其进行了整理和优化,可以参考那个版本以及数据通路来理解。
仿真结果这里就不做展示了,直接看上板运行结果(测试文件我也放在github上了):
这个运行结果是符合预期的,至此,我们就实现了简单的流水线CPU啦!