【RISC-V设计-11】- RISC-V处理器设计K0A之TOP
1.简介
本文对前几篇文章中设计的各个模块进行例化,作为RISCV-K0A的设计顶层。本文着重于架构和框架的搭建,并未深入涉及具体的逻辑功能层面。
2.顶层设计
3.端口说明
序号 | 端口 | 位宽 | 方向 | 说明 |
---|---|---|---|---|
1 | core_clk | 1 | input | 内核时钟 |
2 | core_rstn | 1 | input | 内核复位信号,低有效 |
3 | bus_avalid | 1 | output | 总线的地址有效信号 |
4 | bus_aready | 1 | input | 总线的地址就绪信号 |
5 | bus_write | 1 | output | 总线的写使能信号 |
6 | bus_addr | 18 | output | 总线地址 |
7 | bus_strb | 4 | output | 总线写字节有效信号 |
8 | bus_wdata | 32 | output | 总线写数据 |
9 | bus_rvalid | 1 | input | 总线读有效信号 |
10 | bus_rready | 1 | output | 总线读就绪信号 |
11 | bus_rdata | 32 | input | 总线读数据 |
12 | irq_lines | 16 | input | 外部中断信号,高电平/脉冲触发 |
4.代码设计
// -------------------------------------------------------------------------------------------------
// Copyright 2024 Kearn Chen, kearn.chen@aliyun.com
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// -------------------------------------------------------------------------------------------------
// Description :
// 1. Core Top Wrapper
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------
// k0a_core_top
//
// Structural top level of the K0A core. This module contains no logic of its
// own: it instantiates the six sub-blocks (BMU, IDU, GPR, CSR, CIC, ALU) and
// wires them together with the nets declared below.
//
// Internal net names appear to follow a <source>2<destination>_<signal>
// convention (e.g. idu2bmu_pc_set). The actual port directions are defined by
// the sub-module port lists, which are not part of this file.
// -------------------------------------------------------------------------------------------------
module k0a_core_top (
    input  wire        core_clk   ,  // core clock
    input  wire        core_rstn  ,  // core reset, active low

    output wire        bus_avalid ,  // bus address valid
    input  wire        bus_aready ,  // bus address ready
    output wire        bus_write  ,  // bus write enable
    output wire [17:0] bus_addr   ,  // bus address
    output wire [3:0]  bus_strb   ,  // bus write byte strobes
    output wire [31:0] bus_wdata  ,  // bus write data
    input  wire        bus_rvalid ,  // bus read valid
    output wire        bus_rready ,  // bus read ready
    input  wire [31:0] bus_rdata  ,  // bus read data

    input  wire [15:0] irq_lines     // external interrupt lines, high-level / pulse triggered
);

    // ---------------------------------------------------------------------------------------------
    // IDU <-> BMU
    //
    // NOTE(review): the original declared two further nets here, idu2bmu_resp and idu2bmu_wfi,
    // that were not connected to any instance or port. They have been removed as dead
    // declarations. If the BMU/IDU actually expose such ports, they are left unconnected both
    // before and after this change and should be wired up explicitly.
    // ---------------------------------------------------------------------------------------------
    wire        idu2bmu_pc_set   ;
    wire [17:0] idu2bmu_pc_new   ;
    wire        bmu2idu_pc_ack   ;
    wire        idu2bmu_ls_req   ;
    wire        idu2bmu_ls_cmd   ;
    wire [1:0]  idu2bmu_ls_size  ;
    // NOTE(review): this address is 20 bits wide while bus_addr is 18 bits; how the two relate
    // is decided inside k0a_core_bmu and cannot be seen from this file.
    wire [19:0] idu2bmu_ls_addr  ;
    wire [31:0] idu2bmu_ls_wdata ;
    wire [31:0] bmu2idu_ls_rdata ;
    wire        bmu2idu_valid    ;
    wire [31:0] bmu2idu_instr    ;
    wire [17:0] bmu2idu_pc_cur   ;
    wire [17:0] bmu2idu_pc_nxt   ;

    // ---------------------------------------------------------------------------------------------
    // IDU <-> GPR
    // ---------------------------------------------------------------------------------------------
    wire        idu2gpr_we       ;
    wire [3:0]  idu2gpr_waddr    ;
    wire [31:0] idu2gpr_wdata    ;
    wire [3:0]  idu2gpr_raddr1   ;
    wire [3:0]  idu2gpr_raddr2   ;
    wire [31:0] gpr2idu_rdata1   ;
    wire [31:0] gpr2idu_rdata2   ;

    // ---------------------------------------------------------------------------------------------
    // IDU <-> CSR
    // ---------------------------------------------------------------------------------------------
    wire        idu2csr_we       ;
    wire [11:0] idu2csr_addr     ;
    wire [31:0] idu2csr_wdata    ;
    wire [31:0] csr2idu_rdata    ;
    wire [17:0] csr2idu_mepc     ;
    wire [17:0] csr2idu_mtvec    ;
    wire        idu2csr_mepc_set ;
    wire [17:0] idu2csr_mepc_nxt ;

    // ---------------------------------------------------------------------------------------------
    // CSR <-> CIC
    // ---------------------------------------------------------------------------------------------
    wire        csr2cic_gie      ;
    wire [15:0] csr2cic_mie      ;
    wire [15:0] csr2cic_mip      ;
    wire [15:0] cic2csr_irq      ;
    wire [4:0]  cic2csr_mcause   ;

    // ---------------------------------------------------------------------------------------------
    // IDU <-> ALU
    // ---------------------------------------------------------------------------------------------
    wire [3:0]  idu2alu_op       ;
    wire [31:0] idu2alu_rs1      ;
    wire [31:0] idu2alu_rs2      ;
    wire [31:0] alu2idu_res      ;
    wire        alu2idu_cmp      ;
    wire [19:0] idu2alu_addr1    ;
    wire [19:0] idu2alu_addr2    ;
    wire [19:0] alu2idu_addro    ;

    // ---------------------------------------------------------------------------------------------
    // IDU <-> CIC
    // ---------------------------------------------------------------------------------------------
    wire        cic2idu_int_req  ;
    wire        idu2cic_int_ack  ;
    wire        idu2cic_int_mret ;

    // ---------------------------------------------------------------------------------------------
    // BMU: the only instance connected to the external bus_* ports.
    // ---------------------------------------------------------------------------------------------
    k0a_core_bmu u_core_bmu (
        .core_clk         (core_clk         ),
        .core_rstn        (core_rstn        ),
        .bus_avalid       (bus_avalid       ),
        .bus_aready       (bus_aready       ),
        .bus_write        (bus_write        ),
        .bus_addr         (bus_addr         ),
        .bus_strb         (bus_strb         ),
        .bus_wdata        (bus_wdata        ),
        .bus_rvalid       (bus_rvalid       ),
        .bus_rready       (bus_rready       ),
        .bus_rdata        (bus_rdata        ),
        .idu2bmu_pc_set   (idu2bmu_pc_set   ),
        .idu2bmu_pc_new   (idu2bmu_pc_new   ),
        .bmu2idu_pc_ack   (bmu2idu_pc_ack   ),
        .idu2bmu_ls_req   (idu2bmu_ls_req   ),
        .idu2bmu_ls_cmd   (idu2bmu_ls_cmd   ),
        .idu2bmu_ls_size  (idu2bmu_ls_size  ),
        .idu2bmu_ls_addr  (idu2bmu_ls_addr  ),
        .idu2bmu_ls_wdata (idu2bmu_ls_wdata ),
        .bmu2idu_ls_rdata (bmu2idu_ls_rdata ),
        .bmu2idu_valid    (bmu2idu_valid    ),
        .bmu2idu_instr    (bmu2idu_instr    ),
        .bmu2idu_pc_cur   (bmu2idu_pc_cur   ),
        .bmu2idu_pc_nxt   (bmu2idu_pc_nxt   )
    );

    // ---------------------------------------------------------------------------------------------
    // IDU: connected to every other sub-block (BMU, GPR, CSR, ALU, CIC).
    // No clock or reset is connected to this instance.
    // ---------------------------------------------------------------------------------------------
    k0a_core_idu u_core_idu (
        .bmu2idu_valid    (bmu2idu_valid    ),
        .bmu2idu_instr    (bmu2idu_instr    ),
        .bmu2idu_pc_cur   (bmu2idu_pc_cur   ),
        .bmu2idu_pc_nxt   (bmu2idu_pc_nxt   ),
        .idu2bmu_pc_set   (idu2bmu_pc_set   ),
        .bmu2idu_pc_ack   (bmu2idu_pc_ack   ),
        .idu2bmu_pc_new   (idu2bmu_pc_new   ),
        .idu2bmu_ls_req   (idu2bmu_ls_req   ),
        .idu2bmu_ls_cmd   (idu2bmu_ls_cmd   ),
        .idu2bmu_ls_size  (idu2bmu_ls_size  ),
        .idu2bmu_ls_addr  (idu2bmu_ls_addr  ),
        .idu2bmu_ls_wdata (idu2bmu_ls_wdata ),
        .bmu2idu_ls_rdata (bmu2idu_ls_rdata ),
        .idu2gpr_we       (idu2gpr_we       ),
        .idu2gpr_waddr    (idu2gpr_waddr    ),
        .idu2gpr_wdata    (idu2gpr_wdata    ),
        .idu2gpr_raddr1   (idu2gpr_raddr1   ),
        .idu2gpr_raddr2   (idu2gpr_raddr2   ),
        .gpr2idu_rdata1   (gpr2idu_rdata1   ),
        .gpr2idu_rdata2   (gpr2idu_rdata2   ),
        .idu2csr_we       (idu2csr_we       ),
        .idu2csr_addr     (idu2csr_addr     ),
        .idu2csr_wdata    (idu2csr_wdata    ),
        .csr2idu_rdata    (csr2idu_rdata    ),
        .csr2idu_mepc     (csr2idu_mepc     ),
        .csr2idu_mtvec    (csr2idu_mtvec    ),
        .idu2csr_mepc_set (idu2csr_mepc_set ),
        .idu2csr_mepc_nxt (idu2csr_mepc_nxt ),
        .idu2alu_op       (idu2alu_op       ),
        .idu2alu_rs1      (idu2alu_rs1      ),
        .idu2alu_rs2      (idu2alu_rs2      ),
        .alu2idu_res      (alu2idu_res      ),
        .alu2idu_cmp      (alu2idu_cmp      ),
        .idu2alu_addr1    (idu2alu_addr1    ),
        .idu2alu_addr2    (idu2alu_addr2    ),
        .alu2idu_addro    (alu2idu_addro    ),
        .cic2idu_int_req  (cic2idu_int_req  ),
        .idu2cic_int_ack  (idu2cic_int_ack  ),
        .idu2cic_int_mret (idu2cic_int_mret )
    );

    // ---------------------------------------------------------------------------------------------
    // GPR: one write port and two read ports towards the IDU; clocked, no reset connected.
    // ---------------------------------------------------------------------------------------------
    k0a_core_gpr u_core_gpr (
        .core_clk         (core_clk         ),
        .idu2gpr_we       (idu2gpr_we       ),
        .idu2gpr_waddr    (idu2gpr_waddr    ),
        .idu2gpr_wdata    (idu2gpr_wdata    ),
        .idu2gpr_raddr1   (idu2gpr_raddr1   ),
        .gpr2idu_rdata1   (gpr2idu_rdata1   ),
        .idu2gpr_raddr2   (idu2gpr_raddr2   ),
        .gpr2idu_rdata2   (gpr2idu_rdata2   )
    );

    // ---------------------------------------------------------------------------------------------
    // CSR: connected to the IDU (access, mepc/mtvec) and to the CIC (gie/mie/mip/irq/mcause).
    // ---------------------------------------------------------------------------------------------
    k0a_core_csr u_core_csr (
        .core_clk         (core_clk         ),
        .core_rstn        (core_rstn        ),
        .idu2csr_we       (idu2csr_we       ),
        .idu2csr_addr     (idu2csr_addr     ),
        .idu2csr_wdata    (idu2csr_wdata    ),
        .csr2idu_rdata    (csr2idu_rdata    ),
        .csr2cic_gie      (csr2cic_gie      ),
        .csr2cic_mie      (csr2cic_mie      ),
        .csr2cic_mip      (csr2cic_mip      ),
        .cic2csr_irq      (cic2csr_irq      ),
        .cic2csr_mcause   (cic2csr_mcause   ),
        .csr2idu_mepc     (csr2idu_mepc     ),
        .csr2idu_mtvec    (csr2idu_mtvec    ),
        .idu2csr_mepc_set (idu2csr_mepc_set ),
        .idu2csr_mepc_nxt (idu2csr_mepc_nxt )
    );

    // ---------------------------------------------------------------------------------------------
    // CIC: the only instance connected to the external irq_lines port.
    // ---------------------------------------------------------------------------------------------
    k0a_core_cic u_core_cic (
        .core_clk         (core_clk         ),
        .core_rstn        (core_rstn        ),
        .irq_lines        (irq_lines        ),
        .csr2cic_gie      (csr2cic_gie      ),
        .csr2cic_mie      (csr2cic_mie      ),
        .csr2cic_mip      (csr2cic_mip      ),
        .cic2csr_irq      (cic2csr_irq      ),
        .cic2csr_mcause   (cic2csr_mcause   ),
        .cic2idu_int_req  (cic2idu_int_req  ),
        .idu2cic_int_ack  (idu2cic_int_ack  ),
        .idu2cic_int_mret (idu2cic_int_mret )
    );

    // ---------------------------------------------------------------------------------------------
    // ALU: connected to the IDU only; no clock or reset is connected to this instance.
    // ---------------------------------------------------------------------------------------------
    k0a_core_alu u_core_alu (
        .idu2alu_op       (idu2alu_op       ),
        .idu2alu_rs1      (idu2alu_rs1      ),
        .idu2alu_rs2      (idu2alu_rs2      ),
        .alu2idu_res      (alu2idu_res      ),
        .alu2idu_cmp      (alu2idu_cmp      ),
        .idu2alu_addr1    (idu2alu_addr1    ),
        .idu2alu_addr2    (idu2alu_addr2    ),
        .alu2idu_addro    (alu2idu_addro    )
    );

endmodule
5.逻辑综合
以某90nm的数字标准单元库综合的结果如下所示
5.1 面积
Number of ports: 205
Number of nets: 3227
Number of cells: 3054
Number of combinational cells: 2341
Number of sequential cells: 689
Number of macros/black boxes: 0
Number of buf/inv: 220
Number of references: 81
Combinational area: 9934.848258
Buf/Inv area: 477.691216
Noncombinational area: 9358.372642
Macro/Black Box area: 0.000000
Net Interconnect area: undefined (No wire load specified)
Total cell area: 19293.220900
Total area: undefined
总共的逻辑面积为19293 um^2,最小二输入与非门面积为2.8224 um^2,等效逻辑门数量为19293.2209 / 2.8224 = 6835.75门。
5.2 功耗
Internal Switching Leakage Total
Power Group Power Power Power Power ( % ) Attrs
--------------------------------------------------------------------------------------------------
io_pad 0.0000 0.0000 0.0000 0.0000 ( 0.00%)
memory 0.0000 0.0000 0.0000 0.0000 ( 0.00%)
black_box 0.0000 0.0000 0.0000 0.0000 ( 0.00%)
clock_network 0.0000 0.0000 0.0000 0.0000 ( 0.00%)
register 0.0000 0.0000 0.0000 0.0000 ( 0.00%)
sequential 5.9101e-02 1.4190e-03 1.1083e+07 7.1603e-02 ( 77.31%)
combinational 3.7519e-03 8.4492e-03 8.8165e+06 2.1018e-02 ( 22.69%)
--------------------------------------------------------------------------------------------------
Total 6.2853e-02 mW 9.8682e-03 mW 1.9900e+07 pW 9.2621e-02 mW
Global Operating Voltage = 1
Power-specific unit information :
Voltage Units = 1V
Capacitance Units = 1.000000pf
Time Units = 1ns
Dynamic Power Units = 1mW (derived from V,C,T units)
Leakage Power Units = 1pW
Cell Internal Power = 62.8527 uW (86%)
Net Switching Power = 9.8682 uW (14%)
Total Dynamic Power = 72.7210 uW (100%)
Cell Leakage Power = 19.8999 uW
5.3 时序
Startpoint: u_core_bmu_instr_rsel_reg
(rising edge-triggered flip-flop)
Endpoint: bus_addr[17]
(output port)
Path Group: (none)
Path Type: max
Point Incr Path
--------------------------------------------------------------------------
u_core_bmu_instr_rsel_reg/CK (DFFRQX2) 0.00 0.00 r
u_core_bmu_instr_rsel_reg/Q (DFFRQX2) 0.34 0.34 r
U2342/Y (INVXL) 0.14 0.48 f
U2343/Y (AOI22XL) 0.30 0.78 r
U2366/Y (INVX1) 0.21 0.99 f
U2367/Y (NAND3XL) 0.08 1.07 r
U2368/Y (NOR4XL) 0.05 1.12 f
U3319/Y (NOR2XL) 0.08 1.20 r
U3320/Y (INVX1) 0.13 1.32 f
U3321/Y (AOI222XL) 0.16 1.49 r
U2210/CO (ADDFHXL) 0.13 1.61 r
U2211/CO (ADDFHXL) 0.15 1.76 r
U2237/CO (ADDFX1) 0.15 1.91 r
U2236/CO (ADDFX1) 0.15 2.05 r
U2235/CO (ADDFX1) 0.15 2.20 r
U2234/CO (ADDFX1) 0.15 2.35 r
U2233/CO (ADDFX1) 0.15 2.50 r
U2231/CO (ADDFX1) 0.15 2.65 r
U2230/CO (ADDFX1) 0.15 2.79 r
U2228/CO (ADDFX1) 0.15 2.94 r
U2227/CO (ADDFX1) 0.15 3.09 r
U2212/CO (ADDFHXL) 0.15 3.24 r
U2226/CO (ADDFX1) 0.15 3.38 r
U2225/CO (ADDFX1) 0.15 3.53 r
U2224/CO (ADDFX1) 0.15 3.68 r
U2223/CO (ADDFX1) 0.15 3.83 r
U2221/CO (ADDFX1) 0.15 3.98 r
U2220/CO (ADDFX1) 0.14 4.12 r
U4344/Y (XOR2XL) 0.07 4.19 r
U4346/Y (XOR2XL) 0.08 4.26 f
U4423/Y (OA22XL) 0.10 4.36 f
bus_addr[17] (out) 0.00 4.36 f
data arrival time 4.36
--------------------------------------------------------------------------
(Path is unconstrained)
在未经过约束的情况下,最大延迟路径为4.36ns,对应的最大频率约为 1 / 4.36ns ≈ 229MHz。实际上通过合理的约束,应该能跑得更高。
6.对比M0+
Arm Cortex®-M0+是能效最高的Arm处理器,以下是ARM M0+的内核参数,截取自ARM官方文档。
通过对比能够明晰,RISCV-K0A 在面积方面具备显著优势。在 90nm 工艺的条件下,RISCV-K0A 的面积约为 0.0193 mm²(即上文综合得到的 19293 um²),与 ARM M0+ 最小配置面积的比值为 0.0193 / 0.0303,即 63.7%。而当 ARM M0+ 处于典型配置时,RISCV-K0A 的面积所占比例为 0.0193 / 0.0604,即 32.0%。这种对比清晰地展现了 RISCV-K0A 在面积上的优势地位。
关于功耗和速度的对比,需要布局布线(PR,Place & Route)之后的数据才能进行,目前暂不具备条件。Benchmark的跑分对比,会在之后的仿真验证环节给出。
7.总结
至此,RISCV-K0A 的设计部分已然完结,后续的文章会介绍相关的验证及内核的benchmark测试。此内核是 RISC-V 指令集的一种相对简单、简洁的实现方式,适用于一些较为简单且对面积和功耗有着极高追求的场景。该内核并非致力于追求极致性能,秉持的原则是满足基本需求即可,在面积和功耗方面有着强烈的追求。另外,由于不支持压缩指令集,致使软件生成的代码体积偏大,所需的代码存储空间增多,这在一定程度上限制了它的应用场景。在一些大规模的计算任务中,其性能可能无法满足要求;在对代码体积有严格限制的场景中,也不太适用。所以,此内核较为适用的场景是在小容量的 FPGA 上使用,通常情况下,FPGA 上的 RAM 资源相对充裕,对于软件代码的体积并非十分敏感。