tiny-gpu 是近两个月由国外开发者在 GitHub 上发布的开源项目,是 GPU 基本功能的一个最小化实现。
一.scheduler模块
此模块用状态机(core_state)控制单个计算核心(core)内各个模块的运行流程
// SCHEDULER
// > State machine that drives core_state for one compute core.
// > Sequence per instruction: FETCH -> DECODE -> REQUEST -> WAIT -> EXECUTE -> UPDATE,
//   then back to FETCH, or to DONE when decoded_ret is set.
// > Once in DONE the module stays there (and keeps done = 1) until reset.
module scheduler #(
    // Number of threads (and therefore lsu_state / next_pc entries) per block.
    // Fix: the original had a trailing comma after this last parameter, which
    // is not legal in a parameter port list.
    parameter THREADS_PER_BLOCK = 4
) (
    input wire clk,
    input wire reset,
    input wire start,
    // Control Signals
    // (the two mem enables are part of the interface but are not referenced
    //  by the logic in this module)
    input reg decoded_mem_read_enable,
    input reg decoded_mem_write_enable,
    input reg decoded_ret,
    // Memory Access State
    input reg [2:0] fetcher_state,
    input reg [1:0] lsu_state [THREADS_PER_BLOCK-1:0],
    // Current & Next PC
    output reg [7:0] current_pc,
    input reg [7:0] next_pc [THREADS_PER_BLOCK-1:0],
    // Execution State
    output reg [2:0] core_state,
    output reg done
);
    // core_state encoding
    localparam IDLE    = 3'b000, // Waiting to start
               FETCH   = 3'b001, // Fetch instructions from program memory
               DECODE  = 3'b010, // Decode instructions into control signals
               REQUEST = 3'b011, // Request data from registers or memory
               WAIT    = 3'b100, // Wait for response from memory if necessary
               EXECUTE = 3'b101, // Execute ALU and PC calculations
               UPDATE  = 3'b110, // Update registers, NZP, and PC
               DONE    = 3'b111; // Done executing this block

    // Encodings of the status values reported by the sub-modules
    localparam FETCHER_FETCHED = 3'b010; // fetcher_state: instruction is available
    localparam LSU_REQUESTING  = 2'b01,  // lsu_state: request being issued
               LSU_WAITING     = 2'b10;  // lsu_state: waiting for memory response

    // Scratch flag used only inside the WAIT state below. It is written with
    // blocking assignments and cleared on every evaluation.
    // Fix: the original declared it as `reg any_lsu_waiting = 1'b0;` inside the
    // procedural block. A static variable's declaration initializer runs once,
    // not on each pass, so after the flag was set it could stay set and keep
    // the core in WAIT.
    reg any_lsu_waiting;

    always @(posedge clk) begin
        if (reset) begin
            current_pc <= 0;
            core_state <= IDLE;
            done <= 0;
        end else begin
            case (core_state)
                IDLE: begin
                    // Here after reset; leave only when start is asserted
                    if (start) begin
                        core_state <= FETCH;
                    end
                end
                FETCH: begin
                    // Move on once the fetcher reports FETCHED
                    if (fetcher_state == FETCHER_FETCHED) begin
                        core_state <= DECODE;
                    end
                end
                DECODE: begin
                    // Decode takes one cycle
                    core_state <= REQUEST;
                end
                REQUEST: begin
                    // Request takes one cycle
                    core_state <= WAIT;
                end
                WAIT: begin
                    // Stay here while any LSU is still REQUESTING or WAITING
                    any_lsu_waiting = 1'b0;
                    for (int i = 0; i < THREADS_PER_BLOCK; i++) begin
                        if (lsu_state[i] == LSU_REQUESTING || lsu_state[i] == LSU_WAITING) begin
                            any_lsu_waiting = 1'b1;
                        end
                    end
                    // If no LSU is waiting for a response, move on to the next stage
                    if (!any_lsu_waiting) begin
                        core_state <= EXECUTE;
                    end
                end
                EXECUTE: begin
                    // Execute takes one cycle
                    core_state <= UPDATE;
                end
                UPDATE: begin
                    if (decoded_ret) begin
                        // RET was decoded: this block is done executing
                        done <= 1;
                        core_state <= DONE;
                    end else begin
                        // TODO: Branch divergence. For now assume all next_pc converge,
                        // so the last thread's next_pc is taken for the whole block.
                        current_pc <= next_pc[THREADS_PER_BLOCK-1];
                        core_state <= FETCH;
                    end
                end
                DONE: begin
                    // no-op: remain here until reset
                end
            endcase
        end
    end
endmodule
二.fetcher模块
取指模块:根据当前PC(current_pc)从程序存储器中取出指令,输出给decoder模块进行译码
// FETCHER
// > Reads the instruction at current_pc from program memory.
// > Starts when core_state is FETCH, latches mem_read_data into `instruction`
//   when mem_read_ready is seen, and returns to IDLE when core_state is DECODE.
module fetcher #(
    parameter PROGRAM_MEM_ADDR_BITS = 8,
    parameter PROGRAM_MEM_DATA_BITS = 16
) (
    input wire clk,
    input wire reset,
    // Execution State
    input reg [2:0] core_state,
    input reg [7:0] current_pc,
    // Program Memory
    output reg mem_read_valid,
    output reg [PROGRAM_MEM_ADDR_BITS-1:0] mem_read_address,
    input reg mem_read_ready,
    input reg [PROGRAM_MEM_DATA_BITS-1:0] mem_read_data,
    // Fetcher Output
    output reg [2:0] fetcher_state,
    // Fix: the original had a trailing comma after this last port, which is a
    // syntax error in the port list.
    output reg [PROGRAM_MEM_DATA_BITS-1:0] instruction
);
    // fetcher_state encoding
    localparam IDLE     = 3'b000, // Waiting for core_state == FETCH
               FETCHING = 3'b001, // Read request issued, waiting for memory
               FETCHED  = 3'b010; // Instruction latched

    // core_state values this module reacts to
    localparam CORE_FETCH  = 3'b001,
               CORE_DECODE = 3'b010;

    always @(posedge clk) begin
        if (reset) begin
            fetcher_state <= IDLE;
            mem_read_valid <= 0;
            mem_read_address <= 0;
            instruction <= {PROGRAM_MEM_DATA_BITS{1'b0}};
        end else begin
            case (fetcher_state)
                IDLE: begin
                    // Start fetching when core_state = FETCH
                    if (core_state == CORE_FETCH) begin
                        fetcher_state <= FETCHING;
                        mem_read_valid <= 1;            // raise the read request
                        mem_read_address <= current_pc; // address to fetch from
                    end
                end
                FETCHING: begin
                    // Wait for response from program memory
                    if (mem_read_ready) begin
                        fetcher_state <= FETCHED;
                        instruction <= mem_read_data; // store the instruction when received
                        mem_read_valid <= 0;          // drop the read request
                    end
                end
                FETCHED: begin
                    // Return to IDLE when core_state = DECODE
                    if (core_state == CORE_DECODE) begin
                        fetcher_state <= IDLE;
                    end
                end
            endcase
        end
    end
endmodule
三.decoder模块
此模块对fetcher输出的指令进行译码,拆分出寄存器地址、立即数等字段,并输出各个控制信号,用于控制后续模块(如ALU、LSU)的运行
// DECODER
// > Splits the 16-bit instruction into its fields and derives the control
//   signals from the opcode in instruction[15:12].
// > All outputs are registered. They change only on reset or in a cycle where
//   core_state == DECODE, so they hold their value until the next decode.
module decoder (
    input wire clk,
    input wire reset,
    input reg [2:0] core_state,
    input reg [15:0] instruction,                // instruction to decode
    // Instruction Signals
    output reg [3:0] decoded_rd_address,         // instruction[11:8]
    output reg [3:0] decoded_rs_address,         // instruction[7:4]
    output reg [3:0] decoded_rt_address,         // instruction[3:0]
    output reg [2:0] decoded_nzp,                // instruction[11:9]
    output reg [7:0] decoded_immediate,          // instruction[7:0]
    // Control Signals
    output reg decoded_reg_write_enable,         // Enable writing to a register
    output reg decoded_mem_read_enable,          // Enable reading from memory
    output reg decoded_mem_write_enable,         // Enable writing to memory
    output reg decoded_nzp_write_enable,         // Enable writing to NZP register
    output reg [1:0] decoded_reg_input_mux,      // Select input to register
    output reg [1:0] decoded_alu_arithmetic_mux, // Select arithmetic operation
    output reg decoded_alu_output_mux,           // Select operation in ALU
    output reg decoded_pc_mux,                   // Select source of next PC
    // Return (finished executing thread)
    output reg decoded_ret
);
    // Opcodes
    localparam NOP   = 4'b0000, // no operation
               BRnzp = 4'b0001, // branch
               CMP   = 4'b0010, // compare
               ADD   = 4'b0011, // add
               SUB   = 4'b0100, // subtract
               MUL   = 4'b0101, // multiply
               DIV   = 4'b0110, // divide
               LDR   = 4'b0111, // load from memory
               STR   = 4'b1000, // store to memory
               CONST = 4'b1001, // load constant
               RET   = 4'b1111; // return

    // core_state value in which decoding happens
    localparam CORE_DECODE = 3'b010;

    // decoded_reg_input_mux values emitted per instruction class
    localparam REG_IN_ARITH = 2'b00, // ADD / SUB / MUL / DIV
               REG_IN_LDR   = 2'b01, // LDR
               REG_IN_CONST = 2'b10; // CONST

    always @(posedge clk) begin
        if (reset) begin
            // Instruction fields
            decoded_rd_address         <= 4'b0;
            decoded_rs_address         <= 4'b0;
            decoded_rt_address         <= 4'b0;
            decoded_immediate          <= 8'b0;
            decoded_nzp                <= 3'b0;
            // Control signals
            decoded_reg_write_enable   <= 1'b0;
            decoded_mem_read_enable    <= 1'b0;
            decoded_mem_write_enable   <= 1'b0;
            decoded_nzp_write_enable   <= 1'b0;
            decoded_reg_input_mux      <= 2'b0;
            decoded_alu_arithmetic_mux <= 2'b0;
            decoded_alu_output_mux     <= 1'b0;
            decoded_pc_mux             <= 1'b0;
            decoded_ret                <= 1'b0;
        end else if (core_state == CORE_DECODE) begin
            // Field extraction happens for every instruction, whatever the opcode
            decoded_rd_address <= instruction[11:8];
            decoded_rs_address <= instruction[7:4];
            decoded_rt_address <= instruction[3:0];
            decoded_immediate  <= instruction[7:0];
            decoded_nzp        <= instruction[11:9];

            // Every control signal defaults to 0 on each decode; the opcode
            // case below overrides only the ones it needs (the later
            // nonblocking assignment wins).
            decoded_reg_write_enable   <= 1'b0;
            decoded_mem_read_enable    <= 1'b0;
            decoded_mem_write_enable   <= 1'b0;
            decoded_nzp_write_enable   <= 1'b0;
            decoded_reg_input_mux      <= 2'b0;
            decoded_alu_arithmetic_mux <= 2'b0;
            decoded_alu_output_mux     <= 1'b0;
            decoded_pc_mux             <= 1'b0;
            decoded_ret                <= 1'b0;

            case (instruction[15:12])
                NOP: begin
                    // nothing to set
                end
                BRnzp: begin
                    decoded_pc_mux <= 1'b1;
                end
                CMP: begin
                    decoded_alu_output_mux   <= 1'b1;
                    decoded_nzp_write_enable <= 1'b1;
                end
                ADD: begin
                    decoded_reg_write_enable   <= 1'b1;
                    decoded_reg_input_mux      <= REG_IN_ARITH;
                    decoded_alu_arithmetic_mux <= 2'b00;
                end
                SUB: begin
                    decoded_reg_write_enable   <= 1'b1;
                    decoded_reg_input_mux      <= REG_IN_ARITH;
                    decoded_alu_arithmetic_mux <= 2'b01;
                end
                MUL: begin
                    decoded_reg_write_enable   <= 1'b1;
                    decoded_reg_input_mux      <= REG_IN_ARITH;
                    decoded_alu_arithmetic_mux <= 2'b10;
                end
                DIV: begin
                    decoded_reg_write_enable   <= 1'b1;
                    decoded_reg_input_mux      <= REG_IN_ARITH;
                    decoded_alu_arithmetic_mux <= 2'b11;
                end
                LDR: begin
                    decoded_reg_write_enable <= 1'b1;
                    decoded_reg_input_mux    <= REG_IN_LDR;
                    decoded_mem_read_enable  <= 1'b1;
                end
                STR: begin
                    decoded_mem_write_enable <= 1'b1;
                end
                CONST: begin
                    decoded_reg_write_enable <= 1'b1;
                    decoded_reg_input_mux    <= REG_IN_CONST;
                end
                RET: begin
                    decoded_ret <= 1'b1;
                end
            endcase
        end
    end
endmodule
四.alu模块
此模块实现基本的算术运算(加、减、乘、除)以及比较功能
// ARITHMETIC-LOGIC UNIT
// > Computes on rs and rt when core_state is EXECUTE and `enable` is high.
// > decoded_alu_output_mux == 1 selects the comparison result (in alu_out[2:0]);
//   otherwise decoded_alu_arithmetic_mux selects ADD / SUB / MUL / DIV.
// > The result is registered and driven on alu_out.
module alu (
    input wire clk,
    input wire reset,
    input wire enable, // If current block has less threads then block size, some ALUs will be inactive
    input reg [2:0] core_state,
    input reg [1:0] decoded_alu_arithmetic_mux,
    input reg decoded_alu_output_mux,
    input reg [7:0] rs,
    input reg [7:0] rt,
    output wire [7:0] alu_out
);
    // decoded_alu_arithmetic_mux encoding
    localparam ADD = 2'b00,
               SUB = 2'b01,
               MUL = 2'b10,
               DIV = 2'b11;

    // core_state value in which the ALU computes
    localparam CORE_EXECUTE = 3'b101;

    reg [7:0] alu_out_reg;
    assign alu_out = alu_out_reg;

    always @(posedge clk) begin
        if (reset) begin
            alu_out_reg <= 8'b0;
        end else if (enable) begin
            // Calculate alu_out when core_state = EXECUTE
            if (core_state == CORE_EXECUTE) begin
                if (decoded_alu_output_mux == 1) begin
                    // Comparison flags in alu_out[2:0]:
                    //   [2] = rs <  rt   (N)
                    //   [1] = rs == rt   (Z)
                    //   [0] = rs >  rt   (P)
                    // Fix: the original tested (rs - rt > 0) and (rs - rt < 0).
                    // rs and rt are unsigned, so the difference is evaluated
                    // unsigned: `< 0` was never true and `> 0` was true for any
                    // rs != rt, including rs < rt. Direct comparisons avoid the
                    // wrap-around.
                    // The result is unchanged for rs < rt (3'b100) and
                    // rs == rt (3'b010); only rs > rt differs (3'b100 -> 3'b001).
                    // NOTE(review): the N,Z,P bit order is assumed from the
                    // `nzp` naming of the 3-bit branch mask; confirm against
                    // the module that consumes alu_out[2:0].
                    alu_out_reg <= {5'b0, (rs < rt), (rs == rt), (rs > rt)};
                end else begin
                    // Execute the specified arithmetic instruction; results
                    // are truncated to 8 bits by the assignment.
                    case (decoded_alu_arithmetic_mux)
                        ADD: begin
                            alu_out_reg <= rs + rt;
                        end
                        SUB: begin
                            alu_out_reg <= rs - rt;
                        end
                        MUL: begin
                            alu_out_reg <= rs * rt;
                        end
                        DIV: begin
                            // NOTE(review): rt == 0 is not guarded; in
                            // simulation a divide by zero yields X.
                            alu_out_reg <= rs / rt;
                        end
                    endcase
                end
            end
        end
    end
endmodule
五.core模块
用于例化并连接GPU core中的各个功能模块(注:下面贴出的代码实际上是scheduler模块,与第一节内容重复,并不是core模块本身的例化代码)
`default_nettype none
`timescale 1ns/1ns
// SCHEDULER
// > Manages the entire control flow of a single compute core processing 1 block
// 1. FETCH - Retrieve instruction at current program counter (PC) from program memory
// 2. DECODE - Decode the instruction into the relevant control signals
// 3. REQUEST - If we have an instruction that accesses memory, trigger the async memory requests from LSUs
// 4. WAIT - Wait for all async memory requests to resolve (if applicable)
// 5. EXECUTE - Execute computations on retrieved data from registers / memory
// 6. UPDATE - Update register values (including NZP register) and program counter
// > Each core has its own scheduler where multiple threads can be processed with
//   the same control flow at once.
// > Technically, different instructions can branch to different PCs, requiring "branch divergence." In
//   this minimal implementation, we assume no branch divergence (naive approach for simplicity)
// > Once in DONE the module stays there (and keeps done = 1) until reset.
module scheduler #(
    // Number of threads (and therefore lsu_state / next_pc entries) per block.
    // Fix: the original had a trailing comma after this last parameter, which
    // is not legal in a parameter port list.
    parameter THREADS_PER_BLOCK = 4
) (
    input wire clk,
    input wire reset,
    input wire start,
    // Control Signals
    // (the two mem enables are part of the interface but are not referenced
    //  by the logic in this module)
    input reg decoded_mem_read_enable,
    input reg decoded_mem_write_enable,
    input reg decoded_ret,
    // Memory Access State
    input reg [2:0] fetcher_state,
    input reg [1:0] lsu_state [THREADS_PER_BLOCK-1:0],
    // Current & Next PC
    output reg [7:0] current_pc,
    input reg [7:0] next_pc [THREADS_PER_BLOCK-1:0],
    // Execution State
    output reg [2:0] core_state,
    output reg done
);
    // core_state encoding
    localparam IDLE    = 3'b000, // Waiting to start
               FETCH   = 3'b001, // Fetch instructions from program memory
               DECODE  = 3'b010, // Decode instructions into control signals
               REQUEST = 3'b011, // Request data from registers or memory
               WAIT    = 3'b100, // Wait for response from memory if necessary
               EXECUTE = 3'b101, // Execute ALU and PC calculations
               UPDATE  = 3'b110, // Update registers, NZP, and PC
               DONE    = 3'b111; // Done executing this block

    // Encodings of the status values reported by the sub-modules
    localparam FETCHER_FETCHED = 3'b010; // fetcher_state: instruction is available
    localparam LSU_REQUESTING  = 2'b01,  // lsu_state: request being issued
               LSU_WAITING     = 2'b10;  // lsu_state: waiting for memory response

    // Scratch flag used only inside the WAIT state below. It is written with
    // blocking assignments and cleared on every evaluation.
    // Fix: the original declared it as `reg any_lsu_waiting = 1'b0;` inside the
    // procedural block. A static variable's declaration initializer runs once,
    // not on each pass, so after the flag was set it could stay set and keep
    // the core in WAIT.
    reg any_lsu_waiting;

    always @(posedge clk) begin
        if (reset) begin
            current_pc <= 0;
            core_state <= IDLE;
            done <= 0;
        end else begin
            case (core_state)
                IDLE: begin
                    // Here after reset (before kernel is launched, or after previous block has been processed)
                    if (start) begin
                        // Start by fetching the next instruction for this block based on PC
                        core_state <= FETCH;
                    end
                end
                FETCH: begin
                    // Move on once fetcher_state = FETCHED
                    if (fetcher_state == FETCHER_FETCHED) begin
                        core_state <= DECODE;
                    end
                end
                DECODE: begin
                    // Decode is synchronous so we move on after one cycle
                    core_state <= REQUEST;
                end
                REQUEST: begin
                    // Request is synchronous so we move on after one cycle
                    core_state <= WAIT;
                end
                WAIT: begin
                    // Wait for all LSUs to finish their request before continuing:
                    // no lsu_state may be REQUESTING or WAITING
                    any_lsu_waiting = 1'b0;
                    for (int i = 0; i < THREADS_PER_BLOCK; i++) begin
                        if (lsu_state[i] == LSU_REQUESTING || lsu_state[i] == LSU_WAITING) begin
                            any_lsu_waiting = 1'b1;
                        end
                    end
                    // If no LSU is waiting for a response, move on to the next stage
                    if (!any_lsu_waiting) begin
                        core_state <= EXECUTE;
                    end
                end
                EXECUTE: begin
                    // Execute is synchronous so we move on after one cycle
                    core_state <= UPDATE;
                end
                UPDATE: begin
                    if (decoded_ret) begin
                        // If we reach a RET instruction, this block is done executing
                        done <= 1;
                        core_state <= DONE;
                    end else begin
                        // TODO: Branch divergence. For now assume all next_pc converge,
                        // so the last thread's next_pc is taken for the whole block.
                        current_pc <= next_pc[THREADS_PER_BLOCK-1];
                        // Update is synchronous so we move on after one cycle
                        core_state <= FETCH;
                    end
                end
                DONE: begin
                    // no-op: remain here until reset
                end
            endcase
        end
    end
endmodule
六.LSU模块
此模块用于从数据存储器(data memory)中读取数据(LDR指令)以及向其中写入数据(STR指令)
`default_nettype none
`timescale 1ns/1ns
// LOAD-STORE UNIT
// > Handles asynchronous memory load and store operations and waits for response
// > Each thread in each core has its own LSU
// > LDR, STR instructions are executed here
// > State flow: IDLE -> REQUESTING -> WAITING -> DONE -> IDLE. The state only
//   advances while `enable` is high and one of the decoded memory enables is set.
module lsu (
    input wire clk,
    input wire reset,
    input wire enable, // If current block has less threads then block size, some LSUs will be inactive
    // State
    input reg [2:0] core_state,
    // Memory Control Signals
    input reg decoded_mem_read_enable,
    input reg decoded_mem_write_enable,
    // Registers
    input reg [7:0] rs,
    input reg [7:0] rt,
    // Data Memory
    output reg mem_read_valid,
    output reg [7:0] mem_read_address,
    input reg mem_read_ready,
    input reg [7:0] mem_read_data,
    output reg mem_write_valid,
    output reg [7:0] mem_write_address,
    output reg [7:0] mem_write_data,
    input reg mem_write_ready,
    // LSU Outputs
    output reg [1:0] lsu_state,
    output reg [7:0] lsu_out
);
    // lsu_state encoding
    localparam IDLE       = 2'b00,
               REQUESTING = 2'b01,
               WAITING    = 2'b10,
               DONE       = 2'b11;

    // core_state values this module reacts to
    localparam CORE_REQUEST = 3'b011,
               CORE_UPDATE  = 3'b110;

    always @(posedge clk) begin
        if (reset) begin
            lsu_state         <= IDLE;
            lsu_out           <= 8'b0;
            mem_read_valid    <= 1'b0;
            mem_read_address  <= 8'b0;
            mem_write_valid   <= 1'b0;
            mem_write_address <= 8'b0;
            mem_write_data    <= 8'b0;
        end else if (enable) begin
            // One state machine serves both LDR (decoded_mem_read_enable) and
            // STR (decoded_mem_write_enable). Within each state the read
            // actions come before the write actions.
            case (lsu_state)
                IDLE: begin
                    // Only start a memory access when core_state = REQUEST
                    if ((decoded_mem_read_enable || decoded_mem_write_enable)
                            && core_state == CORE_REQUEST) begin
                        lsu_state <= REQUESTING;
                    end
                end
                REQUESTING: begin
                    if (decoded_mem_read_enable) begin
                        // LDR: issue a read of the address held in rs
                        mem_read_valid   <= 1'b1;
                        mem_read_address <= rs;
                        lsu_state        <= WAITING;
                    end
                    if (decoded_mem_write_enable) begin
                        // STR: issue a write of rt to the address held in rs
                        mem_write_valid   <= 1'b1;
                        mem_write_address <= rs;
                        mem_write_data    <= rt;
                        lsu_state         <= WAITING;
                    end
                end
                WAITING: begin
                    if (decoded_mem_read_enable && mem_read_ready) begin
                        // Read response arrived: drop the request and latch the data
                        mem_read_valid <= 1'b0;
                        lsu_out        <= mem_read_data;
                        lsu_state      <= DONE;
                    end
                    if (decoded_mem_write_enable && mem_write_ready) begin
                        // Write acknowledged: drop the request
                        mem_write_valid <= 1'b0;
                        lsu_state       <= DONE;
                    end
                end
                DONE: begin
                    // Return to IDLE when core_state = UPDATE
                    if ((decoded_mem_read_enable || decoded_mem_write_enable)
                            && core_state == CORE_UPDATE) begin
                        lsu_state <= IDLE;
                    end
                end
            endcase
        end
    end
endmodule