FPGA完成CNN卷积层

Sliver Wings

已于 2022-12-03 19:20:20 修改

阅读量503

点赞数 1

文章标签： fpga开发 cnn 人工智能

于 2022-12-01 22:44:27 首次发布

本文链接：https://blog.csdn.net/monkeynight/article/details/128104894

版权

1卷积层

顶层设计

	输入图片（32*32*16）和滤波器（5*5*6*16），输出为28*28*16

在这里插入图片描述

卷积部分

卷积部分无非就是数据的相乘和相加，设计好乘法器和加法器并保存得到的结果（16）。考虑到数据精度，采用半精度浮点数乘法器和半精度浮点数加法器（参见[半精度浮点数详解](https://blog.csdn.net/Miracle_cx/article/details/89463503)），定点数量化，减少存取数据和访问存储器的时间，并且不会过分减少精度。
举例一个平常的数，这次反过来，如-1.5625×10^(-1)
即-0.15625 = -0.00101（十进制转二进制）= -1.01×2^(-3)
所以符号位为1，指数为-3+15=12，所以指数位为01100，尾数位为0100000000。所以-1.5625×10^(-1)用半精度浮点数表示就为1 01100 0100000000。注：小数部分二进制每次乘以2取整。

// floatAdd: combinational half-precision (binary16) floating-point adder.
//
// Ports
//   floatA, floatB : operands laid out as {sign[15], exponent[14:10], mantissa[9:0]}
//   sum            : floatA + floatB in the same layout
//
// Behaviour and limitations
//   * The result is truncated; there are no guard/round/sticky bits.
//   * +0 and -0 are both treated as zero operands.
//   * A result whose exponent falls below zero is flushed to +0.
//   * A result whose exponent reaches 31 or more saturates to infinity
//     carrying the sign of the result.
//   * Any non-zero operand is given a hidden leading 1, so subnormal
//     inputs are not decoded specially, and NaN inputs are not propagated.
module floatAdd (
	input 	wire [15:0] floatA,
	input 	wire [15:0] floatB,
	output	reg	 [15:0] sum
);

reg sign;                                   // sign of the result
reg signed [6:0] exponent;                  // result exponent; 7 bits so that both
                                            // underflow (<0) and overflow (>=31) are visible
reg [9:0] mantissa;                         // result mantissa (hidden bit removed)
reg [4:0] exponentA, exponentB;             // biased exponents of the operands
reg [10:0] fractionA, fractionB, fraction;  // significands including the hidden 1
reg [7:0] shiftAmount;                      // alignment shift for the smaller operand
reg cout;                                   // carry (add) or borrow (subtract)
integer k;                                  // normalisation loop index

always @ (*)
begin
	// Defaults for every internal variable so that no path leaves one
	// unassigned (which would infer a latch).
	sign        = 1'b0;
	mantissa    = 10'd0;
	fraction    = 11'd0;
	shiftAmount = 8'd0;
	cout        = 1'b0;

	exponentA = floatA[14:10];
	exponentB = floatB[14:10];
	fractionA = {1'b1, floatA[9:0]};
	fractionB = {1'b1, floatB[9:0]};
	exponent  = {2'b00, exponentA};

	if (floatA[14:0] == 15'd0)
	begin
		// A is +0 or -0: the sum is B
		sum = floatB;
	end
	else if (floatB[14:0] == 15'd0)
	begin
		// B is +0 or -0: the sum is A
		sum = floatA;
	end
	else if (floatA[14:0] == floatB[14:0] && floatA[15] != floatB[15])
	begin
		// Same magnitude, opposite sign: exact cancellation
		sum = 16'd0;
	end
	else
	begin
		// Align the operand with the smaller exponent to the larger one
		if (exponentB > exponentA)
		begin
			shiftAmount = exponentB - exponentA;
			fractionA   = fractionA >> shiftAmount;
			exponent    = {2'b00, exponentB};
		end
		else if (exponentA > exponentB)
		begin
			shiftAmount = exponentA - exponentB;
			fractionB   = fractionB >> shiftAmount;
		end

		if (floatA[15] == floatB[15])
		begin
			// Same sign: add magnitudes; a carry moves the binary point
			{cout, fraction} = fractionA + fractionB;
			if (cout == 1'b1)
			begin
				{cout, fraction} = {cout, fraction} >> 1;
				exponent = exponent + 7'sd1;
			end
			sign = floatA[15];
		end
		else
		begin
			// Opposite signs: subtract the negative operand's magnitude
			// from the positive one's; a borrow means the result is negative.
			if (floatA[15] == 1'b1)
				{cout, fraction} = fractionB - fractionA;
			else
				{cout, fraction} = fractionA - fractionB;
			sign = cout;
			if (cout == 1'b1)
				fraction = -fraction;	// two's complement gives the magnitude

			// Normalise: shift left until the hidden bit is back at bit 10.
			// At most 10 shifts are needed for an 11-bit significand.
			for (k = 0; k < 10; k = k + 1)
			begin
				if (fraction != 11'd0 && fraction[10] == 1'b0)
				begin
					fraction = fraction << 1;
					exponent = exponent - 7'sd1;
				end
			end
		end

		mantissa = fraction[9:0];
		if (exponent[6] == 1'b1)
			sum = 16'h0000;                        // underflow: flush to zero
		else if (exponent >= 7'sd31)
			sum = {sign, 5'b11111, 10'd0};         // overflow: signed infinity
		else
			sum = {sign, exponent[4:0], mantissa}; // pack the result
	end
end

endmodule

测试代码

`timescale 100 ns / 10 ps

// floatAdd_TB: self-checking testbench for floatAdd.
// Applies each operand pair for 10 time units, compares `sum` against the
// expected half-precision bit pattern, reports mismatches and then stops.
module floatAdd_TB ();

reg [15:0] floatA;
reg [15:0] floatB;
wire [15:0] sum;

integer errors; // number of mismatches seen so far

floatAdd FADD
(
	.floatA(floatA),
	.floatB(floatB),
	.sum(sum)
);

// Compare the adder output with the expected bit pattern.
task check;
	input [15:0] expected;
	begin
		if (sum !== expected) begin
			$display("FAIL: %h + %h = %h, expected %h", floatA, floatB, sum, expected);
			errors = errors + 1;
		end
	end
endtask

initial begin
	errors = 0;

	// 0.3 + 0.2 -> 0.5
	floatA = 16'h34CD;
	floatB = 16'h3266;
	#10
	check(16'h3800);

	// 0.3 + 0 -> 0.3
	floatA = 16'h34CD;
	floatB = 16'h0000;
	#10
	check(16'h34CD);

	if (errors == 0)
		$display("floatAdd_TB: all checks passed");
	else
		$display("floatAdd_TB: %0d check(s) failed", errors);
	$stop;
end

endmodule

在这里插入图片描述
这里选择0.3+0.2，与0.3+0，对应二进制可以运算，结果无误

// floatMult: combinational half-precision (binary16) floating-point multiplier.
//
// Ports
//   floatA, floatB : operands laid out as {sign[15], exponent[14:10], mantissa[9:0]}
//   product        : floatA * floatB in the same layout
//
// Behaviour and limitations
//   * The result is truncated; there is no rounding.
//   * If either operand is +0 or -0 the product is +0.
//   * A result whose exponent falls below zero is flushed to +0.
//   * A result whose exponent reaches 31 or more saturates to infinity
//     carrying the sign of the product.
//   * Any non-zero operand is given a hidden leading 1, so subnormal
//     inputs are not decoded specially, and NaN inputs are not propagated.
//
// The module is named floatMult because that is the name every
// instantiation in this design uses.
module floatMult
(
	input wire [15:0] floatA,
	input wire [15:0] floatB,
	output reg [15:0] product
);

reg sign;                         // sign of the product
reg signed [6:0] exponent;        // result exponent; 7 bits so that both
                                  // underflow (<0) and overflow (>=31) are visible
reg [9:0] mantissa;               // result mantissa (hidden bit removed)
reg [10:0] fractionA, fractionB;  // significands including the hidden 1
reg [21:0] fraction;              // full 22-bit significand product

always @ (*)
begin
	sign      = floatA[15] ^ floatB[15];
	fractionA = {1'b1, floatA[9:0]};
	fractionB = {1'b1, floatB[9:0]};
	fraction  = fractionA * fractionB;

	// Sum of biased exponents with one bias removed.
	exponent = {2'b00, floatA[14:10]} + {2'b00, floatB[14:10]} - 7'd15;

	// Both significands have bit 10 set, so the product always has its
	// leading 1 at bit 21 or bit 20; no further normalisation is possible.
	if (fraction[21] == 1'b1)
	begin
		mantissa = fraction[20:11];
		exponent = exponent + 7'sd1;
	end
	else
	begin
		mantissa = fraction[19:10];
	end

	if (floatA[14:0] == 15'd0 || floatB[14:0] == 15'd0)
		product = 16'd0;                            // anything times zero
	else if (exponent[6] == 1'b1)
		product = 16'h0000;                         // underflow: flush to zero
	else if (exponent >= 7'sd31)
		product = {sign, 5'b11111, 10'd0};          // overflow: signed infinity
	else
		product = {sign, exponent[4:0], mantissa};  // pack the result
end

endmodule

测试代码

`timescale 100 ns / 10 ps

// floatMult_TB: self-checking testbench for floatMult.
// Applies each operand pair for 10 time units, compares `product` against
// the expected half-precision bit pattern, reports mismatches and then stops.
module floatMult_TB ();

reg [15:0] floatA;
reg [15:0] floatB;
wire [15:0] product;

integer errors; // number of mismatches seen so far

floatMult FM
(
	.floatA(floatA),
	.floatB(floatB),
	.product(product)
);

// Compare the multiplier output with the expected bit pattern.
task check;
	input [15:0] expected;
	begin
		if (product !== expected) begin
			$display("FAIL: %h * %h = %h, expected %h", floatA, floatB, product, expected);
			errors = errors + 1;
		end
	end
endtask

initial begin
	errors = 0;

	// 4 * 5 -> 20
	floatA = 16'b0100010000000000;
	floatB = 16'b0100010100000000;
	#10
	check(16'h4D00);

	// a very small value * 0 -> 0
	floatA = 16'b0000111011000010;
	floatB = 16'b0000000000000000;
	#10
	check(16'h0000);

	if (errors == 0)
		$display("floatMult_TB: all checks passed");
	else
		$display("floatMult_TB: %0d check(s) failed", errors);
	$stop;
end

endmodule

在这里插入图片描述
这里采用4*5，与一个很小的数与0相乘，结果无误。

将计算一次得到的结果存储下来，用于后面的卷积，这里不做演示。

`timescale 100 ns / 10 ps

// processingElement: multiply-accumulate cell.
//
// On every rising clock edge `result` becomes result + floatA * floatB,
// using the floatMult and floatAdd modules. An asynchronous active-high
// `reset` clears the accumulator to zero.
//
// Parameters
//   DATA_WIDTH : width of the operands and accumulator. floatMult and
//                floatAdd have fixed 16-bit ports, so this is expected to
//                stay at 16.
module processingElement(clk,reset,floatA,floatB,result);

parameter DATA_WIDTH = 16;

input clk, reset;
input [DATA_WIDTH-1:0] floatA, floatB;
output reg [DATA_WIDTH-1:0] result;

wire [DATA_WIDTH-1:0] multResult; // floatA * floatB
wire [DATA_WIDTH-1:0] addResult;  // multResult + current accumulator

// Named connections so the wiring does not depend on port order.
floatMult FM
(
	.floatA(floatA),
	.floatB(floatB),
	.product(multResult)
);

floatAdd FADD
(
	.floatA(multResult),
	.floatB(result),
	.sum(addResult)
);

// Non-blocking assignments: `result` is read by other clocked logic, and a
// blocking update here would make the value they sample depend on
// simulator scheduling order.
always @ (posedge clk or posedge reset) begin
	if (reset == 1'b1) begin
		result <= 0;
	end else begin
		result <= addResult;
	end
end

endmodule

最后通过循环完成一整个卷积。

`timescale 100 ns / 10 ps

// convUnit: sequential dot product of one receptive field with one filter.
//
// After `reset` is released, one image element and the matching filter
// element are presented to a single processingElement per clock cycle
// until all D*F*F pairs have been sent; after that zeros are sent so the
// accumulated value no longer changes.
//
// Parameters
//   DATA_WIDTH : width of one element
//   D          : depth of the filter
//   F          : side length of the (square) filter
// Ports
//   image, filter : D*F*F elements each, element 0 in the lowest-numbered bits
//   result        : running accumulator of the processing element
module convUnit(clk,reset,image,filter,result);

parameter DATA_WIDTH = 16;
parameter D = 1; //depth of the filter
parameter F = 5; //size of the filter

input clk, reset;
input [0:D*F*F*DATA_WIDTH-1] image, filter;
output [0:DATA_WIDTH-1] result;

reg [DATA_WIDTH-1:0] selectedInput1, selectedInput2; // operand pair fed to the PE

integer i; // index of the next element pair to send


processingElement
#(
	.DATA_WIDTH(DATA_WIDTH)
) PE
	(
		.clk(clk),
		.reset(reset),
		.floatA(selectedInput1),
		.floatB(selectedInput2),
		.result(result)
	);

// The convolution is calculated sequentially to save hardware: one
// multiply-accumulate per clock cycle over the D*F*F element pairs.
// Non-blocking assignments are used because selectedInput1/2 are sampled
// by the processing element on the same clock edge.
always @ (posedge clk, posedge reset) begin
	if (reset == 1'b1) begin // reset
		i <= 0;
		selectedInput1 <= 0;
		selectedInput2 <= 0;
	end else if (i > D*F*F-1) begin
		// All pairs have been sent; feed zeros while waiting for the
		// enclosing logic to reset this unit.
		selectedInput1 <= 0;
		selectedInput2 <= 0;
	end else begin
		// Send the next image/filter element pair to be multiplied and accumulated
		selectedInput1 <= image[DATA_WIDTH*i+:DATA_WIDTH];
		selectedInput2 <= filter[DATA_WIDTH*i+:DATA_WIDTH];
		i <= i + 1;
	end
end

endmodule

过滤器

在进行卷积之前需要将处理的数据提取出来，比如5×5的矩阵，1-25依次排列，过滤器选择2×2，就需要将1，2，6，7提取出来。

`timescale 100 ns / 10 ps

//this modules takes as inputs the image, a row number and a column number
//it fills the output array with matrices of the parts of the image to be sent to the conv units

// RFselector: receptive-field selector.
//
// For the output row `rowNumber`, gathers the F x F x D image patches that
// feed the conv units. Only half of an output row is produced at a time:
//   column == 0 : patches for output columns 0 .. (W-F+1)/2 - 1
//   column != 0 : patches for output columns (W-F+1)/2 .. 2*((W-F+1)/2) - 1
//
// Parameters
//   DATA_WIDTH : width of one element
//   D          : depth of the filter / image
//   H, W       : height and width of the image
//   F          : side length of the (square) filter
// Ports
//   image          : D*H*W elements, indexed as plane*H*W + row*W + col,
//                    element 0 in the lowest-numbered bits
//   rowNumber      : output row being computed
//   column         : zero / non-zero half selector (see above)
//   receptiveField : (W-F+1)/2 patches of D*F*F elements, one per conv unit
module RFselector(image,rowNumber, column,receptiveField);

parameter DATA_WIDTH = 16;
parameter D = 1; //Depth of the filter
parameter H = 32; //Height of the image
parameter W = 32; //Width of the image
parameter F = 5; //Size of the filter

input [0:D*H*W*DATA_WIDTH-1] image;
input [5:0] rowNumber, column;
output reg [0:(((W-F+1)/2)*D*F*F*DATA_WIDTH)-1] receptiveField; //patches to be sent to the conv units

//slot:      index of the next F-element row to write in receptiveField
//unit:      conv unit (patch) being filled
//plane:     depth index within the image
//patchRow:  row index within the patch
//firstCol:  image column of the first patch in the selected half
//srcElem:   element index in the image of the first element to copy
integer slot, unit, plane, patchRow, firstCol, srcElem;

always @ (*) begin
	// A non-zero `column` selects the second half of the output row.
	if (column == 0)
		firstCol = 0;
	else
		firstCol = (W-F+1)/2;

	slot = 0;
	// Exactly (W-F+1)/2 patches are produced in both halves, which is the
	// number receptiveField can hold; the loop bounds are constants so the
	// loop can be unrolled.
	for (unit = 0; unit < (W-F+1)/2; unit = unit + 1) begin
		for (plane = 0; plane < D; plane = plane + 1) begin
			for (patchRow = 0; patchRow < F; patchRow = patchRow + 1) begin
				// Copy F consecutive elements of one image row into the patch
				srcElem = rowNumber*W + (firstCol + unit) + plane*H*W + patchRow*W;
				receptiveField[slot*F*DATA_WIDTH+:F*DATA_WIDTH] = image[srcElem*DATA_WIDTH+:F*DATA_WIDTH];
				slot = slot + 1;
			end
		end
	end
end

endmodule

接下来就需要选择过滤器的移动来完成一整个卷积层。

`timescale 100 ns / 10 ps

// convLayerSingle: convolution of one image with one filter.
//
// (W-F+1)/2 conv units work in parallel on half an output row at a time.
// A small controller steps through the two halves of every output row:
// it releases the conv units from reset, waits D*F*F+2 clock cycles for
// them to finish, then advances to the next half/row and resets them again.
//
// Parameters
//   DATA_WIDTH : width of one element
//   D          : depth of the filter / image
//   H, W       : height and width of the image
//   F          : side length of the (square) filter
// Ports
//   clk, reset : clock and asynchronous active-high reset
//   image      : D*H*W input elements
//   filter     : D*F*F filter elements
//   outputConv : (H-F+1)*(W-F+1) output elements
module convLayerSingle(clk,reset,image,filter,outputConv);

parameter DATA_WIDTH = 16;
parameter D = 1; //Depth of the filter
parameter H = 32; //Height of the image
parameter W = 32; //Width of the image
parameter F = 5; //Size of the filter

input clk, reset;
input [0:D*H*W*DATA_WIDTH-1] image;
input [0:D*F*F*DATA_WIDTH-1] filter;
output reg [0:(H-F+1)*(W-F+1)*DATA_WIDTH-1] outputConv; // output of the module

wire [0:((W-F+1)/2)*DATA_WIDTH-1] outputConvUnits; // outputs of all conv units, concatenated

reg internalReset; // reset shared by all conv units
wire [0:(((W-F+1)/2)*D*F*F*DATA_WIDTH)-1] receptiveField; // patches to be sent to the conv units


integer counter, outputCounter;
//counter: clock cycles elapsed since the conv units were released from reset
//outputCounter: index of the half-row slot of outputConv currently being produced

reg [5:0] rowNumber, column;
//rowNumber: output row being calculated by the conv units
//column: zero for the first half of the output row, non-zero for the second half

RFselector
#(
	.DATA_WIDTH(DATA_WIDTH),
	.D(D),
	.H(H),
	.W(W),
	.F(F)
) RF
(
	.image(image),
	.rowNumber(rowNumber),
	.column(column),
	.receptiveField(receptiveField)
);

genvar n;

// One conv unit per pixel of half an output row. The count is derived
// from W so that it agrees with the widths of receptiveField and
// outputConvUnits.
generate
	for (n = 0; n < (W-F+1)/2; n = n + 1)
	begin: convLayerSingle
		convUnit
		#(
			.DATA_WIDTH(DATA_WIDTH),
			.D(D),
			.F(F)
		) CU
		(
			.clk(clk),
			.reset(internalReset),
			.image(receptiveField[n*D*F*F*DATA_WIDTH+:D*F*F*DATA_WIDTH]),
			.filter(filter),
			.result(outputConvUnits[n*DATA_WIDTH+:DATA_WIDTH])
		);
	end
endgenerate

// Controller. Non-blocking assignments are used because internalReset,
// rowNumber and column are consumed by other processes on the same clock edge.
always @ (posedge clk or posedge reset) begin
	if (reset == 1'b1) begin
		internalReset <= 1'b1;
		rowNumber <= 0;
		column <= 0;
		counter <= 0;
		outputCounter <= 0;
	end else if (rowNumber < H-F+1) begin
		if (counter == D*F*F+2) begin // the conv units have finished this half row
			outputCounter <= outputCounter + 1;
			counter <= 0;
			internalReset <= 1'b1;
			if (column == 0) begin
				// move on to the second half of the same output row
				column <= (W-F+1)/2;
			end else begin
				// move on to the first half of the next output row
				rowNumber <= rowNumber + 1;
				column <= 0;
			end
		end else begin
			internalReset <= 1'b0;
			counter <= counter + 1;
		end
	end
end

// Route the conv unit outputs to the half-row slot currently being produced.
// NOTE(review): slots other than the selected one are not assigned here, so
// they hold their previous value; this relies on latch behaviour.
always @ (*) begin
	outputConv[outputCounter*((W-F+1)/2)*DATA_WIDTH+:((W-F+1)/2)*DATA_WIDTH] = outputConvUnits;
end

endmodule

Sliver Wings

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
FPGA完成CNN卷积层

所以符号位为1，指数为-3+15=12，所以指数位为01100，尾数位为0100000000。卷积部分无非就是数据的相乘和相加，设计好乘法器和加法器并保存得到的结果（16）。这里选择0.3+0.2，与0.3+0，对应二进制可以运算，结果无误。举例一个平常的数，这次反过来，如-1.5625×10^(-1)将计数一次得到的结果存储下来，用于后面的卷积，这里不做演示。这里采用4*5，与一个很小的数与0相乘，结果无误。5的矩阵，1-25的依次排列，过滤器选择2。2，就需要将1，2，6，7提取出来。
复制链接

扫一扫