卷积

最新推荐文章于 2021-10-24 20:30:15 发布

qq_26697045

最新推荐文章于 2021-10-24 20:30:15 发布

阅读量398

点赞数

分类专栏： CUDA 编程

本文链接：https://blog.csdn.net/qq_26697045/article/details/119064419

版权

CUDA 编程专栏收录该内容

4 篇文章 0 订阅

订阅专栏

欢迎访问我的博客首页。

卷积

1. 单通道卷积
2. 三维数组
3. 多通道卷积
4. 参考

1. 单通道卷积

把卷积步长设置为 1 的目的是保持特征宽高不变，假设卷积核尺寸为 (2m+1, 2n+1)，则还需要在左右各填充 m 行，上下各填充 n 行，这样卷积前后特征的宽高尺寸就相等了。
下面使用 CUDA 实现卷积：步长只能为 1，输入、输出的通道也都只能为 1；特征尺寸和卷积核尺寸可以任意指定。

1. 核函数

__global__ static void conv2d_stride1(float* output, const float* input, const float* kernel,
	const size_t h_input, const size_t w_input, const size_t h_kernel, const size_t w_kernel) {
	const size_t thread_x = blockIdx.x * blockDim.x + threadIdx.x;
	const size_t thread_y = blockIdx.y * blockDim.y + threadIdx.y;
	if (thread_x >= w_input || thread_y >= h_input)
		return;
	output[thread_y * w_input + thread_x] = 0;
	int x_start = thread_x - (w_kernel - 1) / 2, x_input;
	int y_start = thread_y - (h_kernel - 1) / 2, y_input;
	for (size_t i = 0; i < h_kernel; i++) {
		y_input = y_start + i;
		for (size_t j = 0; j < w_kernel; j++) {
			x_input = x_start + j;
			if (x_input >= 0 && x_input < w_input && y_input >= 0 && y_input < h_input)
				output[thread_y * w_input + thread_x] += kernel[i * w_kernel + j] * input[y_input * w_input + x_input];
			// else 感受野出界，卷积结果为0，不需要累加。
		}
	}
}

第 3、4 行获取线程的二维坐标。
第 5、6 行是不可缺少的。因为取整的原因，在核函数模板参数中设置的 grid 和 block 各维度长度往往大于实际需要。必须过滤掉多余线程，否则可能会有无效的 output[x2] 覆盖掉有效的 output[x1]。比如 output[3 * 5 + 0] 会覆盖 output[3 * 4 + 3]。
第 7 行 w_input 是线程在 x 维度上的有长度。
第 8、9 行：坐标为（thread_x，thread_y）的线程负责计算出输出特征图的坐标为（thread_x，thread_y）的像素值，那么它操作的卷积核的中心也就与输入特征图的坐标为（thread_x，thread_y）的像素重叠，所以它从输入特征图的（x_start，y_start）开始卷积。

2. 使用 C++ 调用核函数

#include <iostream>
#include <iomanip>
#include <stdlib.h>
#include <time.h>
#include <cmath>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
using namespace std;

void generate_mat(float* mat, size_t h, size_t w) {
	for (size_t i = 0; i < h; i++)
		for (size_t j = 0; j < w; j++)
			mat[i * w + j] = i * w + j + 1;
}

void print(const char* name, const float* mat, size_t h, size_t w) {
	for (size_t i = 0; i < h; i++) {
		for (size_t j = 0; j < w; j++)
			cout << setw(3) << mat[i * w + j] << " ";
		cout << endl;
	}
	cout << endl;
}

int main() {
	// 1.选择显卡。
	int count;
	cudaGetDeviceCount(&count);
	if (count == 0) {
		fprintf(stderr, "There is no device.\n");
		return 0;
	}
	cudaSetDevice(0);
	// 2.产生输入数据。
	size_t h_input = 3, w_input = 20, h_kernel = 3, w_kernel = 3;
	float* mat_input = (float*)malloc(sizeof(float) * h_input * w_input);
	float* mat_kernel = (float*)malloc(sizeof(float) * h_kernel * w_kernel);
	float* mat_output = (float*)malloc(sizeof(float) * h_input * w_input);
	generate_mat(mat_input, h_input, w_input);
	generate_mat(mat_kernel, h_kernel, w_kernel);
	// 3.申请显寸存放输入输出，并把数据从内存拷贝到显存。
	float* gpu_input, *gpu_kernel, *gpu_output;
	cudaMalloc((void**)&gpu_input, sizeof(float) * h_input * w_input);
	cudaMalloc((void**)&gpu_kernel, sizeof(float) * h_kernel * w_kernel);
	cudaMalloc((void**)&gpu_output, sizeof(float) * h_input * w_input);
	cudaMemcpy(gpu_input, mat_input, sizeof(float) * h_input * w_input, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_kernel, mat_kernel, sizeof(float) * h_kernel * w_kernel, cudaMemcpyHostToDevice);
	// 4.调用核函数：<<<grid维度, block维度, shared memory字节数>>>(参数...)。
	size_t thread_total = h_input * w_input;
	// 4.1 设置Block维度。
	size_t thread_num_per_block = thread_total < 1024 ? thread_total : 1024;
	size_t thread_num_sqrt_per_block = ceil(sqrt(thread_num_per_block));
	dim3 dimBlock(thread_num_sqrt_per_block, thread_num_sqrt_per_block);
	// 4.2 设置Grid维度。
	size_t w_grid = ceil(1.0 * w_input / thread_num_sqrt_per_block);
	size_t h_grid = ceil(1.0 * h_input / thread_num_sqrt_per_block);
	dim3 dimGrid(w_grid, h_grid);
	cout << "Grid:(" << dimGrid.x << ", " << dimGrid.y << ", " << dimGrid.z << "), Block:(" << dimBlock.x << ", " << dimBlock.y << ", " << dimBlock.z << ")." << endl << endl;
	// 4.3 调用核函数。
	conv2d_stride1 << <dimBlock, dimGrid, 0 >> >(gpu_output, gpu_input, gpu_kernel, h_input, w_input, h_kernel, w_kernel);
	// 5.把结果从显存复制到内存，并释放显存。
	cudaMemcpy(mat_output, gpu_output, sizeof(float) * h_input * w_input, cudaMemcpyDeviceToHost);
	cudaFree(gpu_input);
	cudaFree(gpu_kernel);
	cudaFree(gpu_output);
	// 6.处理结果，释放内存。
	print("mat_input", mat_input, h_input, w_input);
	print("mat_kernel", mat_kernel, h_kernel, w_kernel);
	print("mat_output", mat_output, h_input, w_input);
	free(mat_input);
	free(mat_kernel);
	free(mat_output);
	system("pause");
}

第 10 至 23 行：二维矩阵依然存储在一维数组中，工具函数用于生成和输出矩阵。

主函数第 2 部分指定输入特征和卷积核的尺寸。
第 4 部分确定 Grid 和 Block 维度。输出特征的尺寸是 $thread\_total = h\_input \times w\_input$ ，我们就使用 $thread\_total$ 个线程。这样就不用滑动卷积核了，每个线程负责计算输出特征图上的一个像素值。我们要让线程的维度与输出特征的维度一一对应，也就是说线程的两个维度要等于输出特征的两个维度。实际上由于对 block 取整，线程的两个维度会大于等于输出特征的两个维度。
因为线程的维度与输出特征的维度一一对应，我们把特征的维度传递给核函数，就不需要再传递线程的维度了。注意第 33 行 w_grid 和 h_grid 的前后顺序，这样存放可以让线程的坐标与特征的坐标也一一对应。线程的坐标在下面核函数的第 3、4 行。

block 的维度。block 的维度很简单，我们让它两个维度相等。如第 27 行，如果线程总数小于 1024，只需要 1 个 block 就行了；如果线程总数大于 1024，每个 block 内分配 1024 个线程。
grid 的维度。为了保证 “线程的两个维度大于等于输出特征的两个维度”，grid 的 x 维度长度等于输出特征的宽除以 block 的 x 维度长度；grid 的 y 维度长度等于输出特征的高除以 block 的 y 维度长度。如第 31、32 行。

注意：ceil 函数中如果是两个整数相除，需要先把它转化成浮点数，不然 ceil(10 / 3) = ceil(3) = 3。

3. 使用 PyCUDA 调用核函数

import cv2
import math
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void conv2d_stride1(float* output, const float* input, const float* kernel,
	const size_t h_input, const size_t w_input, const size_t h_kernel, const size_t w_kernel) {
	const size_t thread_x = blockIdx.x * blockDim.x + threadIdx.x;
	const size_t thread_y = blockIdx.y * blockDim.y + threadIdx.y;
	if (thread_x >= w_input || thread_y >= h_input)
		return;
	output[thread_y * w_input + thread_x] = 0;
	int x_start = thread_x - (w_kernel - 1) / 2, x_input;
	int y_start = thread_y - (h_kernel - 1) / 2, y_input;
	for (size_t i = 0; i < h_kernel; i++) {
		y_input = y_start + i;
		for (size_t j = 0; j < w_kernel; j++) {
			x_input = x_start + j;
			if (x_input >= 0 && x_input < w_input && y_input >= 0 && y_input < h_input)
				output[thread_y * w_input + thread_x] += kernel[i * w_kernel + j] * input[y_input * w_input + x_input];
			// else 感受野出界，卷积结果为0，不需要累加。
		}
	}
}
""")

if __name__ == '__main__':
    conv2d_stride1 = mod.get_function("conv2d_stride1")
    # 1.准备数据。
    image = cv2.imread('1.jpg')[:, :, 0].astype(np.float32)
    h_input, w_input = image.shape
    h_kernel, w_kernel = 3, 3
    kernel = np.array([i for i in range(h_kernel * w_kernel)]).astype(np.float32).reshape(h_kernel, w_kernel)
    dest = np.zeros((h_input, w_input), dtype=np.float32)
    # 2.确定grid和blick各维度的长度。
    thread_total = h_input * w_input
    thread_num = thread_total if thread_total < 1024 else 1024
    blockDim_xy = math.ceil(math.sqrt(thread_num))
    gridDim_y = math.ceil(h_input / blockDim_xy)
    gridDim_x = math.ceil(w_input / blockDim_xy)
    # 3.调用核函数
    conv2d_stride1(cuda.Out(dest), cuda.In(image), cuda.In(kernel),
                   np.uint64(h_input), np.uint64(w_input), np.uint64(h_kernel), np.uint64(w_kernel),
                   block=(blockDim_xy, blockDim_xy, 1), grid=(gridDim_x, gridDim_y))
    print(dest.shape)
    print(dest.sum())

4. 使用 PyTorch 卷积

import cv2
import torch
import numpy as np

if __name__ == '__main__':
    conv = torch.nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, stride=1, padding=1, bias=False)
    conv.weight = torch.nn.Parameter(torch.tensor([i for i in range(9)], dtype=torch.float32).reshape(1, 1, 3, 3))
    image = torch.tensor(cv2.imread('1.jpg')[:, :, 0].astype(np.float32)).reshape(1, 1, 112, 112)
    dest = conv(image)
    print(dest.shape)
    print(dest.sum())

2. 三维数组

使用一维数组存放数据，模仿三维数组。

const int h = 2, w = 3, c = 4;
const int len = h * w * c;
int a[len];
for (int i = 0; i < len; i++)
	a[i] = i;
for (int i = 0; i < h; i++)
	for (int j = 0; j < w; j++)
		for (int k = 0; k < c; k++)
			cout << a[i * w * c + j * c + k] << " ";

3. 多通道卷积

1. 使用 PyCUDA 调用核函数

import cv2
import math
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void conv2d_stride1(long long* output, const long long* input, const long long* kernel,
	const size_t h_input, const size_t w_input, const size_t h_kernel, const size_t w_kernel, const size_t c) {
	const size_t thread_x = blockIdx.x * blockDim.x + threadIdx.x;
	const size_t thread_y = blockIdx.y * blockDim.y + threadIdx.y;
	if (thread_x >= w_input || thread_y >= h_input)
		return;
	output[thread_y * w_input + thread_x] = 0;
	int x_start = thread_x - (w_kernel - 1) / 2, x_input;
	int y_start = thread_y - (h_kernel - 1) / 2, y_input;
	for (size_t i = 0; i < h_kernel; i++) {
		y_input = y_start + i;
		for (size_t j = 0; j < w_kernel; j++) {
			x_input = x_start + j;
			if (x_input >= 0 && x_input < w_input && y_input >= 0 && y_input < h_input)
				for (size_t k = 0; k < c; k++)
					output[thread_y * w_input + thread_x] += kernel[i * w_kernel * c + j * c + k] * input[y_input * w_input * c + x_input * c + k];
			// else 感受野出界，卷积结果为0，不需要累加。
		}
	}
}
""")

if __name__ == '__main__':
    conv2d_stride1 = mod.get_function("conv2d_stride1")
    # 1.准备数据。
    image = cv2.imread('3.jpg').astype(np.int64)
    h_input, w_input, c = image.shape
    h_kernel, w_kernel = 3, 3
    kernel = np.array([i for i in range(h_kernel * w_kernel * c)]).astype(np.int64).reshape(h_kernel, w_kernel, c)
    dest = np.zeros((h_input, w_input), dtype=np.int64)
    # 2.确定grid和blick各维度的长度。
    thread_total = h_input * w_input
    thread_num = thread_total if thread_total < 1024 else 1024
    blockDim_xy = math.ceil(math.sqrt(thread_num))
    gridDim_y = math.ceil(h_input / blockDim_xy)
    gridDim_x = math.ceil(w_input / blockDim_xy)
    # 3.调用核函数
    conv2d_stride1(cuda.Out(dest), cuda.In(image), cuda.In(kernel),
                   np.uint64(h_input), np.uint64(w_input), np.uint64(h_kernel), np.uint64(w_kernel), np.uint64(c),
                   block=(blockDim_xy, blockDim_xy, 1), grid=(gridDim_x, gridDim_y))
    print(dest.shape)
    print(dest.sum())

卷积核大小固定为 3，步长固定为 1。实际上卷积核边长可以是任意奇数 (2m+1, 2n + 1)。我们实现的卷积默认上下边界各填充 m 行 0，左右边界各填充 n 行 0。

2. 使用 PyTorch 卷积

import cv2
import torch

torch.set_printoptions(sci_mode=False)

if __name__ == '__main__':
    # 1.准备数据。
    image = cv2.imread('3.jpg')
    h_input, w_input, c = image.shape
    image = torch.tensor(image, dtype=torch.int64).reshape(1, h_input, w_input, c)
    image = image.permute(0, 3, 1, 2)
    # 2.准备卷积。
    weight = torch.tensor([i for i in range(3 * 3 * c)], dtype=torch.int64).reshape(1, 3, 3, c)
    weight = weight.permute(0, 3, 1, 2)
    conv = torch.nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, stride=1, padding=1, bias=False)
    conv.weight = torch.nn.Parameter(weight, requires_grad=False)
    # 3.运算。
    dest = conv(image)
    print(dest.shape)
    print(dest.sum())

注意第 11 行和 14 行的维度顺序。对 RGB 通道的图像而言，我们实现的卷积的卷积核也是三通道，第 1 个通道卷积 R 通道，第 2 个通道卷积 G 通道。
多个 float 类型相乘相加，即使小数部分都是 0 也有可能出现精度损失。为了比较我们实现的卷积和 PyTorch 的卷积，我们让双方的数据都设置为 64 位的 int 类型。

4. 参考

平面卷积

qq_26697045

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
卷积

平面一步长卷积1. 平面一步长卷积2. 参考1. 平面一步长卷积把卷积步长设置为 1 的目的是保持特征宽高不变，假设卷积核尺寸为 (2m+1, 2n+1)，则还需要在左右各填充 m 行，上下各填充 n 行，这样卷积前后特征的宽高尺寸就相等了。下面使用 CUDA 实现的卷积：步长只能为 1，输入、输出的通道也都只能为 1；特征尺寸和卷积核尺寸可以任意指定。1. 工具函数void generate_mat(float* mat, size_t h, size_t w) { for (siz
复制链接

扫一扫

专栏目录