文章目录
使用Pytorch的 C++ (CUDA)扩展实现卷积算子
卷积算子
在实现卷积算子时,将卷积算子等价转换为两个阶段:收集(collecting)和 线性变换(linear transform)。在收集阶段,“image to column” 和 “column to image” 过程分别在前向传播中将输入的 X ∈ R B × C × H × W X\in \mathbb{R}^{B\times C\times H\times W} X∈RB×C×H×W 转换为 H ∈ R B × ( C ⋅ K h ⋅ K w ) × H o u t × W o u t H \in \mathbb{R}^{B\times (C\cdot K_h\cdot K_w)\times H_{out}\times W_{out}} H∈RB×(C⋅Kh⋅Kw)×Hout×Wout 和 在反向传播中将传递得到的 H g r a d ∈ R B × ( C ⋅ K h ⋅ K w ) × H o u t × W o u t H_{grad} \in \mathbb{R}^{B\times (C\cdot K_h\cdot K_w)\times H_{out}\times W_{out}} Hgrad∈RB×(C⋅Kh⋅Kw)×Hout×Wout 逆向转换为 X g r a d ∈ R B × C × H × W X_{grad}\in \mathbb{R}^{B\times C\times H\times W} Xgrad∈RB×C×H×W。在线性变换阶段,卷积核权重对前向传播中得到的 H H H 进行加权,或根据 H g r a d H_{grad} Hgrad 得到对应的权重梯度 W g r a d W_{grad} Wgrad。
代码
这里根据设备不同提供两种实现方式,一种是在CPU设备上串行实现,另一种是在GPU设备上并行实现。
CPU 串行版本(Python)
from torch.autograd import Function
# 用于找到对应的数据位置坐标
def data_index_init(offset, clist, *args):
if not args:
return offset
X, *rest = args
offset = data_index_init(offset, clist, *rest)
tmp = offset % X
clist.append(tmp)
# print(offset, x)
return offset // X
# image to column
def im2col_cpu(img, # 图像张量
kernel_h, kernel_w, # 卷积核大小
pad_h, pad_w, # 填充
stride_h, stride_w, # 跨步
dilation_h, dilation_w ): # 卷积核膨胀系数
'''
img: (N, C, H, W)
'''
b,c,h,w = img.shape
out_h = (h - (dilation_h*(kernel_h - 1) + 1) + 2*pad_h)//stride_h + 1
out_w = (w - (dilation_w*(kernel_w - 1) + 1) + 2*pad_w)//stride_w + 1
out_c = c*kernel_h*kernel_w
col = torch.zeros(b, out_c, out_h, out_w).to(img.device)
for col_c in range(out_c):
clist = []
data_index_init(col_c, clist, c, kernel_h, kernel_w)
c_im, offset_h, offset_w = clist[::-1]
for col_h in range(out_h):
h_im = col_h*stride_h - pad_h + offset_h*dilation_h
for col_w in range(out_w):
w_im = col_w*stride_w - pad_w + offset_w*dilation_w
if h_im < h and h_im >=0 and w_im < w and w_im >= 0:
col[:, col_c, col_h, col_w] = img[:, c_im, h_im, w_im]
return col
# column to image
def col2im_cpu(col_gd,
c, h, w,
kernel_h, kernel_w,
pad_h, pad_w,
stride_h, stride_w,
dilation_h, dilation_w):
'''
col: (N, C', H', W')
'''
b,out_c,out_h,out_w = col_gd.shape
img_gd = torch.zeros(b, c, h, w)
for col_c in range(out_c):
clist = []
data_index_init(col_c, clist, c, kernel_h, kernel_w)
c_im, offset_h, offset_w = clist[::-1]
for col_h in range(out_h):
h_im = col_h*stride_h - pad_h + offset_h*dilation_h
for col_w in range(out_w):
w_im = col_w*stride_w - pad_w + offset_w*dilation_w
if h_im < h and h_im >=0 and w_im < w and w_im >= 0:
img_gd[:, c_im, h_im, w_im] += col_gd[:, col_c, col_h, col_w]
return img_gd
# 前向和反向传播,自动求导
class Conv2d_function_cpu(Function):
@staticmethod
def forward(ctx, X, weight, bias, kernel_size, stride, padding, dilation):
'''
X: (B,C,H,W)
weight: (C',C,Kh,Kw)
bias: (1,)
'''
kernel_h, kernel_w = kernel_size
pad_h, pad_w = padding
dilation_h, dilation_w = dilation
stride_h, stride_w = stride
b,c,h,w = X.shape
c1,_,_,_ = weight.shape
ctx.kernel_size = kernel_size
ctx.padding = padding
ctx.dilation = dilation
ctx.stride = stride
ctx.img_size = (h, w)
# im2col
col_X = im2col_cpu(X, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w) # (B, C*Kh*Kw, H', W')
ctx.save_for_backward(col_X, weight, bias)
bias = bias.reshape(1,1,1,1).repeat(len(X),1,1,1) if bias is not None else bias
# linear projection
out = col_X.permute(0,2,3,1) @ weight.permute(1,2,3,0).reshape(-1, c1) # (B, H', W', C*Kh*Kw) x (C*Kh*Kw, C') -> (B, H', W', C')
out = out.permute(0,3,1,2) # (B, C', H', W')
# add bias
out = out + bias if bias is not None else out
return out
@staticmethod
def backward(ctx, out_gd):
col_X, weight, bias = ctx.saved_tensors
kernel_h, kernel_w = ctx.kernel_size
stride_h, stride_w = ctx.stride
pad_h, pad_w = ctx.padding
dilation_h, dilation_w = ctx.dilation
h, w = ctx.img_size
c1,c,kernel_h,kernel_w = weight.shape
# gradient of bias
bias_gd = out_gd.sum(dim=(1,2,3)) if bias is not None else None
bias_gd = torch.mean(bias_gd, dim=(0,), keepdim=True)
# gradient of weight
weight_gd = out_gd.permute(1,0,2,3).reshape(c1, -1) @ col_X.permute(0,2,3,1).reshape(-1, c*kernel_h*kernel_w) # (C', C*Kh*Kw)
weight_gd = weight_gd.reshape(c1,c,kernel_h,kernel_w)
# gradient of X
X_gd = out_gd.permute(0,2,3,1) @ weight.reshape(c1, -1) # (B, H', W', C') x (C', C*Kh*Kw) -> (B, H', W', C*Kh*Kw)
X_gd = X_gd.permute(0,3,1,2)
# col2im
X_gd = col2im_cpu(X_gd, c, h, w,
kernel_h, kernel_w,
pad_h, pad_w,
stride_h, stride_w,
dilation_h, dilation_w)
return X_gd, weight_gd, bias_gd, None, None, None, None
GPU 并行版本(Python,C++,CUDA混合编程)
-
CUDA 代码
头文件 im2col_col2im.cuh
#include <algorithm> #include <cstdio> #include <cstring> #include <ATen/ATen.h> #include <ATen/OpMathType.h> #include <ATen/cuda/CUDAContext.h> #include <THC/THCAtomics.cuh> #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } #define opmath_t at::opmath_type<scalar_t> template <typename scalar_t> __global__ void im2col_kernel(const int64_t n, const scalar_t* img, scalar_t* col, const int64_t h, const int64_t w, const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w, const int64_t col_h, const int64_t col_w ){ CUDA_KERNEL_LOOP(index, n){ int64_t w_out = index % col_w; int64_t h_out = (index / col_w) % col_h; int64_t c_in = (index / col_w) / col_h; int64_t c_out = c_in * kernel_h * kernel_w; int64_t h_in = h_out * stride_h - pad_h; int64_t w_in = w_out * stride_w - pad_w; scalar_t* col_idx = col + (c_out * col_h + h_out) * col_w + w_out; const scalar_t* img_idx = img + (c_in * h + h_in) * w + w_in; for(int64_t i=0;i<kernel_h;i++){ int64_t h_k = h_in + i * dilation_h; for(int64_t j=0;j<kernel_w;j++){ int64_t w_k = w_in + j * dilation_w; if(h_k >= 0 && h_k < h && w_k >= 0 && w_k < w) *col_idx = img_idx[i*dilation_h*w + j*dilation_w]; col += col_h * col_w; } } } } template <typename scalar_t> void im2col( cudaStream_t stream, const scalar_t* img, scalar_t* col, const int64_t c, const int64_t h, const int64_t w, const int64_t col_h, const int64_t col_w, const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w ){ int64_t num_kernels = c * h * w; im2col_kernel<scalar_t> <<<GET_BLOCKS(num_kernels, 1024), 1024, 0, stream>>>( num_kernels, img, col, h, w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_h, col_w ); } template <typename scalar_t> __global__ void col2im_kernel(const int64_t n, const scalar_t* col, scalar_t* img, const int64_t h, const int64_t w, const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w, const int64_t col_h, const int64_t col_w ){ CUDA_KERNEL_LOOP(index, n){ const int64_t w_im = index % w + pad_w; const int64_t h_im = (index / w) % h + pad_h; const int64_t c_im = index / (w * h); int64_t dilat_k_h = (kernel_h - 1)* dilation_h + 1; int64_t dilat_k_w = (kernel_w - 1)* dilation_w + 1; const int64_t w_col_s = (w_im < dilat_k_w) ? 0 : (w_im - dilat_k_w) / stride_w + 1; const int64_t w_col_e = ::min(w_im/stride_w + 1, col_w); const int64_t h_col_s = (h_im < stride_h) ? 0 : (h_im - dilat_k_h) / stride_h + 1; const int64_t h_col_e = ::min(h_im / stride_h + 1, col_h); scalar_t val = static_cast<scalar_t>(0); for(int64_t h_col = h_col_s; h_col < h_col_e; h_col += 1){ int64_t h_k = (h_im - h_col * stride_h); for(int64_t w_col = w_col_s; w_col < w_col_e; w_col += 1){ int64_t w_k = (w_im - w_col * stride_w); if(h_k % dilation_h == 0 && w_k % dilation_w == 0){ h_k /= dilation_h; w_k /= dilation_w; int64_t data_col_index = ((((c_im * kernel_h + h_k)* kernel_w + w_k)* col_h + h_col)* col_w + w_col); val += col[data_col_index]; } } } img[index] = val; } } template <typename scalar_t> void col2im( cudaStream_t stream, const scalar_t* col, scalar_t* img, const int64_t c, const int64_t h, const int64_t w, const int64_t col_h, const int64_t col_w, const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w, const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, const int64_t dilation_w ){ int64_t num_kernels = c * h * w; col2im_kernel<scalar_t> <<<GET_BLOCKS(num_kernels, 1024), 1024, 0, stream>>>( num_kernels, col, img, h, w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, col_h, col_w ); }
主体文件 im2col_col2im.cu
#include "im2col_col2im.cuh" #include <ATen/ATen.h> #include <ATen/cuda/CUDAContext.h> #include <cuda.h> #include <cuda_runtime.h> #include <torch/torch.h> at::Tensor conv2d_im2col(const at::Tensor &input, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w ){ AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous"); AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); const int b = input.size(0); const int c = input.size(1); const int h = input.size(2); const int w = input.size(3); const int col_h = (h - (dilation_h*(kernel_h - 1) + 1) + 2*pad_h)/stride_h + 1; const int col_w = (w - (dilation_w*(kernel_w - 1) + 1) + 2*pad_w)/stride_w + 1; const int col_c = c * kernel_h * kernel_w; auto output = at::zeros({b, col_c, col_h, col_w}, input.options()); for(int n = 0; n < b; n++){ auto slice_input = input.select(0, n); auto slice_output = output.select(0, n); AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "conv2d_im2col_selfdefine_forward", ([&]{ im2col( at::cuda::getCurrentCUDAStream(), slice_input.data<scalar_t>(), slice_output.data<scalar_t>(), c, h, w, col_h, col_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w ); })); } return output; } at::Tensor conv2d_col2im( const at::Tensor &input, const at::Tensor &grad_output, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w ){ AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous"); AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(grad_output.is_contiguous(), "gradient of output tensor has to be contiguous"); AT_ASSERTM(grad_output.type().is_cuda(), "gradient of output tensor must be a CUDA tensor"); const int b = input.size(0); const int c = input.size(1); const int h = input.size(2); const int w = input.size(3); const int col_h = grad_output.size(2); const int col_w = grad_output.size(3); const int col_c = grad_output.size(1); auto grad_input = at::zeros_like(input, input.dtype()); for(int n = 0; n < b; n ++){ auto slice_grad_input = grad_input.select(0, n); auto slice_grad_output = grad_output.select(0, n); AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "conv2d_col2im_selfdefine_backward", ([&]{ col2im( at::cuda::getCurrentCUDAStream(), slice_grad_output.data<scalar_t>(), slice_grad_input.data<scalar_t>(), c, h, w, col_h, col_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w ); })); } return grad_input; }
-
C++代码,使用Pybind将CUDA代码和Python解释器绑定
文件 im2col_vision.cpp
#include <torch/extension.h> at::Tensor conv2d_im2col(const at::Tensor &input, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w ); at::Tensor conv2d_col2im( const at::Tensor &input, const at::Tensor &grad_output, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w ); PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("conv2d_im2col_cuda", &conv2d_im2col, "im2col_cuda"); m.def("conv2d_col2im_cuda", &conv2d_col2im, "col2im_cuda"); }
-
打包编译文件
文件 setup.py,在项目目录中使用命令 python setup.py install 将其编译安装到python中。
import os import glob import torch from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu")) sources = main_file extension = CppExtension extra_compile_args = {"cxx": []} define_macros = [] if torch.cuda.is_available() and CUDA_HOME is not None: extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ # "-DCUDA_HAS_FP16=1", # "-D__CUDA_NO_HALF_OPERATORS__", # "-D__CUDA_NO_HALF_CONVERSIONS__", # "-D__CUDA_NO_HALF2_OPERATORS__", ] else: raise NotImplementedError('Cuda is not availabel') sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ extension( "conv2d_im2col_col2im_selfdefine", sources, include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args, ) ] return ext_modules setup( name="conv2d_im2col_col2im_selfdefine", version="1.0", author="Mark Yu (Zewen Yu)", url=None, description= "PyTorch Wrapper for CUDA Functions of Core Operations Im2col and Col2im", packages=find_packages(exclude=( "configs", "tests", )), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, )
-
自动求导
文件 funciton.py
import torch import torch.nn.functional as F from torch.autograd import Function from conv2d_im2col_col2im_selfdefine import conv2d_im2col_cuda, conv2d_col2im_cuda class Conv2d_function_gpu(Function): @staticmethod def forward(ctx, X, weight, bias, kernel_size, stride, padding, dilation): ''' X: (B,C,H,W) weight: (C',C,Kh,Kw) bias: (1,) ''' kernel_h, kernel_w = kernel_size pad_h, pad_w = padding dilation_h, dilation_w = dilation stride_h, stride_w = stride b,c,h,w = X.shape c1,_,_,_ = weight.shape ctx.kernel_size = kernel_size ctx.padding = padding ctx.dilation = dilation ctx.stride = stride ctx.img_size = (h, w) # im2col col_X = conv2d_im2col_cuda(X, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w) ctx.save_for_backward(X, col_X, weight, bias) bias = bias.reshape(1,1,1,1).repeat(b,1,1,1) if bias is not None else None # linear projection out = col_X.permute(0,2,3,1) @ weight.permute(1,2,3,0).reshape(-1, c1) out = out.permute(0,3,1,2) # add bias out = out + bias if bias is not None else out return out @staticmethod def backward(ctx, out_gd): X, col_X, weight, bias = ctx.saved_tensors kernel_h, kernel_w = ctx.kernel_size stride_h, stride_w = ctx.stride pad_h, pad_w = ctx.padding dilation_h, dilation_w = ctx.dilation h, w = ctx.img_size c1,c,kernel_h,kernel_w = weight.shape # gradient of bias bias_gd = out_gd.sum(dim=(1,2,3)) if bias is not None else None bias_gd = torch.mean(bias_gd, dim=(0,), keepdim=True) # gradient of weight weight_gd = out_gd.permute(1,0,2,3).reshape(c1, -1) @ col_X.permute(0,2,3,1).reshape(-1, c*kernel_h*kernel_w) # (C', C*Kh*Kw) weight_gd = weight_gd.reshape(c1,c,kernel_h,kernel_w) # gradient of X X_gd = out_gd.permute(0,2,3,1) @ weight.reshape(c1, -1) X_gd = X_gd.permute(0,3,1,2).contiguous() # col2im X_gd = conv2d_col2im_cuda(X, X_gd, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w) return X_gd, weight_gd, bias_gd, None, None, None, None
卷积层实现
文件 conv2d_selfdefine.py
import torch, math
from torch.nn import Module, Parameter, init
from ..functions.function import Conv2d_function_gpu, Conv2d_function_cpu
class Conv2d(Module):
def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, bias=True):
super().__init__()
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
if isinstance(stride, int):
stride = (stride, stride)
if isinstance(padding, int):
padding = (padding, padding)
if isinstance(dilation, int):
dilation = (dilation, dilation)
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.dilation = dilation
self.weight = Parameter(torch.empty((out_channels, in_channels, *kernel_size)))
self.bias = Parameter(torch.empty((1,))) if bias else None
self._reset_parameters()
def _reset_parameters(self):
init.kaiming_normal_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
bound = 1. / math.sqrt(fan_in) if fan_in > 0 else 0
init.uniform_(self.bias, -bound, bound)
def forward(self, X):
assert X.shape[1] == self.weight.shape[1], f'inputs channels is not equal to the channels of weight: {X.shape[1]} != {self.weight.shape[1]}'
assert X.device == self.weight.device, f'inputs and weight should be on same device, {X.device} != {self.weight.device}'
if self.bias is not None:
assert X.device == self.bias.device, f'inputs and bias should be on same device, {X.device} != {self.bias.device}'
if X.is_cuda:
out = Conv2d_function_gpu.apply(X, self.weight, self.bias,
self.kernel_size, self.stride, self.padding, self.dilation)
else:
out = Conv2d_function_cpu.apply(X, self.weight, self.bias,
self.kernel_size, self.stride, self.padding, self.dilation)
return out
实验
- 设备和软件信息
设备或软件 | 版本 |
---|---|
CPU | Intel® Xeon® CPU E5-2695 v2 @ 2.40GHz, 24 |
GPU | NVIDIA GeForce RTX 3060, 1 |
系统 | Ubuntu 20.04.6 LTS (GNU/Linux 5.15.0-119-generic x86_64) |
Pytorch | 2.2.1+cu121 |
CUDA | 12.2 |
nvcc | release 12.1, V12.1.105 Build cuda_12.1.r12.1/compiler.32688072_0 |
- 目录结构
目录名称 im2col_col2im_selfdef
.
├── functions
│ ├── function.py
│ └── __init__.py
├── modules
│ ├── conv2d_selfdefine.py
│ └── __init__.py
├── setup.py
└── src
├── im2col_col2im.cu
├── im2col_col2im.cuh
└── im2col_vision.cpp
3 directories, 8 files
- 测试文件和结果
测试文件 test_selfdef_conv2d.py
import torch
from im2col_col2im_selfdef.modules import Conv2d
inputs = torch.randn(2,3,56,56)
inputs_gpu = torch.randn(2,3,56,56).cuda()
conv2d = Conv2d(3, 16, 3, 1, 1)
conv2d_gpu = Conv2d(3, 16, 3, 1, 1).cuda()
print('Test for our selfdefine conv2d on cpu')
print('if gradient of weight exists: ', conv2d.weight.grad!=None)
outputs = conv2d(inputs)
loss = outputs.sum()
loss.backward()
print('if gradient of weight exists: ', conv2d.weight.grad!=None, 'sum of it: ', conv2d.weight.grad.sum())
print('Test for our selfdefine conv2d on gpu')
print('if gradient of weight exists: ', conv2d_gpu.weight.grad!=None)
outputs_gpu = conv2d_gpu(inputs_gpu)
loss_gpu = outputs_gpu.sum()
loss_gpu.backward()
print('if gradient of weight exists: ', conv2d_gpu.weight.grad!=None, 'sum of it: ', conv2d_gpu.weight.grad.sum())
终端输出
root@I1c0b18178e0050191d:~# python test.py
Test for our selfdefine conv2d on cpu
if gradient of weight exists: False
if gradient of weight exists: True sum of it: tensor(-8846.4951)
Test for our selfdefine conv2d on gpu
if gradient of weight exists: False
if gradient of weight exists: True sum of it: tensor(3104.9429, device='cuda:0')
项目地址
代码放在 github 上了, 链接 https://github.com/MarcYugo/a-conv2d-example-self-define-build。