Implementing a Convolution Operator with PyTorch's C++ (CUDA) Extension

The Convolution Operator

Figure 1. The two stages of the convolution-operator implementation. For simplicity, only the single-channel case is drawn.

When implementing the convolution operator, it is equivalently decomposed into two stages: collecting and linear transform. In the collecting stage, the "image to column" (im2col) and "column to image" (col2im) routines respectively convert the input $X\in \mathbb{R}^{B\times C\times H\times W}$ into $H \in \mathbb{R}^{B\times (C\cdot K_h\cdot K_w)\times H_{out}\times W_{out}}$ in the forward pass, and convert the incoming gradient $H_{grad} \in \mathbb{R}^{B\times (C\cdot K_h\cdot K_w)\times H_{out}\times W_{out}}$ back into $X_{grad}\in \mathbb{R}^{B\times C\times H\times W}$ in the backward pass. In the linear-transform stage, the convolution weights are applied to the $H$ produced in the forward pass, while in the backward pass the weight gradient $W_{grad}$ is computed from $H_{grad}$.
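
To make the equivalence concrete, here is a minimal sketch (using PyTorch's built-in torch.nn.functional.unfold in place of the custom im2col below) showing that "collect, then linearly transform" reproduces F.conv2d:

import torch
import torch.nn.functional as F

B, C, H, W = 2, 3, 8, 8
C_out, Kh, Kw = 4, 3, 3
X = torch.randn(B, C, H, W)
weight = torch.randn(C_out, C, Kh, Kw)

# collecting stage: (B, C*Kh*Kw, H_out*W_out)
cols = F.unfold(X, kernel_size=(Kh, Kw), padding=1, stride=1, dilation=1)
# linear-transform stage: (C_out, C*Kh*Kw) x (B, C*Kh*Kw, H_out*W_out) -> (B, C_out, H_out*W_out)
out = weight.reshape(C_out, -1) @ cols
out = out.reshape(B, C_out, H, W)  # H_out = H and W_out = W for stride 1, padding 1

print(torch.allclose(out, F.conv2d(X, weight, padding=1), atol=1e-5))  # True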

Code

Two implementations are provided, depending on the device: a serial implementation for the CPU and a parallel implementation for the GPU.

CPU serial version (Python)
import torch
from torch.autograd import Function

# Decompose a flat index into per-dimension coordinates
def data_index_init(offset, clist, *args):
    if not args:
        return offset
    X, *rest = args
    offset = data_index_init(offset, clist, *rest)
    tmp = offset % X
    clist.append(tmp)
    # print(offset, x)
    return offset // X
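# Example: data_index_init(col_c, clist, c, kernel_h, kernel_w) decomposes the flat
# column-channel index col_c as (c_im*kernel_h + offset_h)*kernel_w + offset_w;
# clist is filled in reverse, so clist[::-1] below yields (c_im, offset_h, offset_w).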

# image to column
def im2col_cpu(img,  # image tensor
               kernel_h, kernel_w, # kernel size
               pad_h, pad_w, # padding
               stride_h, stride_w, # stride
               dilation_h, dilation_w ): # kernel dilation factor
    '''
        img: (N, C, H, W)
    '''
    b,c,h,w = img.shape
    out_h = (h - (dilation_h*(kernel_h - 1) + 1) + 2*pad_h)//stride_h + 1
    out_w = (w - (dilation_w*(kernel_w - 1) + 1) + 2*pad_w)//stride_w + 1
    out_c = c*kernel_h*kernel_w

    col = torch.zeros(b, out_c, out_h, out_w).to(img.device)

    for col_c in range(out_c):
        clist = []
        data_index_init(col_c, clist, c, kernel_h, kernel_w)
        c_im, offset_h, offset_w = clist[::-1]
        for col_h in range(out_h):
            h_im = col_h*stride_h - pad_h + offset_h*dilation_h
            for col_w in range(out_w):
                w_im = col_w*stride_w - pad_w + offset_w*dilation_w
                if h_im < h and h_im >=0 and w_im < w and w_im >= 0:
                    col[:, col_c, col_h, col_w] = img[:, c_im, h_im, w_im]
    return col

# column to image (the adjoint of im2col: scatter-adds column gradients back to image positions)
def col2im_cpu(col_gd,
               c, h, w,
               kernel_h, kernel_w,
               pad_h, pad_w,
               stride_h, stride_w,
               dilation_h, dilation_w):
    '''
        col: (N, C', H', W')
    '''
    b,out_c,out_h,out_w = col_gd.shape
    img_gd = torch.zeros(b, c, h, w, device=col_gd.device)
    for col_c in range(out_c):
        clist = []
        data_index_init(col_c, clist, c, kernel_h, kernel_w)
        c_im, offset_h, offset_w = clist[::-1]
        for col_h in range(out_h):
            h_im = col_h*stride_h - pad_h + offset_h*dilation_h
            for col_w in range(out_w):
                w_im = col_w*stride_w - pad_w + offset_w*dilation_w
                if h_im < h and h_im >=0 and w_im < w and w_im >= 0:
                    img_gd[:, c_im, h_im, w_im] += col_gd[:, col_c, col_h, col_w]
    return img_gd

# Forward and backward passes with automatic differentiation
class Conv2d_function_cpu(Function):
    @staticmethod
    def forward(ctx, X, weight, bias, kernel_size, stride, padding, dilation):
        '''
            X: (B,C,H,W)
            weight: (C',C,Kh,Kw)
            bias: (1,)
        '''
        kernel_h, kernel_w = kernel_size
        pad_h, pad_w = padding
        dilation_h, dilation_w = dilation
        stride_h, stride_w = stride
        b,c,h,w = X.shape
        c1,_,_,_ = weight.shape
        ctx.kernel_size = kernel_size
        ctx.padding = padding
        ctx.dilation = dilation
        ctx.stride = stride
        ctx.img_size = (h, w)
        # im2col
        col_X = im2col_cpu(X, kernel_h, kernel_w,
                        pad_h, pad_w, stride_h, stride_w,
                        dilation_h, dilation_w) # (B, C*Kh*Kw, H', W')
        ctx.save_for_backward(col_X, weight, bias)
        bias = bias.reshape(1,1,1,1).repeat(len(X),1,1,1) if bias is not None else bias
        # linear projection
        out = col_X.permute(0,2,3,1) @ weight.permute(1,2,3,0).reshape(-1, c1) # (B, H', W', C*Kh*Kw) x (C*Kh*Kw, C') -> (B, H', W', C')
        out = out.permute(0,3,1,2) # (B, C', H', W')
        # add bias
        out = out + bias if bias is not None else out
        return out
    @staticmethod
    def backward(ctx, out_gd):
        col_X, weight, bias = ctx.saved_tensors
        kernel_h, kernel_w = ctx.kernel_size
        stride_h, stride_w = ctx.stride
        pad_h, pad_w = ctx.padding
        dilation_h, dilation_w = ctx.dilation
        h, w = ctx.img_size
        c1,c,kernel_h,kernel_w = weight.shape
        # gradient of bias
        bias_gd = out_gd.sum(dim=(1,2,3)) if bias is not None else None
        bias_gd = torch.mean(bias_gd, dim=(0,), keepdim=True) if bias_gd is not None else None
        # gradient of weight
        weight_gd = out_gd.permute(1,0,2,3).reshape(c1, -1) @ col_X.permute(0,2,3,1).reshape(-1, c*kernel_h*kernel_w) # (C', C*Kh*Kw)
        weight_gd = weight_gd.reshape(c1,c,kernel_h,kernel_w)
        # gradient of X
        X_gd = out_gd.permute(0,2,3,1) @ weight.reshape(c1, -1) # (B, H', W', C') x (C', C*Kh*Kw) -> (B, H', W', C*Kh*Kw)
        X_gd = X_gd.permute(0,3,1,2)
        # col2im
        X_gd = col2im_cpu(X_gd, c, h, w,
                          kernel_h, kernel_w,
                          pad_h, pad_w, 
                          stride_h, stride_w,
                          dilation_h, dilation_w)
        return X_gd, weight_gd, bias_gd, None, None, None, None
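
A quick correctness sketch for the CPU path: compare the custom function against torch.nn.functional.conv2d with the same weights. The bias here is a single scalar shared by all output channels, so the reference bias is expanded:

import torch.nn.functional as F

X = torch.randn(2, 3, 8, 8)
W = torch.randn(4, 3, 3, 3)
b = torch.randn(1)
out = Conv2d_function_cpu.apply(X, W, b, (3, 3), (1, 1), (1, 1), (1, 1))
ref = F.conv2d(X, W, b.expand(4), stride=1, padding=1, dilation=1)
print(torch.allclose(out, ref, atol=1e-5))  # expected: True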
GPU parallel version (mixed Python, C++, and CUDA)
  • CUDA code

    Header file im2col_col2im.cuh

    #include <algorithm>
    #include <cstdio>
    #include <cstring>
    
    #include <ATen/ATen.h>
    #include <ATen/OpMathType.h>
    #include <ATen/cuda/CUDAContext.h>
    
    #define CUDA_KERNEL_LOOP(i, n)                                                 \
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);               \
            i += blockDim.x * gridDim.x)
    
    inline int GET_BLOCKS(const int N, const int num_threads) {
        return (N + num_threads - 1) / num_threads;
    }
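    // GET_BLOCKS returns ceil(N / num_threads); together with the grid-stride
    // CUDA_KERNEL_LOOP above, every index in [0, N) is processed even when the
    // launched grid covers fewer than N threads.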
    
    #define opmath_t at::opmath_type<scalar_t>
    
    template <typename scalar_t>
    __global__ void im2col_kernel(const int64_t n,
                                    const scalar_t* img, scalar_t* col, 
                                    const int64_t h, const int64_t w,
                                    const int64_t kernel_h, const int64_t kernel_w,
                                    const int64_t pad_h, const int64_t pad_w,
                                    const int64_t stride_h, const int64_t stride_w,
                                    const int64_t dilation_h, const int64_t dilation_w,
                                    const int64_t col_h, const int64_t col_w
    ){
        CUDA_KERNEL_LOOP(index, n){
            int64_t w_out = index % col_w;
            int64_t h_out = (index / col_w) % col_h;
            int64_t c_in = (index / col_w) / col_h;
            int64_t c_out = c_in * kernel_h * kernel_w;
            int64_t h_in = h_out * stride_h - pad_h;
            int64_t w_in = w_out * stride_w - pad_w;
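            // One thread per (input channel, output position); it writes the
            // kernel_h * kernel_w column entries belonging to that position.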
    
            scalar_t* col_idx = col + (c_out * col_h + h_out) * col_w + w_out;
            const scalar_t* img_idx = img + (c_in * h + h_in) * w + w_in;
    
            for(int64_t i=0;i<kernel_h;i++){
                int64_t h_k = h_in + i * dilation_h;
                for(int64_t j=0;j<kernel_w;j++){
                    int64_t w_k = w_in + j * dilation_w;
                    if(h_k >= 0 && h_k < h && w_k >= 0 && w_k < w)
                        *col_idx = img_idx[i*dilation_h*w + j*dilation_w];
                    col_idx += col_h * col_w;
                }
            }
        }
    }
    
    template <typename scalar_t>
    void im2col(
        cudaStream_t stream,
        const scalar_t* img, scalar_t* col,
        const int64_t c, const int64_t h, const int64_t w,
        const int64_t col_h, const int64_t col_w,
        const int64_t kernel_h, const int64_t kernel_w,
        const int64_t pad_h, const int64_t pad_w,
        const int64_t stride_h, const int64_t stride_w,
        const int64_t dilation_h, const int64_t dilation_w
    ){
        int64_t num_kernels = c * col_h * col_w;
        im2col_kernel<scalar_t>
        <<<GET_BLOCKS(num_kernels, 1024), 1024, 0, stream>>>(
            num_kernels,
            img, col,
            h, w,
            kernel_h, kernel_w,
            pad_h, pad_w,
            stride_h, stride_w,
            dilation_h, dilation_w,
            col_h, col_w );
    }
    
    template <typename scalar_t>
    __global__ void col2im_kernel(const int64_t n,
                                    const scalar_t* col, scalar_t* img, 
                                    const int64_t h, const int64_t w,
                                    const int64_t kernel_h, const int64_t kernel_w,
                                    const int64_t pad_h, const int64_t pad_w,
                                    const int64_t stride_h, const int64_t stride_w,
                                    const int64_t dilation_h, const int64_t dilation_w,
                                    const int64_t col_h, const int64_t col_w
    ){
        CUDA_KERNEL_LOOP(index, n){
            const int64_t w_im = index % w + pad_w;
            const int64_t h_im = (index / w) % h + pad_h;
            const int64_t c_im = index / (w * h);
            int64_t dilat_k_h = (kernel_h - 1)* dilation_h + 1;
            int64_t dilat_k_w = (kernel_w - 1)* dilation_w + 1;
            const int64_t w_col_s = (w_im < dilat_k_w) ? 0 : (w_im - dilat_k_w) / stride_w + 1;
            const int64_t w_col_e = ::min(w_im/stride_w + 1, col_w);
            const int64_t h_col_s = (h_im < dilat_k_h) ? 0 : (h_im - dilat_k_h) / stride_h + 1;
            const int64_t h_col_e = ::min(h_im / stride_h + 1, col_h);
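            // Gather formulation: each image element accumulates every column
            // entry that maps back to it, so no atomic adds are required.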
    
            scalar_t val = static_cast<scalar_t>(0);
    
            for(int64_t h_col = h_col_s; h_col < h_col_e; h_col += 1){
                for(int64_t w_col = w_col_s; w_col < w_col_e; w_col += 1){
                    int64_t h_k = (h_im - h_col * stride_h);
                    int64_t w_k = (w_im - w_col * stride_w);
                    if(h_k % dilation_h == 0 && w_k % dilation_w == 0){
                        h_k /= dilation_h;
                        w_k /= dilation_w;
                        int64_t data_col_index = ((((c_im * kernel_h + h_k)* kernel_w + w_k)* col_h + h_col)* col_w + w_col);
                        val += col[data_col_index];
                    }
                }
            }
            img[index] = val;
        }
    }
    
    template <typename scalar_t>
    void col2im(
        cudaStream_t stream,
        const scalar_t* col, scalar_t* img,
        const int64_t c, const int64_t h, const int64_t w,
        const int64_t col_h, const int64_t col_w,
        const int64_t kernel_h, const int64_t kernel_w,
        const int64_t pad_h, const int64_t pad_w,
        const int64_t stride_h, const int64_t stride_w,
        const int64_t dilation_h, const int64_t dilation_w
    ){
        int64_t num_kernels = c * h * w;
        col2im_kernel<scalar_t>
        <<<GET_BLOCKS(num_kernels, 1024), 1024, 0, stream>>>(
            num_kernels,
            col, img,
            h, w,
            kernel_h, kernel_w,
            pad_h, pad_w,
            stride_h, stride_w,
            dilation_h, dilation_w,
            col_h, col_w );
    }
    

    Main file im2col_col2im.cu

    #include "im2col_col2im.cuh"
    
    #include <ATen/ATen.h>
    #include <ATen/cuda/CUDAContext.h>
    #include <cuda.h>
    #include <cuda_runtime.h>
    #include <torch/torch.h>
    
    at::Tensor conv2d_im2col(const at::Tensor &input,
                            const int kernel_h, const int kernel_w,
                            const int stride_h, const int stride_w,
                            const int pad_h, const int pad_w,
                            const int dilation_h, const int dilation_w
    ){
        AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
        AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
        const int b = input.size(0);
        const int c = input.size(1);
        const int h = input.size(2);
        const int w = input.size(3);
        const int col_h = (h - (dilation_h*(kernel_h - 1) + 1) + 2*pad_h)/stride_h + 1;
        const int col_w = (w - (dilation_w*(kernel_w - 1) + 1) + 2*pad_w)/stride_w + 1;
        const int col_c = c * kernel_h * kernel_w;
        auto output = at::zeros({b, col_c, col_h, col_w}, input.options());
        for(int n = 0; n < b; n++){
            auto slice_input = input.select(0, n);
            auto slice_output = output.select(0, n);
            AT_DISPATCH_FLOATING_TYPES_AND_HALF(
                input.type(), "conv2d_im2col_selfdefine_forward", ([&]{
                    im2col(
                        at::cuda::getCurrentCUDAStream(),
                        slice_input.data<scalar_t>(), slice_output.data<scalar_t>(),
                        c, h, w,
                        col_h, col_w,
                        kernel_h, kernel_w,
                        pad_h, pad_w,
                        stride_h, stride_w,
                        dilation_h, dilation_w );
                }));
        }
        return output;
    }
    
    at::Tensor conv2d_col2im(
        const at::Tensor &input, const at::Tensor &grad_output,
        const int kernel_h, const int kernel_w,
        const int stride_h, const int stride_w,
        const int pad_h, const int pad_w,
        const int dilation_h, const int dilation_w
    ){
        AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
        AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
        AT_ASSERTM(grad_output.is_contiguous(), "gradient of output tensor has to be contiguous");
        AT_ASSERTM(grad_output.type().is_cuda(), "gradient of output tensor must be a CUDA tensor");
        
        const int b = input.size(0);
        const int c = input.size(1);
        const int h = input.size(2);
        const int w = input.size(3);
        const int col_h = grad_output.size(2);
        const int col_w = grad_output.size(3);
        const int col_c = grad_output.size(1);
    
        auto grad_input = at::zeros_like(input, input.dtype());
    
        for(int n = 0; n < b; n ++){
            auto slice_grad_input = grad_input.select(0, n);
            auto slice_grad_output = grad_output.select(0, n);
            AT_DISPATCH_FLOATING_TYPES_AND_HALF(
                input.type(), "conv2d_col2im_selfdefine_backward", ([&]{
                    col2im(
                        at::cuda::getCurrentCUDAStream(),
                        slice_grad_output.data<scalar_t>(), slice_grad_input.data<scalar_t>(),
                        c, h, w,
                        col_h, col_w,
                        kernel_h, kernel_w,
                        pad_h, pad_w,
                        stride_h, stride_w,
                        dilation_h, dilation_w );
                }));
        }
    
        return grad_input;
    }
    
  • C++ code: use pybind11 to bind the CUDA code to the Python interpreter

    File im2col_vision.cpp

    #include <torch/extension.h>
    
    at::Tensor conv2d_im2col(const at::Tensor &input,
                            const int kernel_h, const int kernel_w,
                            const int stride_h, const int stride_w,
                            const int pad_h, const int pad_w,
                            const int dilation_h, const int dilation_w
    );
    
    at::Tensor conv2d_col2im(
        const at::Tensor &input, const at::Tensor &grad_output,
        const int kernel_h, const int kernel_w,
        const int stride_h, const int stride_w,
        const int pad_h, const int pad_w,
        const int dilation_h, const int dilation_w
    );
    
    PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
        m.def("conv2d_im2col_cuda", &conv2d_im2col, "im2col_cuda");
        m.def("conv2d_col2im_cuda", &conv2d_col2im, "col2im_cuda");
    }
    
  • Packaging and build file

    File setup.py. Run the command python setup.py install in the project directory to compile the extension and install it into Python.

    import os
    import glob
    
    import torch
    
    from torch.utils.cpp_extension import CUDA_HOME
    from torch.utils.cpp_extension import CppExtension
    from torch.utils.cpp_extension import CUDAExtension
    
    from setuptools import find_packages
    from setuptools import setup
    
    requirements = ["torch", "torchvision"]
    
    
    def get_extensions():
        this_dir = os.path.dirname(os.path.abspath(__file__))
        extensions_dir = os.path.join(this_dir, "src")
    
        main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
        source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"))
    
        sources = main_file
        extension = CppExtension
        extra_compile_args = {"cxx": []}
        define_macros = []
    
        if torch.cuda.is_available() and CUDA_HOME is not None:
            extension = CUDAExtension
            sources += source_cuda
            define_macros += [("WITH_CUDA", None)]
            extra_compile_args["nvcc"] = [
                # "-DCUDA_HAS_FP16=1",
                # "-D__CUDA_NO_HALF_OPERATORS__",
                # "-D__CUDA_NO_HALF_CONVERSIONS__",
                # "-D__CUDA_NO_HALF2_OPERATORS__",
            ]
        else:
            raise NotImplementedError('CUDA is not available')
    
        sources = [os.path.join(extensions_dir, s) for s in sources]
        include_dirs = [extensions_dir]
        ext_modules = [
            extension(
                "conv2d_im2col_col2im_selfdefine",
                sources,
                include_dirs=include_dirs,
                define_macros=define_macros,
                extra_compile_args=extra_compile_args,
            )
        ]
        return ext_modules
    
    setup(
        name="conv2d_im2col_col2im_selfdefine",
        version="1.0",
        author="Mark Yu (Zewen Yu)",
        url=None,
        description=
        "PyTorch Wrapper for CUDA Functions of Core Operations Im2col and Col2im",
        packages=find_packages(exclude=(
            "configs",
            "tests",
        )),
        ext_modules=get_extensions(),
        cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
    )
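
    After python setup.py install succeeds, a minimal sanity check (a sketch; the module name follows the name argument above) is:

    import conv2d_im2col_col2im_selfdefine as ext
    print(hasattr(ext, "conv2d_im2col_cuda"), hasattr(ext, "conv2d_col2im_cuda"))  # True True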
    
  • Autograd function

    File function.py

    import torch
    import torch.nn.functional as F
    from torch.autograd import Function
    
    from conv2d_im2col_col2im_selfdefine import conv2d_im2col_cuda, conv2d_col2im_cuda
    
    class Conv2d_function_gpu(Function):
        @staticmethod
        def forward(ctx, X, weight, bias, kernel_size, stride, padding, dilation):
            '''
                X: (B,C,H,W)
                weight: (C',C,Kh,Kw)
                bias: (1,)
            '''
            kernel_h, kernel_w = kernel_size
            pad_h, pad_w = padding
            dilation_h, dilation_w = dilation
            stride_h, stride_w = stride
            b,c,h,w = X.shape
            c1,_,_,_ = weight.shape
            ctx.kernel_size = kernel_size
            ctx.padding = padding
            ctx.dilation = dilation
            ctx.stride = stride
            ctx.img_size = (h, w)
            # im2col
            col_X = conv2d_im2col_cuda(X, 
                                    kernel_h, kernel_w,
                                    stride_h, stride_w,
                                    pad_h, pad_w,
                                    dilation_h, dilation_w)
            ctx.save_for_backward(X, col_X, weight, bias)
            bias = bias.reshape(1,1,1,1).repeat(b,1,1,1) if bias is not None else None
            # linear projection
            out = col_X.permute(0,2,3,1) @ weight.permute(1,2,3,0).reshape(-1, c1)
            out = out.permute(0,3,1,2)
            # add bias
            out = out + bias if bias is not None else out
            return out
        
        @staticmethod
        def backward(ctx, out_gd):
            X, col_X, weight, bias = ctx.saved_tensors
            kernel_h, kernel_w = ctx.kernel_size
            stride_h, stride_w = ctx.stride
            pad_h, pad_w = ctx.padding
            dilation_h, dilation_w = ctx.dilation
            h, w = ctx.img_size
            c1,c,kernel_h,kernel_w = weight.shape
            # gradient of bias
            bias_gd = out_gd.sum(dim=(1,2,3)) if bias is not None else None
            bias_gd = torch.mean(bias_gd, dim=(0,), keepdim=True) if bias_gd is not None else None
            # gradient of weight
            weight_gd = out_gd.permute(1,0,2,3).reshape(c1, -1) @ col_X.permute(0,2,3,1).reshape(-1, c*kernel_h*kernel_w) # (C', C*Kh*Kw)
            weight_gd = weight_gd.reshape(c1,c,kernel_h,kernel_w)
            # gradient of X
            X_gd = out_gd.permute(0,2,3,1) @ weight.reshape(c1, -1)
            X_gd = X_gd.permute(0,3,1,2).contiguous()
            # col2im
            X_gd = conv2d_col2im_cuda(X, X_gd,
                                    kernel_h, kernel_w,
                                    stride_h, stride_w,
                                    pad_h, pad_w,
                                    dilation_h, dilation_w)
            return X_gd, weight_gd, bias_gd, None, None, None, None
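
    A quick correctness sketch for the GPU path (assumed usage; it requires the compiled extension and a CUDA device) that mirrors the CPU check above:

    if __name__ == "__main__":
        X = torch.randn(2, 3, 8, 8, device="cuda")
        W = torch.randn(4, 3, 3, 3, device="cuda")
        b = torch.randn(1, device="cuda")
        out = Conv2d_function_gpu.apply(X, W, b, (3, 3), (1, 1), (1, 1), (1, 1))
        ref = F.conv2d(X, W, b.expand(4), stride=1, padding=1, dilation=1)
        print(torch.allclose(out, ref, atol=1e-4))  # expected: True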
    
Convolution layer implementation

File conv2d_selfdefine.py

import torch, math
from torch.nn import Module, Parameter, init
from ..functions.function import Conv2d_function_gpu, Conv2d_function_cpu

class Conv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, bias=True):
        super().__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding)
        if isinstance(dilation, int):
            dilation = (dilation, dilation)
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.weight = Parameter(torch.empty((out_channels, in_channels, *kernel_size)))
        self.bias = Parameter(torch.empty((1,))) if bias else None
        self._reset_parameters()

    def _reset_parameters(self):
        init.kaiming_normal_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1. / math.sqrt(fan_in) if fan_in > 0 else 0
            init.uniform_(self.bias, -bound, bound)
    
    def forward(self, X):
        assert X.shape[1] == self.weight.shape[1], f'input channels do not match the weight channels: {X.shape[1]} != {self.weight.shape[1]}'
        assert X.device == self.weight.device, f'input and weight should be on the same device, {X.device} != {self.weight.device}'
        if self.bias is not None:
            assert X.device == self.bias.device, f'input and bias should be on the same device, {X.device} != {self.bias.device}'
        
        if X.is_cuda:
            out = Conv2d_function_gpu.apply(X, self.weight, self.bias, 
                                            self.kernel_size, self.stride, self.padding, self.dilation)
        else:
            out = Conv2d_function_cpu.apply(X, self.weight, self.bias, 
                                            self.kernel_size, self.stride, self.padding, self.dilation)
        return out
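
A quick equivalence sketch (assumed usage, run from outside the package): copy the parameters into a standard torch.nn.Conv2d and compare outputs. The custom layer keeps a single scalar bias shared by all output channels, so the reference bias is expanded:

import torch
from im2col_col2im_selfdef.modules import Conv2d

x = torch.randn(2, 3, 16, 16)
conv = Conv2d(3, 8, 3, 1, 1)
ref = torch.nn.Conv2d(3, 8, 3, stride=1, padding=1)
with torch.no_grad():
    ref.weight.copy_(conv.weight)
    ref.bias.copy_(conv.bias.expand(8))
print(torch.allclose(conv(x), ref(x), atol=1e-5))  # expected: True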

Experiments

  • Hardware and software
Device / Software    Version
CPU                  Intel® Xeon® CPU E5-2695 v2 @ 2.40GHz × 24
GPU                  NVIDIA GeForce RTX 3060 × 1
OS                   Ubuntu 20.04.6 LTS (GNU/Linux 5.15.0-119-generic x86_64)
PyTorch              2.2.1+cu121
CUDA                 12.2
nvcc                 release 12.1, V12.1.105, Build cuda_12.1.r12.1/compiler.32688072_0
  • Directory layout

Directory name: im2col_col2im_selfdef

.
├── functions
│   ├── function.py
│   └── __init__.py
├── modules
│   ├── conv2d_selfdefine.py
│   └── __init__.py
├── setup.py
└── src
    ├── im2col_col2im.cu
    ├── im2col_col2im.cuh
    └── im2col_vision.cpp

3 directories, 8 files
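
The two __init__.py files are not listed above; a minimal version that makes the import in the test script work (an assumption, since the post does not show their contents) could be:

# functions/__init__.py
from .function import Conv2d_function_cpu, Conv2d_function_gpu

# modules/__init__.py
from .conv2d_selfdefine import Conv2d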
  • Test script and results

Test file test_selfdef_conv2d.py

import torch
from im2col_col2im_selfdef.modules import Conv2d

inputs = torch.randn(2,3,56,56)
inputs_gpu = torch.randn(2,3,56,56).cuda()

conv2d = Conv2d(3, 16, 3, 1, 1)
conv2d_gpu = Conv2d(3, 16, 3, 1, 1).cuda()

print('Test for our selfdefine conv2d on cpu')
print('if gradient of weight exists: ', conv2d.weight.grad!=None)
outputs = conv2d(inputs)
loss = outputs.sum()
loss.backward()
print('if gradient of weight exists: ', conv2d.weight.grad!=None, 'sum of it: ', conv2d.weight.grad.sum())

print('Test for our selfdefine conv2d on gpu')
print('if gradient of weight exists: ', conv2d_gpu.weight.grad!=None)
outputs_gpu = conv2d_gpu(inputs_gpu)
loss_gpu = outputs_gpu.sum()
loss_gpu.backward()
print('if gradient of weight exists: ', conv2d_gpu.weight.grad!=None, 'sum of it: ', conv2d_gpu.weight.grad.sum())

Terminal output

root@I1c0b18178e0050191d:~# python test.py
Test for our selfdefine conv2d on cpu
if gradient of weight exists:  False
if gradient of weight exists:  True sum of it:  tensor(-8846.4951)
Test for our selfdefine conv2d on gpu
if gradient of weight exists:  False
if gradient of weight exists:  True sum of it:  tensor(3104.9429, device='cuda:0')

Project repository

The code is available on GitHub: https://github.com/MarcYugo/a-conv2d-example-self-define-build
