使用Pytorch的 C++ (CUDA)扩展实现卷积算子

欲买桂载酒

已于 2024-08-26 16:07:57 修改

阅读量450

点赞数 3

分类专栏：人工神经网络文章标签： pytorch c++ 深度学习神经网络 cnn

于 2024-08-26 12:48:51 首次发布

本文链接：https://blog.csdn.net/weixin_43385826/article/details/141561148

版权

人工神经网络专栏收录该内容

3 篇文章 0 订阅

订阅专栏

文章目录

- 使用Pytorch的 C++ (CUDA)扩展实现卷积算子

使用Pytorch的 C++ (CUDA)扩展实现卷积算子

卷积算子

convop

图1. 卷积算子实现的两个阶段。为了方便画，这里仅考虑一个通道时的情况。

在实现卷积算子时，将卷积算子等价转换为两个阶段：收集（collecting）和线性变换（linear transform）。在收集阶段，“image to column” 和 “column to image” 过程分别在前向传播中将输入的 $X\in \mathbb{R}^{B\times C\times H\times W}$ 转换为 $\in \mathbb{R}^{B\times (C\cdot K_h\cdot K_w)\times H_{out}\times W_{out}}$ 和在反向传播中将传递得到的 $H_{grad} \in \mathbb{R}^{B\times (C\cdot K_h\cdot K_w)\times H_{out}\times W_{out}}$ 逆向转换为 $X_{grad}\in \mathbb{R}^{B\times C\times H\times W}$ 。在线性变换阶段，卷积核权重对前向传播中得到的 $H$ 进行加权，或根据 $H_{grad}$ 得到对应的权重梯度 $W_{grad}$ 。

代码

这里根据设备不同提供两种实现方式，一种是在CPU设备上串行实现，另一种是在GPU设备上并行实现。

CPU 串行版本（Python）

from torch.autograd import Function
# 用于找到对应的数据位置坐标
def data_index_init(offset, clist, *args):
    if not args:
        return offset
    X, *rest = args
    offset = data_index_init(offset, clist, *rest)
    tmp = offset % X
    clist.append(tmp)
    # print(offset, x)
    return offset // X

# image to column
def im2col_cpu(img,  # 图像张量
               kernel_h, kernel_w, # 卷积核大小
               pad_h, pad_w, # 填充
               stride_h, stride_w, # 跨步
               dilation_h, dilation_w ): # 卷积核膨胀系数
    '''
        img: (N, C, H, W)
    '''
    b,c,h,w = img.shape
    out_h = (h - (dilation_h*(kernel_h - 1) + 1) + 2*pad_h)//stride_h + 1
    out_w = (w - (dilation_w*(kernel_w - 1) + 1) + 2*pad_w)//stride_w + 1
    out_c = c*kernel_h*kernel_w

    col = torch.zeros(b, out_c, out_h, out_w).to(img.device)

    for col_c in range(out_c):
        clist = []
        data_index_init(col_c, clist, c, kernel_h, kernel_w)
        c_im, offset_h, offset_w = clist[::-1]
        for col_h in range(out_h):
            h_im = col_h*stride_h - pad_h + offset_h*dilation_h
            for col_w in range(out_w):
                w_im = col_w*stride_w - pad_w + offset_w*dilation_w
                if h_im < h and h_im >=0 and w_im < w and w_im >= 0:
                    col[:, col_c, col_h, col_w] = img[:, c_im, h_im, w_im]
    return col

# column to image
def col2im_cpu(col_gd,
               c, h, w,
               kernel_h, kernel_w,
               pad_h, pad_w,
               stride_h, stride_w,
               dilation_h, dilation_w):
    '''
        col: (N, C', H', W')
    '''
    b,out_c,out_h,out_w = col_gd.shape
    img_gd = torch.zeros(b, c, h, w)
    for col_c in range(out_c):
        clist = []
        data_index_init(col_c, clist, c, kernel_h, kernel_w)
        c_im, offset_h, offset_w = clist[::-1]
        for col_h in range(out_h):
            h_im = col_h*stride_h - pad_h + offset_h*dilation_h
            for col_w in range(out_w):
                w_im = col_w*stride_w - pad_w + offset_w*dilation_w
                if h_im < h and h_im >=0 and w_im < w and w_im >= 0:
                    img_gd[:, c_im, h_im, w_im] += col_gd[:, col_c, col_h, col_w]
    return img_gd

# 前向和反向传播，自动求导
class Conv2d_function_cpu(Function):
    @staticmethod
    def forward(ctx, X, weight, bias, kernel_size, stride, padding, dilation):
        '''
            X: (B,C,H,W)
            weight: (C',C,Kh,Kw)
            bias: (1,)
        '''
        kernel_h, kernel_w = kernel_size
        pad_h, pad_w = padding
        dilation_h, dilation_w = dilation
        stride_h, stride_w = stride
        b,c,h,w = X.shape
        c1,_,_,_ = weight.shape
        ctx.kernel_size = kernel_size
        ctx.padding = padding
        ctx.dilation = dilation
        ctx.stride = stride
        ctx.img_size = (h, w)
        # im2col
        col_X = im2col_cpu(X, kernel_h, kernel_w,
                        pad_h, pad_w, stride_h, stride_w,
                        dilation_h, dilation_w) # (B, C*Kh*Kw, H', W')
        ctx.save_for_backward(col_X, weight, bias)
        bias = bias.reshape(1,1,1,1).repeat(len(X),1,1,1) if bias is not None else bias
        # linear projection
        out = col_X.permute(0,2,3,1) @ weight.permute(1,2,3,0).reshape(-1, c1) # (B, H', W', C*Kh*Kw) x (C*Kh*Kw, C') -> (B, H', W', C')
        out = out.permute(0,3,1,2) # (B, C', H', W')
        # add bias
        out = out + bias if bias is not None else out
        return out
    @staticmethod
    def backward(ctx, out_gd):
        col_X, weight, bias = ctx.saved_tensors
        kernel_h, kernel_w = ctx.kernel_size
        stride_h, stride_w = ctx.stride
        pad_h, pad_w = ctx.padding
        dilation_h, dilation_w = ctx.dilation
        h, w = ctx.img_size
        c1,c,kernel_h,kernel_w = weight.shape
        # gradient of bias
        bias_gd = out_gd.sum(dim=(1,2,3)) if bias is not None else None
        bias_gd = torch.mean(bias_gd, dim=(0,), keepdim=True)
        # gradient of weight
        weight_gd = out_gd.permute(1,0,2,3).reshape(c1, -1) @ col_X.permute(0,2,3,1).reshape(-1, c*kernel_h*kernel_w) # (C', C*Kh*Kw)
        weight_gd = weight_gd.reshape(c1,c,kernel_h,kernel_w)
        # gradient of X
        X_gd = out_gd.permute(0,2,3,1) @ weight.reshape(c1, -1) # (B, H', W', C') x (C', C*Kh*Kw) -> (B, H', W', C*Kh*Kw)
        X_gd = X_gd.permute(0,3,1,2)
        # col2im
        X_gd = col2im_cpu(X_gd, c, h, w,
                          kernel_h, kernel_w,
                          pad_h, pad_w, 
                          stride_h, stride_w,
                          dilation_h, dilation_w)
        return X_gd, weight_gd, bias_gd, None, None, None, None

GPU 并行版本（Python,C++,CUDA混合编程）

CUDA 代码

头文件 im2col_col2im.cuh

#include <algorithm>
#include <cstdio>
#include <cstring>

#include <ATen/ATen.h>
#include <ATen/OpMathType.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>

#define CUDA_KERNEL_LOOP(i, n)                                                 \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);               \
        i += blockDim.x * gridDim.x)

inline int GET_BLOCKS(const int N, const int num_threads) {
    return (N + num_threads - 1) / num_threads;
}

#define opmath_t at::opmath_type<scalar_t>

template <typename scalar_t>
__global__ void im2col_kernel(const int64_t n,
                                const scalar_t* img, scalar_t* col, 
                                const int64_t h, const int64_t w,
                                const int64_t kernel_h, const int64_t kernel_w,
                                const int64_t pad_h, const int64_t pad_w,
                                const int64_t stride_h, const int64_t stride_w,
                                const int64_t dilation_h, const int64_t dilation_w,
                                const int64_t col_h, const int64_t col_w
){
    CUDA_KERNEL_LOOP(index, n){
        int64_t w_out = index % col_w;
        int64_t h_out = (index / col_w) % col_h;
        int64_t c_in = (index / col_w) / col_h;
        int64_t c_out = c_in * kernel_h * kernel_w;
        int64_t h_in = h_out * stride_h - pad_h;
        int64_t w_in = w_out * stride_w - pad_w;

        scalar_t* col_idx = col + (c_out * col_h + h_out) * col_w + w_out;
        const scalar_t* img_idx = img + (c_in * h + h_in) * w + w_in;

        for(int64_t i=0;i<kernel_h;i++){
            int64_t h_k = h_in + i * dilation_h;
            for(int64_t j=0;j<kernel_w;j++){
                int64_t w_k = w_in + j * dilation_w;
                if(h_k >= 0 && h_k < h && w_k >= 0 && w_k < w)
                    *col_idx = img_idx[i*dilation_h*w + j*dilation_w];
                col += col_h * col_w;
            }
        }
    }
}

template <typename scalar_t>
void im2col(
    cudaStream_t stream,
    const scalar_t* img, scalar_t* col,
    const int64_t c, const int64_t h, const int64_t w,
    const int64_t col_h, const int64_t col_w,
    const int64_t kernel_h, const int64_t kernel_w,
    const int64_t pad_h, const int64_t pad_w,
    const int64_t stride_h, const int64_t stride_w,
    const int64_t dilation_h, const int64_t dilation_w
){
    int64_t num_kernels = c * h * w;
    im2col_kernel<scalar_t>
    <<<GET_BLOCKS(num_kernels, 1024), 1024, 0, stream>>>(
        num_kernels,
        img, col,
        h, w,
        kernel_h, kernel_w,
        pad_h, pad_w,
        stride_h, stride_w,
        dilation_h, dilation_w,
        col_h, col_w );
}

template <typename scalar_t>
__global__ void col2im_kernel(const int64_t n,
                                const scalar_t* col, scalar_t* img, 
                                const int64_t h, const int64_t w,
                                const int64_t kernel_h, const int64_t kernel_w,
                                const int64_t pad_h, const int64_t pad_w,
                                const int64_t stride_h, const int64_t stride_w,
                                const int64_t dilation_h, const int64_t dilation_w,
                                const int64_t col_h, const int64_t col_w
){
    CUDA_KERNEL_LOOP(index, n){
        const int64_t w_im = index % w + pad_w;
        const int64_t h_im = (index / w) % h + pad_h;
        const int64_t c_im = index / (w * h);
        int64_t dilat_k_h = (kernel_h - 1)* dilation_h + 1;
        int64_t dilat_k_w = (kernel_w - 1)* dilation_w + 1;
        const int64_t w_col_s = (w_im < dilat_k_w) ? 0 : (w_im - dilat_k_w) / stride_w + 1;
        const int64_t w_col_e = ::min(w_im/stride_w + 1, col_w);
        const int64_t h_col_s = (h_im < stride_h) ? 0 : (h_im - dilat_k_h) / stride_h + 1;
        const int64_t h_col_e = ::min(h_im / stride_h + 1, col_h);

        scalar_t val = static_cast<scalar_t>(0);

        for(int64_t h_col = h_col_s; h_col < h_col_e; h_col += 1){
            int64_t h_k = (h_im - h_col * stride_h);
            for(int64_t w_col = w_col_s; w_col < w_col_e; w_col += 1){
                int64_t w_k = (w_im - w_col * stride_w);
                if(h_k % dilation_h == 0 && w_k % dilation_w == 0){
                    h_k /= dilation_h;
                    w_k /= dilation_w;
                    int64_t data_col_index = ((((c_im * kernel_h + h_k)* kernel_w + w_k)* col_h + h_col)* col_w + w_col);
                    val += col[data_col_index];
                }
            }
        }
        img[index] = val;
    }
}

template <typename scalar_t>
void col2im(
    cudaStream_t stream,
    const scalar_t* col, scalar_t* img,
    const int64_t c, const int64_t h, const int64_t w,
    const int64_t col_h, const int64_t col_w,
    const int64_t kernel_h, const int64_t kernel_w,
    const int64_t pad_h, const int64_t pad_w,
    const int64_t stride_h, const int64_t stride_w,
    const int64_t dilation_h, const int64_t dilation_w
){
    int64_t num_kernels = c * h * w;
    col2im_kernel<scalar_t>
    <<<GET_BLOCKS(num_kernels, 1024), 1024, 0, stream>>>(
        num_kernels,
        col, img,
        h, w,
        kernel_h, kernel_w,
        pad_h, pad_w,
        stride_h, stride_w,
        dilation_h, dilation_w,
        col_h, col_w );
}

主体文件 im2col_col2im.cu

#include "im2col_col2im.cuh"

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/torch.h>

at::Tensor conv2d_im2col(const at::Tensor &input,
                        const int kernel_h, const int kernel_w,
                        const int stride_h, const int stride_w,
                        const int pad_h, const int pad_w,
                        const int dilation_h, const int dilation_w
){
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    const int b = input.size(0);
    const int c = input.size(1);
    const int h = input.size(2);
    const int w = input.size(3);
    const int col_h = (h - (dilation_h*(kernel_h - 1) + 1) + 2*pad_h)/stride_h + 1;
    const int col_w = (w - (dilation_w*(kernel_w - 1) + 1) + 2*pad_w)/stride_w + 1;
    const int col_c = c * kernel_h * kernel_w;
    auto output = at::zeros({b, col_c, col_h, col_w}, input.options());
    for(int n = 0; n < b; n++){
        auto slice_input = input.select(0, n);
        auto slice_output = output.select(0, n);
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "conv2d_im2col_selfdefine_forward", ([&]{
                im2col(
                    at::cuda::getCurrentCUDAStream(),
                    slice_input.data<scalar_t>(), slice_output.data<scalar_t>(),
                    c, h, w,
                    col_h, col_w,
                    kernel_h, kernel_w,
                    pad_h, pad_w,
                    stride_h, stride_w,
                    dilation_h, dilation_w );
            }));
    }
    return output;
}

at::Tensor conv2d_col2im(
    const at::Tensor &input, const at::Tensor &grad_output,
    const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w,
    const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w
){
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(grad_output.is_contiguous(), "gradient of output tensor has to be contiguous");
    AT_ASSERTM(grad_output.type().is_cuda(), "gradient of output tensor must be a CUDA tensor");
    
    const int b = input.size(0);
    const int c = input.size(1);
    const int h = input.size(2);
    const int w = input.size(3);
    const int col_h = grad_output.size(2);
    const int col_w = grad_output.size(3);
    const int col_c = grad_output.size(1);

    auto grad_input = at::zeros_like(input, input.dtype());

    for(int n = 0; n < b; n ++){
        auto slice_grad_input = grad_input.select(0, n);
        auto slice_grad_output = grad_output.select(0, n);
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "conv2d_col2im_selfdefine_backward", ([&]{
                col2im(
                    at::cuda::getCurrentCUDAStream(),
                    slice_grad_output.data<scalar_t>(), slice_grad_input.data<scalar_t>(),
                    c, h, w,
                    col_h, col_w,
                    kernel_h, kernel_w,
                    pad_h, pad_w,
                    stride_h, stride_w,
                    dilation_h, dilation_w );
            }));
    }

    return grad_input;
}

C++代码，使用Pybind将CUDA代码和Python解释器绑定

文件 im2col_vision.cpp

#include <torch/extension.h>

at::Tensor conv2d_im2col(const at::Tensor &input,
                        const int kernel_h, const int kernel_w,
                        const int stride_h, const int stride_w,
                        const int pad_h, const int pad_w,
                        const int dilation_h, const int dilation_w
);

at::Tensor conv2d_col2im(
    const at::Tensor &input, const at::Tensor &grad_output,
    const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w,
    const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w
);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("conv2d_im2col_cuda", &conv2d_im2col, "im2col_cuda");
    m.def("conv2d_col2im_cuda", &conv2d_col2im, "col2im_cuda");
}

打包编译文件

文件 setup.py，在项目目录中使用命令 python setup.py install 将其编译安装到python中。

import os
import glob

import torch

from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension

from setuptools import find_packages
from setuptools import setup

requirements = ["torch", "torchvision"]


def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"))

    sources = main_file
    extension = CppExtension
    extra_compile_args = {"cxx": []}
    define_macros = []

    if torch.cuda.is_available() and CUDA_HOME is not None:
        extension = CUDAExtension
        sources += source_cuda
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            # "-DCUDA_HAS_FP16=1",
            # "-D__CUDA_NO_HALF_OPERATORS__",
            # "-D__CUDA_NO_HALF_CONVERSIONS__",
            # "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
    else:
        raise NotImplementedError('Cuda is not availabel')

    sources = [os.path.join(extensions_dir, s) for s in sources]
    include_dirs = [extensions_dir]
    ext_modules = [
        extension(
            "conv2d_im2col_col2im_selfdefine",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]
    return ext_modules

setup(
    name="conv2d_im2col_col2im_selfdefine",
    version="1.0",
    author="Mark Yu (Zewen Yu)",
    url=None,
    description=
    "PyTorch Wrapper for CUDA Functions of Core Operations Im2col and Col2im",
    packages=find_packages(exclude=(
        "configs",
        "tests",
    )),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)

自动求导

文件 funciton.py

import torch
import torch.nn.functional as F
from torch.autograd import Function

from conv2d_im2col_col2im_selfdefine import conv2d_im2col_cuda, conv2d_col2im_cuda

class Conv2d_function_gpu(Function):
    @staticmethod
    def forward(ctx, X, weight, bias, kernel_size, stride, padding, dilation):
        '''
            X: (B,C,H,W)
            weight: (C',C,Kh,Kw)
            bias: (1,)
        '''
        kernel_h, kernel_w = kernel_size
        pad_h, pad_w = padding
        dilation_h, dilation_w = dilation
        stride_h, stride_w = stride
        b,c,h,w = X.shape
        c1,_,_,_ = weight.shape
        ctx.kernel_size = kernel_size
        ctx.padding = padding
        ctx.dilation = dilation
        ctx.stride = stride
        ctx.img_size = (h, w)
        # im2col
        col_X = conv2d_im2col_cuda(X, 
                                kernel_h, kernel_w,
                                stride_h, stride_w,
                                pad_h, pad_w,
                                dilation_h, dilation_w)
        ctx.save_for_backward(X, col_X, weight, bias)
        bias = bias.reshape(1,1,1,1).repeat(b,1,1,1) if bias is not None else None
        # linear projection
        out = col_X.permute(0,2,3,1) @ weight.permute(1,2,3,0).reshape(-1, c1)
        out = out.permute(0,3,1,2)
        # add bias
        out = out + bias if bias is not None else out
        return out
    
    @staticmethod
    def backward(ctx, out_gd):
        X, col_X, weight, bias = ctx.saved_tensors
        kernel_h, kernel_w = ctx.kernel_size
        stride_h, stride_w = ctx.stride
        pad_h, pad_w = ctx.padding
        dilation_h, dilation_w = ctx.dilation
        h, w = ctx.img_size
        c1,c,kernel_h,kernel_w = weight.shape
        # gradient of bias
        bias_gd = out_gd.sum(dim=(1,2,3)) if bias is not None else None
        bias_gd = torch.mean(bias_gd, dim=(0,), keepdim=True)
        # gradient of weight
        weight_gd = out_gd.permute(1,0,2,3).reshape(c1, -1) @ col_X.permute(0,2,3,1).reshape(-1, c*kernel_h*kernel_w) # (C', C*Kh*Kw)
        weight_gd = weight_gd.reshape(c1,c,kernel_h,kernel_w)
        # gradient of X
        X_gd = out_gd.permute(0,2,3,1) @ weight.reshape(c1, -1)
        X_gd = X_gd.permute(0,3,1,2).contiguous()
        # col2im
        X_gd = conv2d_col2im_cuda(X, X_gd,
                                kernel_h, kernel_w,
                                stride_h, stride_w,
                                pad_h, pad_w,
                                dilation_h, dilation_w)
        return X_gd, weight_gd, bias_gd, None, None, None, None

卷积层实现

文件 conv2d_selfdefine.py

import torch, math
from torch.nn import Module, Parameter, init
from ..functions.function import Conv2d_function_gpu, Conv2d_function_cpu

class Conv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, bias=True):
        super().__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        if isinstance(stride, int):
            stride = (stride, stride)
        if isinstance(padding, int):
            padding = (padding, padding)
        if isinstance(dilation, int):
            dilation = (dilation, dilation)
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.weight = Parameter(torch.empty((out_channels, in_channels, *kernel_size)))
        self.bias = Parameter(torch.empty((1,))) if bias else None
        self._reset_parameters()

    def _reset_parameters(self):
        init.kaiming_normal_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1. / math.sqrt(fan_in) if fan_in > 0 else 0
            init.uniform_(self.bias, -bound, bound)
    
    def forward(self, X):
        assert X.shape[1] == self.weight.shape[1], f'inputs channels is not equal to the channels of weight: {X.shape[1]} != {self.weight.shape[1]}'
        assert X.device == self.weight.device, f'inputs and weight should be on same device, {X.device} != {self.weight.device}'
        if self.bias is not None:
            assert X.device == self.bias.device, f'inputs and bias should be on same device, {X.device} != {self.bias.device}' 
        
        if X.is_cuda:
            out = Conv2d_function_gpu.apply(X, self.weight, self.bias, 
                                            self.kernel_size, self.stride, self.padding, self.dilation)
        else:
            out = Conv2d_function_cpu.apply(X, self.weight, self.bias, 
                                            self.kernel_size, self.stride, self.padding, self.dilation)
        return out

实验

设备和软件信息

设备或软件	版本
CPU	Intel® Xeon® CPU E5-2695 v2 @ 2.40GHz, 24
GPU	NVIDIA GeForce RTX 3060, 1
系统	Ubuntu 20.04.6 LTS (GNU/Linux 5.15.0-119-generic x86_64)
Pytorch	2.2.1+cu121
CUDA	12.2
nvcc	release 12.1, V12.1.105 Build cuda_12.1.r12.1/compiler.32688072_0

目录名称 im2col_col2im_selfdef

.
├── functions
│   ├── function.py
│   └── __init__.py
├── modules
│   ├── conv2d_selfdefine.py
│   └── __init__.py
├── setup.py
└── src
    ├── im2col_col2im.cu
    ├── im2col_col2im.cuh
    └── im2col_vision.cpp

3 directories, 8 files

测试文件和结果

测试文件 test_selfdef_conv2d.py

import torch
from im2col_col2im_selfdef.modules import Conv2d

inputs = torch.randn(2,3,56,56)
inputs_gpu = torch.randn(2,3,56,56).cuda()

conv2d = Conv2d(3, 16, 3, 1, 1)
conv2d_gpu = Conv2d(3, 16, 3, 1, 1).cuda()

print('Test for our selfdefine conv2d on cpu')
print('if gradient of weight exists: ', conv2d.weight.grad!=None)
outputs = conv2d(inputs)
loss = outputs.sum()
loss.backward()
print('if gradient of weight exists: ', conv2d.weight.grad!=None, 'sum of it: ', conv2d.weight.grad.sum())

print('Test for our selfdefine conv2d on gpu')
print('if gradient of weight exists: ', conv2d_gpu.weight.grad!=None)
outputs_gpu = conv2d_gpu(inputs_gpu)
loss_gpu = outputs_gpu.sum()
loss_gpu.backward()
print('if gradient of weight exists: ', conv2d_gpu.weight.grad!=None, 'sum of it: ', conv2d_gpu.weight.grad.sum())

终端输出

root@I1c0b18178e0050191d:~# python test.py
Test for our selfdefine conv2d on cpu
if gradient of weight exists:  False
if gradient of weight exists:  True sum of it:  tensor(-8846.4951)
Test for our selfdefine conv2d on gpu
if gradient of weight exists:  False
if gradient of weight exists:  True sum of it:  tensor(3104.9429, device='cuda:0')