Why write a custom CUDA op — two situations where it pays off:
1. Non-parallel computation: operations PyTorch cannot parallelize well on its own.
2. Lots of sequential computation: x = f1(x), x = f2(x), ..., x = fn(x) — the gain only becomes obvious when there are many layers.
CUDA lets us fuse the chain of functions into a single kernel, so only one pass over the data is needed.
Call chain: PyTorch --> C++ (the bridge) --> CUDA runs the parallel computation and passes the result back.
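A minimal sketch of what fusion buys, using torch.jit.script as a stand-in for a hand-written fused kernel (f1/f2/f3 are made-up placeholder functions, not from the tutorial):
import torch

def f1(x): return x * 2          # in eager mode, each op launches its own CUDA kernel
def f2(x): return x + 1
def f3(x): return torch.relu(x)

@torch.jit.script
def fused(x):
    # TorchScript can fuse these pointwise ops into a single kernel,
    # reading x from global memory once instead of three times
    return torch.relu(x * 2 + 1)

x = torch.randn(1_000_000, device='cuda')
assert torch.allclose(f3(f2(f1(x))), fused(x))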
Running example: trilinear interpolation.
Create interpolation.cpp. Its first line, #include <torch/extension.h> (which brings the tensor types into C++), shows an include error in VS Code.
Fix: press Ctrl+Shift+P to open the command palette, find "C/C++: Edit Configurations (JSON)",
and add the Python and torch headers to includePath:
"includePath": [
    "${workspaceFolder}/**",
    "/home/lys/miniconda3/envs/mmrotate/include/python3.8",
    "/home/lys/miniconda3/envs/mmrotate/lib/python3.8/site-packages/torch/include",
    "/home/lys/miniconda3/envs/mmrotate/lib/python3.8/site-packages/torch/include/torch/csrc/api/include"
],
Then press Ctrl+Shift+P again and run "Python: Select Interpreter" to pick the conda environment.
interpolation.cpp:
#include <torch/extension.h>  // brings the tensor types in; C++ needs to know about them
// the function that will later dispatch to CUDA
torch::Tensor trilinear_interpolation(
    torch::Tensor feats,  // features at the 8 cube vertices
    torch::Tensor point
) {
    return feats;  // placeholder for now
}
// the bridge that lets Python call the C++ function
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // first argument: the name seen from Python; second: the C++ function
    m.def("trilinear_interpolation", &trilinear_interpolation);
}
To build the C++ file, create setup.py:
from setuptools import setup
from torch.utils.cpp_extension import CppExtension, BuildExtension

setup(
    name='cppcuda_tutorial',
    version='1.0',
    author='lys',
    author_email='2322349003@qq.com',
    description='cppcuda example',
    ext_modules=[  # where the code to build lives
        CppExtension(
            name='cppcuda_tutorial',
            sources=['interpolation.cpp'])  # the source files
    ],
    cmdclass={
        'build_ext': BuildExtension  # tells setuptools to build the extension
    }
)
Check which pip is active with pip --version, then
pip install .  # installs the C++ extension from the current directory
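For faster iteration during development, torch.utils.cpp_extension.load can compile and import the extension just-in-time instead of going through setup.py each time (a side note, not a step in the tutorial; the module name is arbitrary):
from torch.utils.cpp_extension import load

cppcuda_tutorial = load(
    name='cppcuda_tutorial',        # arbitrary module name
    sources=['interpolation.cpp'],  # the same sources setup.py would build
    verbose=True)                   # print the compiler commands while building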
Create test.py to check that the build and the code run correctly:
import torch  # torch must be imported before the custom C++ extension
import cppcuda_tutorial

feats = torch.ones(2)
point = torch.zeros(2)
out = cppcuda_tutorial.trilinear_interpolation(feats, point)
print(out)
Output:
(mmrotate) (base) lys@lys:~/pytorchcppcuda$ python test.py
tensor([1., 1.])
How CUDA parallelism is organized — blocks are what let one kernel scale out to the GPU's full compute resources:
the CPU launches a kernel --> a Grid on the GPU --> n Blocks (up to 2^31-1 along x, 65535 along y/z) --> each block subdivides into n Threads (up to 1024 per block).
The interpolation formula
Bilinear interpolation (the 2D case), with local coordinates u, v:
f = u v f4 + (1-u) v f3 + u (1-v) f2 + (1-u)(1-v) f1
Trilinear interpolation adds a third coordinate w. With weights a = (1-v)(1-w), b = (1-v)w, c = v(1-w), d = vw:
f = (1-u)(a f1 + b f2 + c f3 + d f4) + u(a f5 + b f6 + c f7 + d f8)
feats: (N, 8, F) — N cubes, 8 vertices each, F features per vertex
points: (N, 3) — N query points in 3D
How to parallelize (a grid-math sketch follows this list):
1. in parallel over N (one cube per thread along x)
2. in parallel over F (one feature channel per thread along y)
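As a quick sanity check on the launch geometry used later, here is the ceil-division grid math in plain Python (sizes chosen to match the benchmark further down):
N, F = 65536, 256
tx, ty = 16, 16                                    # threads per block: 16*16 = 256
blocks = ((N + tx - 1) // tx, (F + ty - 1) // ty)  # ceil division along each dimension
print(blocks)  # (4096, 16): every (n, f) pair gets exactly one thread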
Create interpolation_kernel.cu — the CUDA side that the C++ code will call:
// the parallel computation lives here
#include <torch/extension.h>

torch::Tensor trilinear_fw_cu(  // forward pass, CUDA side
    torch::Tensor feats,  // features at the 8 cube vertices
    torch::Tensor points
) {
    return feats;  // placeholder for now
}
Create an include folder holding utils.h for the shared declarations: the function header plus the CHECK_INPUT "magic lines" that verify a tensor is usable:
#include <torch/extension.h>
// check the tensor lives on the GPU
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
// check the tensor is contiguous in memory once flattened
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

torch::Tensor trilinear_fw_cu(  // forward pass, CUDA side
    torch::Tensor feats,  // features at the 8 cube vertices
    torch::Tensor points
);
In the C++ file, #include "utils.h", check that each input is a CUDA tensor and contiguous, then hand off to the CUDA kernel:
#include <torch/extension.h>  // brings the tensor types in; C++ needs to know about them
#include "utils.h"

// the function that dispatches to CUDA
torch::Tensor trilinear_interpolation(
    torch::Tensor feats,  // features at the 8 cube vertices
    torch::Tensor points
) {
    CHECK_INPUT(feats);
    CHECK_INPUT(points);
    return trilinear_fw_cu(feats, points);
}

// the bridge that lets Python call the C++ function
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // first argument: the name seen from Python; second: the C++ function
    m.def("trilinear_interpolation", &trilinear_interpolation);
}
Modify setup.py:
import glob
import os.path as osp
from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension, BuildExtension  # switch to the CUDA extension

ROOT_DIR = osp.dirname(osp.abspath(__file__))
include_dirs = [osp.join(ROOT_DIR, "include")]
sources = glob.glob('*.cpp') + glob.glob('*.cu')  # glob gathers every C++ and CUDA file into one list

setup(
    name='cppcuda_tutorial',
    version='1.0',
    author='lys',
    author_email='2322349003@qq.com',
    description='cppcuda_tutorial',
    long_description='cppcuda_tutorial',
    ext_modules=[  # where the code to build lives
        CUDAExtension(
            name='cppcuda_tutorial',
            sources=sources,            # the source files
            include_dirs=include_dirs,  # where to find the headers
            extra_compile_args={'cxx': ['-O2'],
                                'nvcc': ['-O2']}  # -O2 optimization for both compilers
        )
    ],
    cmdclass={
        'build_ext': BuildExtension  # tells setuptools to build the extension
    }
)
pip install .  # rebuild
Modify test.py:
import torch  # torch must be imported before the custom C++ extension
import cppcuda_tutorial
import time

if __name__ == '__main__':
    feats = torch.ones(2)
    points = torch.zeros(2)
    out = cppcuda_tutorial.trilinear_interpolation(feats, points)
    print(out)
This fails:
RuntimeError: feats must be a CUDA tensor
Fix by creating the tensors on the GPU:
feats = torch.ones(2, device='cuda')
points = torch.zeros(2, device='cuda')
Now it runs:
tensor([1., 1.], device='cuda:0')
At this point the trilinear entry point has been moved from C++ onto CUDA.
Writing the CUDA kernel
First set up the sizes and the output tensor:
const int N = feats.size(0), F = feats.size(2);  // number of cubes and feature dimension
// Python equivalent: feat_interp = torch.zeros(N, F, dtype=torch.float32, device='cuda:0')
torch::Tensor feat_interp = torch::zeros({N, F}, feats.options());  // output with the same dtype and device as feats
// torch::zeros({N, F}, torch::dtype(torch::kInt32).device(feats.device()));  // variant for when the output dtype differs: int32 on a chosen device
Define threads first (at most three dimensions), then compute blocks from the problem size with ceil division (the lecture's small example works out to blocks(2, 1)):
// launch configuration for the parallel run
// we only need to decide how many threads; at most three parallel dimensions
const dim3 threads(16, 16); // 256 threads per block
const dim3 blocks((N + threads.x - 1) / threads.x, (F + threads.y - 1) / threads.y);  // ceil division gives the block count
// AT_DISPATCH_INTEGRAL_TYPES       // for integer computation
// AT_DISPATCH_FLOATING_TYPES_HALF  // to add half precision for speed
// launch the kernel; this is boilerplate. FLOATING_TYPES covers float32 and float64.
// argument 1: the dtype the output follows; argument 2: the name shown in error messages
AT_DISPATCH_FLOATING_TYPES(feats.type(), "trilinear_fw_cu",
([&] {
    // the actual trilinear interpolation: read the inputs, write the output; scalar_t is the dispatched dtype
    trilinear_fw_kernel<scalar_t><<<blocks, threads>>>(
        // scalar_t: element type (a concrete type like float also works); 3: tensor rank;
        // torch::RestrictPtrTraits: the pointers don't alias each other; size_t: index type
        feats.packed_accessor<scalar_t, 3, torch::RestrictPtrTraits, size_t>(),       // input
        points.packed_accessor<scalar_t, 2, torch::RestrictPtrTraits, size_t>(),      // input
        feat_interp.packed_accessor<scalar_t, 2, torch::RestrictPtrTraits, size_t>()  // output
    );
}));
return feat_interp;
Now write the kernel itself:
template <typename scalar_t>
// __global__: launched from the CPU, executed on the GPU
// (__host__ runs on the CPU; __device__ runs on the GPU)
__global__ void trilinear_fw_kernel(
    const torch::PackedTensorAccessor<scalar_t, 3, torch::RestrictPtrTraits, size_t> feats,
    const torch::PackedTensorAccessor<scalar_t, 2, torch::RestrictPtrTraits, size_t> points,
    torch::PackedTensorAccessor<scalar_t, 2, torch::RestrictPtrTraits, size_t> feat_interp
) {
    // standard formula for this thread's global index
    const int n = blockIdx.x * blockDim.x + threadIdx.x;
    const int f = blockIdx.y * blockDim.y + threadIdx.y;
    // drop the surplus threads that fall outside the data
    if (n >= feats.size(0) || f >= feats.size(2)) return;
    // points lie in [-1, 1]; normalize to [0, 1]
    const scalar_t u = (points[n][0] + 1) / 2;
    const scalar_t v = (points[n][1] + 1) / 2;
    const scalar_t w = (points[n][2] + 1) / 2;
    // interpolation weights
    const scalar_t a = (1-v)*(1-w);
    const scalar_t b = (1-v)*w;
    const scalar_t c = v*(1-w);
    const scalar_t d = 1 - a - b - c;  // = v*w
    // write this thread's result into its own output cell
    feat_interp[n][f] = (1-u)*(a*feats[n][0][f] +
                               b*feats[n][1][f] +
                               c*feats[n][2][f] +
                               d*feats[n][3][f]) +
                            u*(a*feats[n][4][f] +
                               b*feats[n][5][f] +
                               c*feats[n][6][f] +
                               d*feats[n][7][f]);
}
Rebuild (pip install .) after every change.
Calling the CUDA op from Python and comparing against a pure-PyTorch version:
# pure-PyTorch reference implementation of trilinear interpolation
def trilinear_interpolation_py(feats, points):
    """
    Inputs:
        feats: (N, 8, F)
        points: (N, 3) local coordinates in [-1, 1]
    Outputs:
        feats_interp: (N, F)
    """
    u = (points[:, 0:1]+1)/2
    v = (points[:, 1:2]+1)/2
    w = (points[:, 2:3]+1)/2
    a = (1-v)*(1-w)
    b = (1-v)*w
    c = v*(1-w)
    d = 1-a-b-c
    feats_interp = (1-u)*(a*feats[:, 0] +
                          b*feats[:, 1] +
                          c*feats[:, 2] +
                          d*feats[:, 3]) + \
                       u*(a*feats[:, 4] +
                          b*feats[:, 5] +
                          c*feats[:, 6] +
                          d*feats[:, 7])
    return feats_interp
if __name__ == '__main__':
    N = 65536; F = 256
    feats = torch.rand(N, 8, F, device='cuda')
    points = torch.rand(N, 3, device='cuda')*2 - 1
    # loop several times to get a feel for the average
    for _ in range(100):
        t = time.time()
        out_cuda = cppcuda_tutorial.trilinear_interpolation(feats, points)
        torch.cuda.synchronize()  # wait for the CUDA kernels to finish before reading the clock
        print('cuda time', time.time() - t)

        t = time.time()
        out_py = trilinear_interpolation_py(feats, points)
        torch.cuda.synchronize()
        print('pytorch time', time.time() - t)

    print(torch.allclose(out_py, out_cuda))  # allclose: results match within tolerance
cuda time 0.004991054534912109
pytorch time 0.010361909866333008
True
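time.time() around a kernel only gives meaningful numbers together with torch.cuda.synchronize(); CUDA events measure GPU time directly and sidestep that. A sketch of the same measurement (reusing the feats/points from the benchmark above):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
out_cuda = cppcuda_tutorial.trilinear_interpolation(feats, points)
end.record()
torch.cuda.synchronize()               # make sure both events have completed
print(start.elapsed_time(end), 'ms')   # elapsed_time reports milliseconds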
In deep learning, forward maps the inputs (feats, points) to the output.
Training also needs backward — propagating the loss gradient back to the inputs — and a raw CUDA extension does not provide this by itself:
if __name__ == '__main__':
    N = 65536; F = 256
    feats = torch.rand(N, 8, F, device='cuda').requires_grad_()
    points = torch.rand(N, 3, device='cuda')*2 - 1
    out_py = trilinear_interpolation_py(feats, points)
    print(out_py.requires_grad)
CUDA backward pass — start from a scalar loss, then:
1. compute the partial derivative of every output w.r.t. every trainable input;
the upstream gradient dL/d(feat_interp), the loss differentiated w.r.t. the output, is given.
Since feat_interp[n][f] = sum_k w_k(u,v,w) * feats[n][k][f] is linear in feats, the chain rule gives dL/dfeats[n][k][f] = w_k * dL/dfeat_interp[n][f], which is exactly what the kernel below writes out.
Add the backward function to the .cu file:
torch::Tensor trilinear_bw_cu(
    const torch::Tensor dL_dfeat_interp,
    const torch::Tensor feats,
    const torch::Tensor points
) {
    const int N = feats.size(0), F = feats.size(2);
    torch::Tensor dL_dfeats = torch::empty({N, 8, F}, feats.options());
    const dim3 threads(16, 16);
    const dim3 blocks((N+threads.x-1)/threads.x, (F+threads.y-1)/threads.y);
    AT_DISPATCH_FLOATING_TYPES(feats.type(), "trilinear_bw_cu",
    ([&] {
        trilinear_bw_kernel<scalar_t><<<blocks, threads>>>(
            dL_dfeat_interp.packed_accessor<scalar_t, 2, torch::RestrictPtrTraits, size_t>(),
            feats.packed_accessor<scalar_t, 3, torch::RestrictPtrTraits, size_t>(),
            points.packed_accessor<scalar_t, 2, torch::RestrictPtrTraits, size_t>(),
            dL_dfeats.packed_accessor<scalar_t, 3, torch::RestrictPtrTraits, size_t>()
        );
    }));
    return dL_dfeats;
}
Add the matching kernel, replacing the interpolation formula with the gradient formulas:
template <typename scalar_t>
__global__ void trilinear_bw_kernel(
    const torch::PackedTensorAccessor<scalar_t, 2, torch::RestrictPtrTraits, size_t> dL_dfeat_interp,
    const torch::PackedTensorAccessor<scalar_t, 3, torch::RestrictPtrTraits, size_t> feats,
    const torch::PackedTensorAccessor<scalar_t, 2, torch::RestrictPtrTraits, size_t> points,
    torch::PackedTensorAccessor<scalar_t, 3, torch::RestrictPtrTraits, size_t> dL_dfeats
) {
    const int n = blockIdx.x * blockDim.x + threadIdx.x;
    const int f = blockIdx.y * blockDim.y + threadIdx.y;
    if (n >= feats.size(0) || f >= feats.size(2)) return;
    // points lie in [-1, 1]; normalize to [0, 1]
    const scalar_t u = (points[n][0]+1)/2;
    const scalar_t v = (points[n][1]+1)/2;
    const scalar_t w = (points[n][2]+1)/2;
    const scalar_t a = (1-v)*(1-w);
    const scalar_t b = (1-v)*w;
    const scalar_t c = v*(1-w);
    const scalar_t d = 1-a-b-c;
    // d(feat_interp)/d(feats[n][k][f]) is just vertex k's interpolation weight
    dL_dfeats[n][0][f] = (1-u)*a*dL_dfeat_interp[n][f];
    dL_dfeats[n][1][f] = (1-u)*b*dL_dfeat_interp[n][f];
    dL_dfeats[n][2][f] = (1-u)*c*dL_dfeat_interp[n][f];
    dL_dfeats[n][3][f] = (1-u)*d*dL_dfeat_interp[n][f];
    dL_dfeats[n][4][f] = u*a*dL_dfeat_interp[n][f];
    dL_dfeats[n][5][f] = u*b*dL_dfeat_interp[n][f];
    dL_dfeats[n][6][f] = u*c*dL_dfeat_interp[n][f];
    dL_dfeats[n][7][f] = u*d*dL_dfeat_interp[n][f];
}
With the .cu side done, define a trilinear_interpolation_bw wrapper in the C++ file that checks its inputs and returns trilinear_bw_cu. Rename the forward wrapper to trilinear_interpolation_fw to match, and expose both through the pybind11 bridge:
torch::Tensor trilinear_interpolation_bw(
    const torch::Tensor dL_dfeat_interp,
    const torch::Tensor feats,
    const torch::Tensor points
) {
    CHECK_INPUT(dL_dfeat_interp);
    CHECK_INPUT(feats);
    CHECK_INPUT(points);
    return trilinear_bw_cu(dL_dfeat_interp, feats, points);
}

// the bridge that lets Python call the C++ functions
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // first argument: the name seen from Python; second: the C++ function
    m.def("trilinear_interpolation_fw", &trilinear_interpolation_fw);
    m.def("trilinear_interpolation_bw", &trilinear_interpolation_bw);
}
After the C++ additions, declare trilinear_bw_cu in include/utils.h, then pip install . to rebuild:
torch::Tensor trilinear_bw_cu(
    const torch::Tensor dL_dfeat_interp,
    const torch::Tensor feats,
    const torch::Tensor points
);
2. Wrap fw and bw with torch.autograd.Function.
Now that there are separate fw and bw entry points, the extension can no longer be called directly from training code; the pair has to be wrapped in a torch.autograd.Function:
class Trilinear_interpolation_cuda(torch.autograd.Function):
    @staticmethod
    # ctx stores whatever the backward pass will need
    def forward(ctx, feats, points):
        feats_interp = cppcuda_tutorial.trilinear_interpolation_fw(feats, points)
        # backward needs these two tensors, so save the original inputs
        ctx.save_for_backward(feats, points)
        return feats_interp

    @staticmethod
    # dL_dfeat_interp is the upstream gradient; backward takes one gradient per forward output
    def backward(ctx, dL_dfeat_interp):
        feats, points = ctx.saved_tensors
        dL_dfeats = cppcuda_tutorial.trilinear_interpolation_bw(dL_dfeat_interp.contiguous(), feats, points)
        # backward returns one gradient per forward input; points is not trainable here, so it gets None
        return dL_dfeats, None
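Before benchmarking, the custom backward can be verified against numerical gradients with torch.autograd.gradcheck (a sketch; gradcheck wants float64 inputs, which AT_DISPATCH_FLOATING_TYPES already covers; the variable names here are mine):
from torch.autograd import gradcheck

N, F = 4, 8  # keep the sizes tiny: gradcheck runs many forward passes
feats_64 = torch.rand(N, 8, F, dtype=torch.float64, device='cuda', requires_grad=True)
points_64 = torch.rand(N, 3, dtype=torch.float64, device='cuda')*2 - 1
assert gradcheck(Trilinear_interpolation_cuda.apply, (feats_64, points_64))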
Timing the forward and backward passes:
if __name__ == '__main__':
    N = 65536; F = 256
    rand = torch.rand(N, 8, F, device='cuda')
    feats = rand.clone().requires_grad_()
    feats2 = rand.clone().requires_grad_()
    points = torch.rand(N, 3, device='cuda')*2 - 1

    t = time.time()
    # once gradients are involved, the raw extension function can no longer be called directly
    # out_cuda = cppcuda_tutorial.trilinear_interpolation_fw(feats2, points)
    out_cuda = Trilinear_interpolation_cuda.apply(feats2, points)
    torch.cuda.synchronize()  # wait for the CUDA kernels to finish
    print('cuda fw time', time.time() - t, 's')

    t = time.time()
    out_py = trilinear_interpolation_py(feats, points)
    torch.cuda.synchronize()
    print('pytorch fw time', time.time() - t, 's')
    print('fw all close', torch.allclose(out_py, out_cuda))  # results match within tolerance

    t = time.time()
    loss2 = out_cuda.sum()
    loss2.backward()
    print('cuda bw time', time.time() - t, 's')

    t = time.time()
    loss = out_py.sum()
    loss.backward()
    print('pytorch bw time', time.time() - t, 's')
    # check that both paths produce the same gradients
    print('bw all close', torch.allclose(feats.grad, feats2.grad))
cuda fw time 0.010978221893310547 s
pytorch fw time 0.008731603622436523 s
fw all close True
cuda bw time 0.0006296634674072266 s
pytorch bw time 0.0011365413665771484 s
bw all close True
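With forward and backward both wired up, the wrapped op drops into an ordinary training loop. A closing sketch (the regression target, sizes, and optimizer settings here are made up for illustration):
N, F = 1024, 16
feats = torch.rand(N, 8, F, device='cuda', requires_grad=True)
points = torch.rand(N, 3, device='cuda')*2 - 1
target = torch.zeros(N, F, device='cuda')  # made-up regression target
opt = torch.optim.Adam([feats], lr=1e-2)
for step in range(10):
    out = Trilinear_interpolation_cuda.apply(feats, points)
    loss = ((out - target)**2).mean()
    opt.zero_grad()
    loss.backward()  # invokes trilinear_interpolation_bw under the hood
    opt.step()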
This completes the CUDA acceleration of the trilinear interpolation operator.