setup
from distutils.core import setup
setup(
    name='MyApp',                 # application name
    version='3.0',                # version number
    packages=['foo', 'foo.bar'],  # Python packages to include in the distribution
    package_dir={'': 'lib'},
)
bar1.py
print("this is bar1")
def out() :
print('this is bar1')
package_dir = {'': 'lib'}
The keys of this dictionary are package names; the empty package name stands for the root (top-level) package. The values are directories relative to the directory containing the setup script. In this example, writing packages = ['foo'] states that the package lives at lib/foo/ and that the file lib/foo/__init__.py exists.
Another approach is to put the contents of the foo package directly under lib instead of creating a foo directory inside lib; the foo.bar package then lives under lib/bar. In the setup script this is written as:
package_dir = {'foo': 'lib'}
A package: dir mapping in the package_dir dictionary applies to all subpackages of that package as well, so foo.bar is covered automatically. In this example, packages = ['foo', 'foo.bar'] tells distutils to look for lib/__init__.py and lib/bar/__init__.py.
Note that the foo.bar package must be listed explicitly in setup.py, or all packages must be picked up via find_packages; otherwise foo.bar.bar1 cannot be imported.
from setuptools import setup, find_packages
setup(
    name='MyApp',    # application name
    version='6.0',   # version number
    packages=find_packages(where='lib', exclude=(), include=('*',)),  # Python packages to include in the distribution
    package_dir={'': 'lib'},
)
For a relatively small distribution you may prefer to list individual modules rather than packages, especially when the project is just a simple module in the root directory. In that case you can write:
py_modules = ['mod1', 'pkg.mod2']
This describes two modules, one in the root directory and one in the pkg directory. The default package/directory mapping means that mod1.py and pkg/mod2.py are expected in the directory containing the setup script; as before, this mapping can be overridden with the package_dir option.
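For example, a minimal setup script written in this style might look like the following sketch (the project name is a placeholder; mod1.py and pkg/mod2.py are assumed to sit next to setup.py):
# Sketch: distributing individual modules with py_modules.
from distutils.core import setup
setup(
    name='SmallApp',   # hypothetical project name
    version='1.0',
    py_modules=['mod1', 'pkg.mod2'],
)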
package_data = {'': ['*.txt'], 'mypkg': ['data/*.dat'],}
This means: include all *.txt files found in any package, and all *.dat files under mypkg/data.
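As a sketch, this mapping plugs into a setup() call like so (package names are the example names from above):
# Sketch: shipping non-Python data files alongside the code.
from setuptools import setup, find_packages
setup(
    name='MyApp',
    version='1.0',
    packages=find_packages(),
    package_data={
        '': ['*.txt'],            # *.txt files in every package
        'mypkg': ['data/*.dat'],  # *.dat files under mypkg/data
    },
)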
Then run python setup.py sdist to create a source distribution for the module.
A dist directory is created in the current directory, containing a file such as foo-1.0.tar.gz; this is the distributable package (with python setup.py bdist_egg you would instead get foo-1.0-py2.7.egg in dist, and the first import in setup.py has to be changed to from setuptools import setup). A user who receives the package unpacks it, changes into the foo-1.0 directory and runs python setup.py install; foo.py is then copied onto the Python path and can be imported (when installing an egg file, the egg is copied into the dist-packages directory).
After installation, in another project:
import foo.bar.bar1 as b
b.out()
The output is:
this is bar1
this is bar1
scripts
The scripts argument is a list; when the package is installed, the files listed there are installed onto the system PATH. For example:
scripts=['bin/foo.sh', 'bar.py']
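Put into a complete setup script, this is a minimal sketch (same two files as above):
# Sketch: bin/foo.sh and bar.py are installed as executables on PATH.
from setuptools import setup
setup(
    name='MyApp',
    version='1.0',
    scripts=['bin/foo.sh', 'bar.py'],
)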
find_packages
For a simple project, filling in the packages argument by hand is easy. For a complex project with many packages, doing so manually becomes tedious. The setuptools module provides a find_packages function which, by default, searches the directory containing setup.py for directories with an __init__.py file and treats them as packages to add.
find_packages(where='.', exclude=(), include=('*',))
The first argument of find_packages specifies the directory in which to search for packages; exclude lists packages to leave out, and include lists packages to include.
By default, setup.py only searches its own directory for packages. If you do not use find_packages but still want to pick up packages from another directory, you can set the package_dir argument, which maps source directories to packages; for example, package_dir={'': 'src'} means that the modules of the "root package" live in the src directory, as in the sketch below.
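A sketch of that "src layout", combining find_packages with package_dir (directory names are illustrative):
# Sketch: all import packages live under src/, e.g. src/foo/__init__.py.
from setuptools import setup, find_packages
setup(
    name='MyApp',
    version='1.0',
    package_dir={'': 'src'},
    packages=find_packages(where='src'),
)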
ext_modules
The ext_modules argument is used to build C and C++ extension modules. It is a list of Extension instances; each Extension instance describes a single extension module and can specify the extension name, header files, source files, libraries and their search paths, macro definitions, compiler flags, and so on. For example:
from glob import glob
from os import path
import numpy
from setuptools import setup, Extension

here = path.abspath(path.dirname(__file__))  # directory containing setup.py

setup(
    # other arguments here...
    ext_modules=[
        Extension('foo',
                  glob(path.join(here, 'src', '*.c')),
                  libraries=['rt'],
                  include_dirs=[numpy.get_include()]),
    ],
)
cython
// file: cmath/cmath.c
#include "cmath.h"
int add(int a, int b)
{
return a + b;
}
# file: pymath/pymath.pyx
cdef extern from "cmath.h":
    int add(int a, int b)

def pyadd(int a, int b):
    return add(a, b)
# file: pymath/setup.py
from distutils.core import setup, Extension
from Cython.Build import cythonize
extension = Extension(
    "pymath",
    ["pymath.pyx"],
    libraries=["cmath"]
)
setup(
    ext_modules=cythonize([extension])
)
import pymath
print(pymath.pyadd(1, 2))
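Note that Extension(..., libraries=["cmath"]) assumes a libcmath has already been built and can be found by the linker. A possible alternative, sketched below under the assumption of the cmath/ and pymath/ layout shown above, is to compile cmath.c directly into the extension instead of linking against a prebuilt library:
# file: pymath/setup_alt.py (hypothetical) -- compile cmath.c into the extension itself
from distutils.core import setup, Extension
from Cython.Build import cythonize

extension = Extension(
    "pymath",
    sources=["pymath.pyx", "../cmath/cmath.c"],
    include_dirs=["../cmath"],
)
setup(
    ext_modules=cythonize([extension])
)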
Another example:
cpluspy.h
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <vector>
#include <cassert>
using namespace std;
class CplusA {
public:
    CplusA() : text("NULL") {};
    CplusA(string t) : text(t) {};
    ~CplusA() {};
    void show_text();
    double multiply(double a, double b);
private:
    string text;
};
double linear_scalar(double w, double x);
vector<double> linear_vector(vector<double> w, vector<double> x);
cpluspy.cpp
#include "cpluspy.h"
void CplusA::show_text(){
    std::cout << "CplusA.text=" << CplusA::text << endl;
    // std::cout << "CplusA.text=" << CplusA::text << endl;
}

double CplusA::multiply(double a, double b){
    return a * b;
}

double linear_scalar(double w, double x){
    return w * x;
}

vector<double> linear_vector(vector<double> w, vector<double> x){
    vector<double> y;
    int num_w = w.size(), num_x = x.size();
    if(num_w != num_x){
        std::cout << "The size of weight vector and input vector must be same!" << std::endl;
        system("pause");
        exit(EXIT_FAILURE);
    }
    for(int i = 0; i < num_w; ++i){
        y.emplace_back(w[i] * x[i]);
    }
    return y;
}
cpluspy_cython.pxd
from libcpp.vector cimport vector
from libcpp.string cimport string

cdef extern from "cpluspy.h":
    cdef cppclass CplusA:
        CplusA() except +        # without "except +", an exception thrown in the C/C++ constructor would not be propagated to Python
        CplusA(string) except +
        void show_text()
        double multiply(double a, double b)
    double linear_scalar(double w, double x)
    vector[double] linear_vector(vector[double] w, vector[double] x)
cpluspy_cython.pyx
# distutils: language = c++
from libcpp.string cimport string
from libcpp.vector cimport vector
from cpluspy_cython cimport CplusA, linear_scalar, linear_vector

cdef class PyCplusA:
    cdef CplusA pycplus_a

    def __cinit__(self, str s):
        # cdef bytes enc_s = bytes(s, encoding="utf8")
        self.pycplus_a = CplusA(s.encode("utf8"))

    def multiply(self, double a, double b):
        return self.pycplus_a.multiply(a, b)

    def show_text(self):
        self.pycplus_a.show_text()

def py_linear_scalar(double w, double x):
    return linear_scalar(w, x)

def py_linear_vector(vector[double] w, vector[double] x):
    return linear_vector(w, x)

def run_test():
    cls_a = PyCplusA(s="Hello?")
    print("multiply(5, 3)=%.0f" % cls_a.multiply(5, 3))
    cls_a.show_text()
    print("py_linear_scalar(5., 3.)=%.0f" % py_linear_scalar(5., 3.))
    print("py_linear_vector\n([1., 2., 3., 4., 5.], \n[5., 4., 3., 2., 1.]):")
    print(py_linear_vector([1., 2., 3., 4., 5.], [5., 4., 3., 2., 1.]))
setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize

ext_modules = [Extension(name="cpluspy_cython", sources=["cpluspy_cython.pyx", "cpluspy.cpp"], language="c++")]

setup(name="cpluspy_cython",
      version="0.1",
      description="a demo of dist setup c/c++ extensions.",
      py_modules=["my_module"],  # an unrelated local my_module.py gets installed along with the extension
      ext_modules=cythonize(ext_modules),
      )
The relationship between .pyx and .pxd files is analogous to that between .cpp and .h files.
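A short sketch of building and exercising the wrapper; the build command is the standard in-place build, and run_test is the function defined in the .pyx above:
# Build in place first, e.g.:  python setup.py build_ext --inplace
# Then, from the same directory:
import cpluspy_cython

cpluspy_cython.run_test()
print(cpluspy_cython.py_linear_scalar(2.0, 3.0))   # 6.0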
torch c++ extension
Requirement: torch 1.7
lltm_cuda.cpp
#include <torch/extension.h>
#include <vector>
// CUDA forward declarations
std::vector<torch::Tensor> lltm_cuda_forward(
torch::Tensor input,
torch::Tensor weights,
torch::Tensor bias,
torch::Tensor old_h,
torch::Tensor old_cell);
std::vector<torch::Tensor> lltm_cuda_backward(
torch::Tensor grad_h,
torch::Tensor grad_cell,
torch::Tensor new_cell,
torch::Tensor input_gate,
torch::Tensor output_gate,
torch::Tensor candidate_cell,
torch::Tensor X,
torch::Tensor gate_weights,
torch::Tensor weights);
// C++ interface
#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor> lltm_forward(
torch::Tensor input,
torch::Tensor weights,
torch::Tensor bias,
torch::Tensor old_h,
torch::Tensor old_cell) {
CHECK_INPUT(input);
CHECK_INPUT(weights);
CHECK_INPUT(bias);
CHECK_INPUT(old_h);
CHECK_INPUT(old_cell);
return lltm_cuda_forward(input, weights, bias, old_h, old_cell);
}
std::vector<torch::Tensor> lltm_backward(
torch::Tensor grad_h,
torch::Tensor grad_cell,
torch::Tensor new_cell,
torch::Tensor input_gate,
torch::Tensor output_gate,
torch::Tensor candidate_cell,
torch::Tensor X,
torch::Tensor gate_weights,
torch::Tensor weights) {
CHECK_INPUT(grad_h);
CHECK_INPUT(grad_cell);
CHECK_INPUT(input_gate);
CHECK_INPUT(output_gate);
CHECK_INPUT(candidate_cell);
CHECK_INPUT(X);
CHECK_INPUT(gate_weights);
CHECK_INPUT(weights);
return lltm_cuda_backward(
grad_h,
grad_cell,
new_cell,
input_gate,
output_gate,
candidate_cell,
X,
gate_weights,
weights);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &lltm_forward, "LLTM forward (CUDA)");
m.def("backward", &lltm_backward, "LLTM backward (CUDA)");
}
lltm_cuda_kernel.cu
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
template <typename scalar_t>
__device__ __forceinline__ scalar_t sigmoid(scalar_t z) {
return 1.0 / (1.0 + exp(-z));
}
template <typename scalar_t>
__device__ __forceinline__ scalar_t d_sigmoid(scalar_t z) {
const auto s = sigmoid(z);
return (1.0 - s) * s;
}
template <typename scalar_t>
__device__ __forceinline__ scalar_t d_tanh(scalar_t z) {
const auto t = tanh(z);
return 1 - (t * t);
}
template <typename scalar_t>
__device__ __forceinline__ scalar_t elu(scalar_t z, scalar_t alpha = 1.0) {
return fmax(0.0, z) + fmin(0.0, alpha * (exp(z) - 1.0));
}
template <typename scalar_t>
__device__ __forceinline__ scalar_t d_elu(scalar_t z, scalar_t alpha = 1.0) {
const auto e = exp(z);
const auto d_relu = z < 0.0 ? 0.0 : 1.0;
return d_relu + (((alpha * (e - 1.0)) < 0.0) ? (alpha * e) : 0.0);
}
template <typename scalar_t>
__global__ void lltm_cuda_forward_kernel(
const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> gates,
const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> old_cell,
torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> new_h,
torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> new_cell,
torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> input_gate,
torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> output_gate,
torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> candidate_cell) {
//batch index
const int n = blockIdx.y;
// column index
const int c = blockIdx.x * blockDim.x + threadIdx.x;
if (c < gates.size(2)){
input_gate[n][c] = sigmoid(gates[n][0][c]);
output_gate[n][c] = sigmoid(gates[n][1][c]);
candidate_cell[n][c] = elu(gates[n][2][c]);
new_cell[n][c] =
old_cell[n][c] + candidate_cell[n][c] * input_gate[n][c];
new_h[n][c] = tanh(new_cell[n][c]) * output_gate[n][c];
}
}
std::vector<torch::Tensor> lltm_cuda_forward(
torch::Tensor input,
torch::Tensor weights,
torch::Tensor bias,
torch::Tensor old_h,
torch::Tensor old_cell) {
auto X = torch::cat({old_h, input}, /*dim=*/1);
auto gate_weights = torch::addmm(bias, X, weights.transpose(0, 1));
const auto batch_size = old_cell.size(0);
const auto state_size = old_cell.size(1);
auto gates = gate_weights.reshape({batch_size, 3, state_size});
auto new_h = torch::zeros_like(old_cell);
auto new_cell = torch::zeros_like(old_cell);
auto input_gate = torch::zeros_like(old_cell);
auto output_gate = torch::zeros_like(old_cell);
auto candidate_cell = torch::zeros_like(old_cell);
const int threads = 1024;
const dim3 blocks((state_size + threads - 1) / threads, batch_size);
AT_DISPATCH_FLOATING_TYPES(gates.type(), "lltm_forward_cuda", ([&] {
lltm_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
gates.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
old_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
new_h.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
new_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
input_gate.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
output_gate.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
candidate_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>());
}));
return {new_h, new_cell, input_gate, output_gate, candidate_cell, X, gates};
}
template <typename scalar_t>
__global__ void lltm_cuda_backward_kernel(
torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> d_old_cell,
torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> d_gates,
const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> grad_h,
const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> grad_cell,
const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> new_cell,
const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> input_gate,
const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> output_gate,
const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> candidate_cell,
const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> gate_weights) {
//batch index
const int n = blockIdx.y;
// column index
const int c = blockIdx.x * blockDim.x + threadIdx.x;
if (c < d_gates.size(2)){
const auto d_output_gate = tanh(new_cell[n][c]) * grad_h[n][c];
const auto d_tanh_new_cell = output_gate[n][c] * grad_h[n][c];
const auto d_new_cell =
d_tanh(new_cell[n][c]) * d_tanh_new_cell + grad_cell[n][c];
d_old_cell[n][c] = d_new_cell;
const auto d_candidate_cell = input_gate[n][c] * d_new_cell;
const auto d_input_gate = candidate_cell[n][c] * d_new_cell;
d_gates[n][0][c] =
d_input_gate * d_sigmoid(gate_weights[n][0][c]);
d_gates[n][1][c] =
d_output_gate * d_sigmoid(gate_weights[n][1][c]);
d_gates[n][2][c] =
d_candidate_cell * d_elu(gate_weights[n][2][c]);
}
}
std::vector<torch::Tensor> lltm_cuda_backward(
torch::Tensor grad_h,
torch::Tensor grad_cell,
torch::Tensor new_cell,
torch::Tensor input_gate,
torch::Tensor output_gate,
torch::Tensor candidate_cell,
torch::Tensor X,
torch::Tensor gates,
torch::Tensor weights) {
auto d_old_cell = torch::zeros_like(new_cell);
auto d_gates = torch::zeros_like(gates);
const auto batch_size = new_cell.size(0);
const auto state_size = new_cell.size(1);
const int threads = 1024;
const dim3 blocks((state_size + threads - 1) / threads, batch_size);
AT_DISPATCH_FLOATING_TYPES(X.type(), "lltm_backward_cuda", ([&] { // AT_DISPATCH_FLOATING_TYPES dispatches the lambda on the concrete floating-point type (float or double)
lltm_cuda_backward_kernel<scalar_t><<<blocks, threads>>>(
d_old_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
d_gates.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
grad_h.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
grad_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
new_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
input_gate.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
output_gate.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
candidate_cell.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
gates.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>());
}));
auto d_gate_weights = d_gates.reshape({batch_size, 3*state_size});
auto d_weights = d_gate_weights.t().mm(X);
auto d_bias = d_gate_weights.sum(/*dim=*/0, /*keepdim=*/true);
auto d_X = d_gate_weights.mm(weights);
auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size);
auto d_input = d_X.slice(/*dim=*/1, state_size);
return {d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates};
}
There are two ways to use these torch extensions from a Python file. The first is via a setup.py file:
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='lltm',
    ext_modules=[
        CUDAExtension('lltm_cuda', [
            'lltm_cuda.cpp',
            'lltm_cuda_kernel.cu',
        ])
    ],
    cmdclass={
        'build_ext': BuildExtension
    })
Installation: put the three files in the same directory and run python setup.py install.
To use it, import torch first and then import lltm_cuda.
The second way is JIT (just-in-time) compilation:
from torch.utils.cpp_extension import load
lltm = load(name='lltm', sources=['lltm_cuda.cpp', 'lltm_cuda_kernel.cu'])
The source file paths are given relative to the .py file that contains the main entry point.
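As a sketch of what load() gives back: the returned object exposes exactly the functions registered in PYBIND11_MODULE, and verbose=True prints the ninja build log, which helps when compilation fails.
# Sketch: JIT-compile the extension and use the bound functions directly.
import torch  # must be imported before using the extension
from torch.utils.cpp_extension import load

lltm = load(name='lltm',
            sources=['lltm_cuda.cpp', 'lltm_cuda_kernel.cu'],
            verbose=True)
# lltm.forward / lltm.backward are the functions bound via PYBIND11_MODULE above;
# they expect contiguous CUDA tensors (see the CHECK_INPUT macros).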
Debugging the C++ code of a torch extension in CLion
Organize the C++ code as a CMake project:
cmake_minimum_required (VERSION 3.8)
project(ransac LANGUAGES CUDA CXX)
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
#set(CMAKE_PREFIX_PATH "/home/wanboyan/anaconda3/lib/python3.6/site-packages/torch/share/cmake/Torch/:${CMAKE_PREFIX_PATH}")
find_package(CUDA REQUIRED)
find_package(Torch REQUIRED)
find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
add_subdirectory(extern/pybind11)
include_directories(/home/wanboyan/anaconda3/include/python3.6m/)
#find_package(pybind11 REQUIRED)
pybind11_add_module(ransac_voting SHARED src/ransac_voting.cpp src/ransac_voting_kernel.cu)
target_link_libraries(ransac_voting "${TORCH_LIBRARIES}" "${TORCH_PYTHON_LIBRARY}")
- debug torch_extension
Add '-g' to the JIT compilation options:
ransac_voting = load(name='ransac_voting', sources=['../lib/ransac_voting_gpu_layer/src/ransac_voting.cpp', '../lib/ransac_voting_gpu_layer/src/ransac_voting_kernel.cu'], extra_cflags=['-g'])
Or modify setup.py:
setup(
    name='ransac_voting',
    ext_modules=[
        CUDAExtension('ransac_voting', [
            './src/ransac_voting.cpp',
            './src/ransac_voting_kernel.cu'
        ],
        extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']})
    ],
    cmdclass={
        'build_ext': BuildExtension
    }
)
Then debug with breakpoints by attaching via "attach to native process"; note that "attach with python" cannot be used here.
Using the extension in Python
import math
import torch
# Our module!
import lltm_cuda

class LLTMFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weights, bias, old_h, old_cell):
        outputs = lltm_cuda.forward(input, weights, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        variables = outputs[1:] + [weights]
        ctx.save_for_backward(*variables)
        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        outputs = lltm_cuda.backward(
            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_tensors)
        # lltm_cuda_backward returns six tensors (see the C++ code above); the last
        # one, d_gates, is not a gradient of any of forward's inputs.
        d_old_h, d_input, d_weights, d_bias, d_old_cell, d_gates = outputs
        return d_input, d_weights, d_bias, d_old_h, d_old_cell

class LLTM(torch.nn.Module):
    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        return LLTMFunction.apply(input, self.weights, self.bias, *state)
LLTMFunction.apply effectively wires the custom backward function up to self.weights, self.bias, and the state tensors (h and cell).
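A minimal sketch of exercising the module, continuing the script above (the sizes are arbitrary; a CUDA device and the compiled lltm_cuda extension are assumed):
batch_size, input_features, state_size = 16, 32, 128
X = torch.randn(batch_size, input_features, device='cuda')
h = torch.randn(batch_size, state_size, device='cuda')
C = torch.randn(batch_size, state_size, device='cuda')

rnn = LLTM(input_features, state_size).to('cuda')
new_h, new_C = rnn(X, (h, C))           # calls lltm_cuda.forward under the hood
(new_h.sum() + new_C.sum()).backward()  # calls lltm_cuda.backward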