参考:https://blog.csdn.net/qq_27637315/article/details/79114633
windows增加op节点:
https://github.com/tensorflow/models/issues/1103
tensorflow自定义GPU版本op节点
由于前段时间导师布置了一个任务,要修改损失函数,但是这个损失函数在tensorflow自带的库中又没有,想了很多办法,试来试去找不到一个解决方案,因为tensorflow是把框架和数据分开的,所以直接用python写出来的函数是不能用的,只能定义一个节点来调用才行,所以就自然想到先跑一个gpu版本的kernel例程啦,网上cpu版本的教程很多,但是gpu版本的却比较的少,官网的教程极课学院有讲,但我觉得讲的太复杂,反正我是看了一遍没看懂,好了,开始正文。本次例程实现的是将输入tensor中的数字加一输出。
步骤1:写一个kernel
文件名:cuda_op_kernel.cu.cc 代码如下:
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Element-wise kernel: out[i] = in[i] + 1 for every i in [0, N).
// Uses a grid-stride loop (i advances by blockDim.x * gridDim.x), so any
// launch configuration covers the whole input regardless of N.
__global__ void AddOneKernel(const int* in, const int N, int* out) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
    out[i] = in[i] + 1;
  }
}

// Host-side launcher called from the op's Compute().
// `in` and `out` are expected to be device pointers of at least N ints.
void AddOneKernelLauncher(const int* in, const int N, int* out) {
  if (N <= 0) return;  // empty input: skip the pointless kernel launch
  constexpr int kThreadsPerBlock = 256;
  // Size the grid to the input instead of hard-coding 32 blocks; cap it so
  // huge inputs still rely on the grid-stride loop rather than a huge grid.
  int blocks = (N + kThreadsPerBlock - 1) / kThreadsPerBlock;
  if (blocks > 1024) blocks = 1024;
  AddOneKernel<<<blocks, kThreadsPerBlock>>>(in, N, out);
}

#endif  // GOOGLE_CUDA
步骤2:编写cpp程序
文件名:cuda_op_kernel.cc 代码如下:
- #include "tensorflow/core/framework/op.h"
- #include "tensorflow/core/framework/op_kernel.h"
- using namespace tensorflow;
- REGISTER_OP("AddOne")
- .Input("input: int32")
- .Output("output: int32")
- .Doc(R"doc(
- Adds 1 to all elements of the tensor.
- output: A Tensor.
- output = input + 1
- )doc");
- void AddOneKernelLauncher(const int* in, const int N, int* out);
- class AddOneOp : public OpKernel {
- public:
- explicit AddOneOp(OpKernelConstruction* context) : OpKernel(context) {}
- void Compute(OpKernelContext* context) override {
- // Grab the input tensor
- const Tensor& input_tensor = context->input(0);
- auto input = input_tensor.flat<int32>();
- // Create an output tensor
- Tensor* output_tensor = NULL;
- OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
- &output_tensor));
- auto output = output_tensor->template flat<int32>();
- // Set all but the first element of the output tensor to 0.
- const int N = input.size();
- // Call the cuda kernel launcher
- AddOneKernelLauncher(input.data(), N, output.data());
- }
- };
- REGISTER_KERNEL_BUILDER(Name("AddOne").Device(DEVICE_GPU), AddOneOp);
步骤3:编译。在上面两个文件所在的目录下打开终端(ubuntu),依次执行以下命令:
# 1) Grab TensorFlow's header directory for the compile commands below.
TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')

# 2) CPU-only variant (from the official guide): compile the .cc straight
#    into a loadable shared library.
g++ -std=c++11 -shared cuda_op_kernel.cc -o cuda_op_kernel.so \
    -fPIC -I $TF_INC -O2 -D_GLIBCXX_USE_CXX11_ABI=0

# 3) Compile the CUDA kernel into an object file (-x cu forces nvcc to
#    treat the .cu.cc file as CUDA source).
nvcc -std=c++11 -c -o cuda_op_kernel.cu.o cuda_op_kernel.cu.cc \
    -I $TF_INC -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC

# 4) Link the C++ op and the CUDA object into the final library loaded by
#    tf.load_op_library().
g++ -std=c++11 -shared -o cuda_op_kernel.so cuda_op_kernel.cc \
    cuda_op_kernel.cu.o -I $TF_INC -fPIC -lcudart
当你执行到第四个命令的时候很可能会碰到这样一个报错:
- /usr/bin/ld: 找不到 -lcudart
- collect2: error: ld returned 1 exit status
- g++ -std=c++11 -shared -o cuda_op_kernel.so cuda_op_kernel.cc cuda_op_kernel.cu.o -L/usr/local/cuda/targets/x86_64-linux/lib -I $TF_INC -fPIC -lcudart
解决办法是在链接命令中用 -L 显式指定 cudart 库所在的路径(如上一行修正后的命令所示)。之后我写了一个测试文件来验证这个op。
步骤4:测试
文件名:test.py 代码如下:
import tensorflow as tf

# Load the compiled custom-op library; the registered op "AddOne" is
# exposed as the snake_cased attribute `add_one`.
cuda_op_module = tf.load_op_library('./cuda_op_kernel.so')

with tf.Session(''):
    # Evaluate the op on a small 2x2 tensor and print the result.
    x = cuda_op_module.add_one([[1, 2], [3, 4]]).eval()
    print(x)  # fixed: `print x` is Python-2-only syntax
[[2 3]
[4 5]]