gpu的单位表示_在TensorFlow中添加一个GPU操作

最新推荐文章于 2024-06-14 17:26:32 发布

香江不知名前浪

最新推荐文章于 2024-06-14 17:26:32 发布

阅读量276

点赞数

文章标签： gpu的单位表示

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/weixin_34119722/article/details/112045420

版权

我正在尝试大致按照这篇文档在TensorFlow中添加一个新的op。不同之处在于，我想实现的是一个基于GPU的op。我要添加的op是这里的cuda op（cuda_op.py、cuda_op_kernel.cc、cuda_op_kernel.cu.cc）。我尝试在TensorFlow源码树之外编译这些代码，并使用tf.load_op_library将它们加载进来。我做了一些修改，下面是我的文件：

cuda_op_kernel.cc：

#include "tensorflow/core/framework/op.h"

#include "tensorflow/core/framework/shape_inference.h"

#include "tensorflow/core/framework/op_kernel.h"

using namespace tensorflow; // NOLINT(build/namespaces)

// Registers the "AddOne" op interface: one int32 input named "input" and one
// int32 output named "output". The shape function gives output 0 the same
// shape as input 0.
REGISTER_OP("AddOne")
    .Input("input: int32")
    .Output("output: int32")
    .SetShapeFn([](shape_inference::InferenceContext* ctx) {
      ctx->set_output(0, ctx->input(0));
      return Status::OK();
    });

// Forward declaration of the launcher that AddOneOp::Compute calls. It is not
// defined in this listing (presumably it lives in the CUDA source file).
//   in  - pointer to the input elements
//   N   - number of elements
//   out - pointer to the output elements
void AddOneKernelLauncher(const int* in, const int N, int* out);

class AddOneOp : public OpKernel {

public:

explicit AddOneOp(OpKernelConstruction* context) : OpKernel(context) {}

void Compute(OpKernelContext* context) override {

// Grab the input tensor

const Tensor& input_tensor = context->input(0);

auto input = input_tensor.flat();

// Create an output tensor

Tensor* output_tensor = NULL;

OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),

&output_tensor));

auto output = output_tensor->template flat();

// Set all but the first element of the output tensor to 0.

const int N = input.size();

// Call the cuda kernel launcher

AddOneKernelLauncher(input.data(), N, output.data());

}

};

// Register AddOneOp as the kernel for the "AddOne" op on DEVICE_GPU.
REGISTER_KERNEL_BUILDER(Name("AddOne").Device(DEVICE_GPU), AddOneOp);

cuda_op_kernel.cu：

（此处的代码块在转载时丢失，原位置只剩下占位符 ^{pr2}$。根据下文的描述，该文件中包含对 AddOneKernel<<<32, 256>>>(in, N, out); 的调用。）

CMakeLists.txt：

cmake_minimum_required(VERSION 3.5)

# TensorFlow headers. The path was found by running:
#   python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())'
include_directories(/usr/local/lib/python3.5/dist-packages/tensorflow/include)

# Stop at configure time if CUDA is not available: cuda_add_library() below is
# only defined once the CUDA package has been found.
find_package(CUDA REQUIRED)

# Set flags based on the tutorial.
set (CMAKE_CXX_FLAGS "--std=c++11 -fPIC -O2 -D_GLIBCXX_USE_CXX11_ABI=0")

# Propagate the host C++ flags to the host compiler invoked by nvcc.
SET(CUDA_PROPAGATE_HOST_FLAGS ON)

# nvcc code-generation option for the library. The default is the value this
# script has always used; override it with -DCUDA_OP_GENCODE=... to target a
# different GPU.
# NOTE(review): the default targets compute capability 2.0. If the GPU in use
# does not support that target, the kernel may fail to launch without any error
# surfacing in the op, which would be consistent with the all-zero output
# described in the surrounding write-up. Confirm against the device's compute
# capability.
set(CUDA_OP_GENCODE "-gencode=arch=compute_20,code=sm_20" CACHE STRING
    "nvcc -gencode option used when building cuda_op")

# Create the shared library that tf.load_op_library() loads.
cuda_add_library(
  cuda_op SHARED
  src/cuda_op_kernel.cu
  src/cuda_op_kernel.cc
  OPTIONS ${CUDA_OP_GENCODE})

# Copy the test file to the build folder.
configure_file(src/test.py test.py COPYONLY)

test.py：

import tensorflow as tf

# Load the custom op library from the current working directory.
mod = tf.load_op_library('./libcuda_op.so')

with tf.Session() as sess:
    start = [5, 4, 3, 2, 1]
    print(start)
    # Run the custom op on `start` and print the evaluated result.
    print(mod.add_one(start).eval())

我能够成功地编译并运行test.py，但输出总是[0 0 0 0 0]。如果我将AddOneKernel<<<32, 256>>>(in, N, out);替换为for (int i = 0; i < N; i++) out[i] = in[i] + 1;，并将DEVICE_GPU替换为DEVICE_CPU，则op会输出正确的值[6 5 4 3 2]（使用完全相同的CMakeLists.txt）。

你知道如何得到正确的返回值吗？

香江不知名前浪

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
gpu的单位表示_在TensorFlow中添加一个GPU操作

我正在尝试大致按照这篇文档在TensorFlow中添加一个新的op。不同之处在于，我想实现的是一个基于GPU的op。我要添加的op是这里的cuda op（cuda_op.py、cuda_op_kernel.cc、cuda_op_kernel.cu.cc）。我尝试在TensorFlow源码树之外编译这些代码，并使用tf.load_op_library将它们加载进来。我做了一些修改，下面是我的文件：cuda_op_kernel.cc ...
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。