For an introduction to TensorRT, see:
http://blog.csdn.net/fengbingchun/article/details/78469551
GitHub: https://github.com/fengbingchun/CUDA_Test
The following test code was adapted from the sampleCharRNN.cpp file shipped with TensorRT 2.1.2; the contents of the file (charrnn.cpp) are as follows:
#include <assert.h>
#include <string>
#include <string.h>
#include <fstream>
#include <iostream>
#include <tuple>
#include <map>
#include <sstream>
#include <vector>
#include <algorithm>
#include <NvInfer.h>
#include <NvUtils.h>
#include <cuda_runtime_api.h>
#include "common.hpp"
// reference: TensorRT-2.1.2/samples/sampleCharRNN/sampleCharRNN.cpp
// demonstrates how to generate a simple RNN based on the charRNN network using the PTB dataset
namespace {
// Information describing the network:
// int: layer count, batch size, hidden size, seq size, data size, output size
// string: input blob name, hidden in blob name, cell in blob name, output blob name, hidden out blob name, cell out blob name
typedef std::tuple<int, int, int, int, int, int, std::string, std::string, std::string, std::string, std::string, std::string> NET_INFO;
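// e.g., the instance used in test_charrnn() below:
// NET_INFO(2, 1, 512, 1, 512, 50, "data", "hiddenIn", "cellIn", "prob", "hiddenOut", "cellOut")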
// These mappings came from training with tensorflow 0.12.1
static std::map<char, int> char_to_id{{'#', 40},
{ '$', 31}, { '\'', 28}, { '&', 35}, { '*', 49},
{ '-', 32}, { '/', 48}, { '.', 27}, { '1', 37},
{ '0', 36}, { '3', 39}, { '2', 41}, { '5', 43},
{ '4', 47}, { '7', 45}, { '6', 46}, { '9', 38},
{ '8', 42}, { '<', 22}, { '>', 23}, { '\0', 24},
{ 'N', 26}, { '\\', 44}, { ' ', 0}, { 'a', 3},
{ 'c', 13}, { 'b', 20}, { 'e', 1}, { 'd', 12},
{ 'g', 18}, { 'f', 15}, { 'i', 6}, { 'h', 9},
{ 'k', 17}, { 'j', 30}, { 'm', 14}, { 'l', 10},
{ 'o', 5}, { 'n', 4}, { 'q', 33}, { 'p', 16},
{ 's', 7}, { 'r', 8}, { 'u', 11}, { 't', 2},
{ 'w', 21}, { 'v', 25}, { 'y', 19}, { 'x', 29},
{ 'z', 34}
};
// A mapping from index to character.
static std::vector<char> id_to_char{{' ', 'e', 't', 'a',
'n', 'o', 'i', 's', 'r', 'h', 'l', 'u', 'd', 'c',
'm', 'f', 'p', 'k', 'g', 'y', 'b', 'w', '<', '>',
'\0', 'v', 'N', '.', '\'', 'x', 'j', '$', '-', 'q',
'z', '&', '0', '1', '9', '3', '#', '2', '8', '5',
'\\', '7', '6', '4', '/', '*'}};
// Our weight files are in a very simple space delimited format.
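// File layout, as parsed below:
//   <entry count>
//   <name> <DataType as int> <element count> <hex-encoded value> <hex-encoded value> ...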
std::map<std::string, nvinfer1::Weights> loadWeights(const std::string& file)
{
std::map<std::string, nvinfer1::Weights> weightMap;
std::ifstream input(file);
if (!input.is_open()) { fprintf(stderr, "Unable to load weight file: %s\n", file.c_str()); return weightMap;}
int32_t count;
input >> count;
if (count <= 0) { fprintf(stderr, "Invalid weight map file: %d\n", count); return weightMap; }
while (count--) {
nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0};
uint32_t type, size;
std::string name;
input >> name >> std::dec >> type >> size;
wt.type = static_cast<nvinfer1::DataType>(type);
if (wt.type == nvinfer1::DataType::kFLOAT) {
uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size)); // sizeof(uint32_t), not sizeof(val), which is the pointer size
for (uint32_t x = 0, y = size; x < y; ++x) {
input >> std::hex >> val[x];
}
wt.values = val;
} else if (wt.type == nvinfer1::DataType::kHALF) {
uint16_t* val = reinterpret_cast<uint16_t*>(malloc(sizeof(uint16_t) * size));
for (uint32_t x = 0, y = size; x < y; ++x) {
input >> std::hex >> val[x];
}
wt.values = val;
}
wt.count = size;
weightMap[name] = wt;
}
return weightMap;
}
// Reshape plugin to feed RNN into FC layer correctly.
class Reshape : public nvinfer1::IPlugin {
public:
Reshape(size_t size) : mSize(size) {}
Reshape(const void* buf, size_t size)
{
assert(size == sizeof(mSize));
mSize = *static_cast<const size_t*>(buf);
}
int getNbOutputs() const override { return 1; }
int initialize() override { return 0; }
void terminate() override {}
size_t getWorkspaceSize(int) const override { return 0; }
int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
{
cudaMemcpyAsync(static_cast<float*>(outputs[0]),
static_cast<const float*>(inputs[0]),
sizeof(float) * mSize * batchSize, cudaMemcpyDefault, stream);
return 0;
}
size_t getSerializationSize() override
{
return sizeof(mSize);
}
void serialize(void* buffer) override
{
(*static_cast<size_t*>(buffer)) = mSize;
}
void configure(const nvinfer1::Dims*, int, const nvinfer1::Dims*, int, int) override { }
// The RNN outputs in {L, N, C}, but FC layer needs {C, 1, 1}, so we can convert RNN
// output to {L*N, C, 1, 1} and TensorRT will handle the rest.
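// E.g., with the configuration used below (seq L=1, batch N=1, hidden C=512),
// the RNN output {1, 1, 512} becomes {1, 512, 1, 1}.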
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
{
assert(nbInputDims == 1 && index == 0 && inputs[index].nbDims == 3);
return nvinfer1::DimsNCHW(inputs[index].d[1] * inputs[index].d[0], inputs[index].d[2], 1, 1);
}
private:
size_t mSize{0};
};
class PluginFactory : public nvinfer1::IPluginFactory
{
public:
// deserialization plugin implementation
nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
{
assert(!strncmp(layerName, "reshape", 7));
if (!mPlugin) mPlugin = new Reshape(serialData, serialLength);
return mPlugin;
}
void destroyPlugin()
{
if (mPlugin) delete mPlugin;
mPlugin = nullptr;
}
private:
Reshape *mPlugin{nullptr};
}; // PluginFactory
// TensorFlow weight parameters for BasicLSTMCell
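// TensorFlow's BasicLSTMCell concatenates the four gate sub-matrices in (i, c, f, o) order;
// reorderSubBuffers with the indir permutation below rearranges them into the gate order that
// TensorRT's addRNN expects, separately for the input and recurrent halves of each layer.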
nvinfer1::Weights convertRNNWeights(nvinfer1::Weights input, const NET_INFO& info)
{
float* ptr = static_cast<float*>(malloc(sizeof(float)*input.count));
int indir[4]{ 1, 2, 0, 3 };
int order[5]{ 0, 1, 4, 2, 3};
int dims[5]{std::get<0>(info), 2, 4, std::get<2>(info), std::get<2>(info)};
nvinfer1::utils::reshapeWeights(input, dims, order, ptr, 5);
nvinfer1::utils::transposeSubBuffers(ptr, nvinfer1::DataType::kFLOAT, std::get<0>(info) * 2, std::get<2>(info) * std::get<2>(info), 4);
int subMatrix = std::get<2>(info) * std::get<2>(info);
int layerOffset = 8 * subMatrix;
for (int z = 0; z < std::get<0>(info); ++z) {
nvinfer1::utils::reorderSubBuffers(ptr + z * layerOffset, indir, 4, subMatrix * sizeof(float));
nvinfer1::utils::reorderSubBuffers(ptr + z * layerOffset + 4 * subMatrix, indir, 4, subMatrix * sizeof(float));
}
return nvinfer1::Weights{input.type, ptr, input.count};
}
// TensorFlow bias parameters for BasicLSTMCell
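// TensorRT keeps separate input and recurrent biases per gate, while TensorFlow stores a single
// combined bias, so the output is twice the input size; the TensorFlow biases are copied into
// the input-bias slots (reordered by indir) and the recurrent-bias slots are left at zero.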
nvinfer1::Weights convertRNNBias(nvinfer1::Weights input, const NET_INFO& info)
{
float* ptr = static_cast<float*>(malloc(sizeof(float)*input.count*2));
std::fill(ptr, ptr + input.count*2, 0);
const float* iptr = static_cast<const float*>(input.values);
int indir[4]{ 1, 2, 0, 3 };
for (int z = 0, y = 0; z < std::get<0>(info); ++z)
for (int x = 0; x < 4; ++x, ++y)
std::copy(iptr + y * std::get<2>(info) , iptr + (y + 1) * std::get<2>(info), ptr + (z * 8 + indir[x]) * std::get<2>(info));
return nvinfer1::Weights{input.type, ptr, input.count*2};
}
// The fully connected weights from tensorflow are transposed compared to the order that tensorRT expects them to be in.
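// i.e., the hidden-size x output-size row-major matrix from TensorFlow is transposed below
// into output-size x hidden-size.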
nvinfer1::Weights transposeFCWeights(nvinfer1::Weights input, const NET_INFO& info)
{
float* ptr = static_cast<float*>(malloc(sizeof(float)*input.count));
const float* iptr = static_cast<const float*>(input.values);
assert(input.count == std::get<2>(info) * std::get<5>(info));
for (int z = 0; z < std::get<2>(info); ++z)
for (int x = 0; x < std::get<5>(info); ++x)
ptr[x * std::get<2>(info) + z] = iptr[z * std::get<5>(info) + x];
return nvinfer1::Weights{input.type, ptr, input.count};
}
int APIToModel(std::map<std::string, nvinfer1::Weights> &weightMap, nvinfer1::IHostMemory** modelStream, const NET_INFO& info, Logger logger)
{
// create the builder
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
// create the model to populate the network, then set the outputs and create an engine
nvinfer1::INetworkDefinition* network = builder->createNetwork();
auto data = network->addInput(std::get<6>(info).c_str(), nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{ std::get<3>(info), std::get<1>(info), std::get<4>(info)});
CHECK(data != nullptr);
auto hiddenIn = network->addInput(std::get<7>(info).c_str(), nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{ std::get<0>(info), std::get<1>(info), std::get<2>(info)});
CHECK(hiddenIn != nullptr);
auto cellIn = network->addInput(std::get<8>(info).c_str(), nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{ std::get<0>(info), std::get<1>(info), std::get<2>(info)});
CHECK(cellIn != nullptr);
// Create an RNN layer w/ 2 layers and 512 hidden states
auto tfwts = weightMap["rnnweight"];
nvinfer1::Weights rnnwts = convertRNNWeights(tfwts, info);
auto tfbias = weightMap["rnnbias"];
nvinfer1::Weights rnnbias = convertRNNBias(tfbias, info);
auto rnn = network->addRNN(*data, std::get<0>(info), std::get<2>(info), std::get<3>(info),
nvinfer1::RNNOperation::kLSTM, nvinfer1::RNNInputMode::kLINEAR, nvinfer1::RNNDirection::kUNIDIRECTION, rnnwts, rnnbias);
CHECK(rnn != nullptr);
rnn->getOutput(0)->setName("RNN output");
rnn->setHiddenState(*hiddenIn);
if (rnn->getOperation() == nvinfer1::RNNOperation::kLSTM)
rnn->setCellState(*cellIn);
Reshape reshape(std::get<3>(info) * std::get<1>(info) * std::get<2>(info));
nvinfer1::ITensor *ptr = rnn->getOutput(0);
auto plugin = network->addPlugin(&ptr, 1, reshape);
plugin->setName("reshape");
// Add a second fully connected layer with 50 outputs.
auto tffcwts = weightMap["rnnfcw"];
auto wts = transposeFCWeights(tffcwts, info);
auto bias = weightMap["rnnfcb"];
auto fc = network->addFullyConnected(*plugin->getOutput(0), std::get<5>(info), wts, bias);
CHECK(fc != nullptr);
fc->getOutput(0)->setName("FC output");
// Add a softmax layer to determine the probability.
auto prob = network->addSoftMax(*fc->getOutput(0));
CHECK(prob != nullptr);
prob->getOutput(0)->setName(std::get<9>(info).c_str());
network->markOutput(*prob->getOutput(0));
rnn->getOutput(1)->setName(std::get<10>(info).c_str());
network->markOutput(*rnn->getOutput(1));
if (rnn->getOperation() == nvinfer1::RNNOperation::kLSTM) {
rnn->getOutput(2)->setName(std::get<11>(info).c_str());
network->markOutput(*rnn->getOutput(2));
}
// Build the engine
builder->setMaxBatchSize(1);
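// allow the builder up to 1 << 25 bytes (32 MiB) of GPU scratch space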
builder->setMaxWorkspaceSize(1 << 25);
// Store the transformed weights in the weight map so the memory can be properly released later.
weightMap["rnnweight2"] = rnnwts;
weightMap["rnnbias2"] = rnnbias;
weightMap["rnnfcw2"] = wts;
auto engine = builder->buildCudaEngine(*network);
CHECK(engine != nullptr);
// we don't need the network any more
network->destroy();
// serialize the engine, then close everything down
(*modelStream) = engine->serialize();
engine->destroy();
builder->destroy();
return 0;
}
void stepOnce(float** data, void** buffers, int* sizes, int* indices,
int numBindings, cudaStream_t& stream, nvinfer1::IExecutionContext &context)
{
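// data/sizes/indices are ordered inputs first (data, hiddenIn, cellIn) and then outputs
// (prob, hiddenOut, cellOut), so the first half is copied host-to-device before the run
// and the second half device-to-host afterwards.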
for (int z = 0, w = numBindings/2; z < w; ++z)
cudaMemcpyAsync(buffers[indices[z]], data[z], sizes[z] * sizeof(float), cudaMemcpyHostToDevice, stream);
// Execute asynchronously
context.enqueue(1, buffers, stream, nullptr);
// DMA the output from the GPU
for (int z = numBindings/2, w = numBindings; z < w; ++z)
cudaMemcpyAsync(data[z], buffers[indices[z]], sizes[z] * sizeof(float), cudaMemcpyDeviceToHost, stream);
// Copy Ct/Ht to the Ct-1/Ht-1 slots.
cudaMemcpyAsync(data[1], buffers[indices[4]], sizes[1] * sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(data[2], buffers[indices[5]], sizes[2] * sizeof(float), cudaMemcpyDeviceToHost, stream);
}
bool doInference(nvinfer1::IExecutionContext& context, const std::string& input, const std::string& expected, std::map<std::string, nvinfer1::Weights>&weightMap, const NET_INFO& info)
{
const nvinfer1::ICudaEngine& engine = context.getEngine();
// There are 6 bindings for LSTM (3 inputs, 3 outputs); this needs to be changed to 4 for any RNN type without a cell state
static const int numBindings = 6;
assert(engine.getNbBindings() == numBindings);
void* buffers[numBindings];
float* data[numBindings];
std::fill(buffers, buffers + numBindings, nullptr);
std::fill(data, data + numBindings, nullptr);
const char* names[numBindings] = {std::get<6>(info).c_str(), std::get<7>(info).c_str(), std::get<8>(info).c_str(),
std::get<9>(info).c_str(), std::get<10>(info).c_str(), std::get<11>(info).c_str() };
int indices[numBindings];
std::fill(indices, indices + numBindings, -1);
int sizes[numBindings] = { std::get<3>(info) * std::get<1>(info) * std::get<4>(info),
std::get<0>(info) * std::get<1>(info) * std::get<2>(info),
std::get<0>(info) * std::get<1>(info) * std::get<2>(info),
std::get<5>(info),
std::get<0>(info) * std::get<1>(info) * std::get<2>(info),
std::get<0>(info) * std::get<1>(info) * std::get<2>(info) };
for (int x = 0; x < numBindings; ++x) {
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
indices[x] = engine.getBindingIndex(names[x]);
if (indices[x] == -1) continue;
// create GPU buffers and a stream
assert(indices[x] < numBindings);
cudaMalloc(&buffers[indices[x]], sizes[x] * sizeof(float));
data[x] = new float[sizes[x]];
}
cudaStream_t stream;
cudaStreamCreate(&stream);
// Initialize input/hidden/cell state to zero
for (int x = 0; x < numBindings; ++x) std::fill(data[x], data[x] + sizes[x], 0.0f);
auto embed = weightMap["embed"];
std::string genstr;
assert(std::get<1>(info) == 1 && "This code assumes batch size is equal to 1.");
// Seed the RNN with the input.
for (auto &a : input) {
std::copy(reinterpret_cast<const float*>(embed.values) + char_to_id[a]*std::get<4>(info),
reinterpret_cast<const float*>(embed.values) + char_to_id[a]*std::get<4>(info) + std::get<4>(info),
data[0]);
stepOnce(data, buffers, sizes, indices, 6, stream, context);
cudaStreamSynchronize(stream);
genstr.push_back(a);
}
// Now that we have fed the RNN the seed sequence, let's make sure the generated sequence
// matches the one we expect.
for (size_t x = 0, y = expected.size(); x < y; ++x) {
std::copy(reinterpret_cast<const float*>(embed.values) + char_to_id[*genstr.rbegin()]*std::get<4>(info),
reinterpret_cast<const float*>(embed.values) + char_to_id[*genstr.rbegin()]*std::get<4>(info) + std::get<4>(info),
data[0]);
stepOnce(data, buffers, sizes, indices, 6, stream, context);
cudaStreamSynchronize(stream);
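// Greedy decoding: pick the id with the highest probability and append its character.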
float* probabilities = reinterpret_cast<float*>(data[3]); // stepOnce copied the "prob" binding into data[3]
ptrdiff_t idx = std::max_element(probabilities, probabilities + sizes[3]) - probabilities;
genstr.push_back(id_to_char[idx]);
}
fprintf(stdout, "Received: %s\n", genstr.c_str() + input.size());
// release the stream and the buffers
cudaStreamDestroy(stream);
for (int x = 0; x < numBindings; ++x) {
if (indices[x] != -1) cudaFree(buffers[indices[x]]);
if (data[x]) delete [] data[x];
}
return genstr == (input + expected);
}
} // namespace
int test_charrnn()
{
const NET_INFO info(2, 1, 512, 1, 512, 50, "data", "hiddenIn", "cellIn", "prob", "hiddenOut", "cellOut");
Logger logger; // multiple instances of IRuntime and/or IBuilder must all use the same logger
// create a model using the API directly and serialize it to a stream
nvinfer1::IHostMemory* modelStream{ nullptr };
std::map<std::string, nvinfer1::Weights> weightMap = loadWeights("models/char-rnn.wts");
APIToModel(weightMap, &modelStream, info, logger);
const std::vector<std::string> in_strs {"customer serv", "business plans", "help", "slightly under", "market",
"holiday cards", "bring it", "what time", "the owner thinks", "money can be use"};
const std::vector<std::string> out_strs { "es and the", " to be a", "en and", "iting the company", "ing and",
" the company", " company said it will", "d and the company", "ist with the", "d to be a"};
CHECK(in_strs.size() == out_strs.size());
PluginFactory pluginFactory;
nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), &pluginFactory);
nvinfer1::IExecutionContext* context = engine->createExecutionContext();
for (std::size_t num = 0; num < in_strs.size(); ++num) {
bool pass {false};
fprintf(stdout, "RNN Warmup: %s, Expect: %s\n", in_strs[num].c_str(), out_strs[num].c_str());
pass = doInference(*context, in_strs[num], out_strs[num], weightMap, info);
if (!pass) fprintf(stderr, "Failure!\n");
}
if (modelStream) modelStream->destroy();
for (auto& mem : weightMap) {
free((void*)(mem.second.values));
}
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
pluginFactory.destroyPlugin();
return 0;
}
The execution result is as follows:
The build steps for the test code are as follows (ReadMe.txt):
Steps to build the test code in TensorRT_Test with CMake on Linux:
1. Change the terminal's working directory to CUDA_Test/prj/linux_tensorrt_cmake and run the following commands in order:
$ mkdir build
$ cd build
$ cmake ..
$ make (generates the TensorRT_Test executable)
$ ln -s ../../../test_data/models ./ (symlink the models directory into the build directory)
$ ln -s ../../../test_data/images ./ (symlink the images directory into the build directory)
$ ./TensorRT_Test
2. For operations that read images via OpenCV, first change the image paths in the corresponding files to a path format supported by Linux.
GitHub: https://github.com/fengbingchun/CUDA_Test