Accelerating a Neural Network with TensorRT on the Jetson Xavier NX

Before starting, this tutorial assumes the NX already has Python, PyTorch, and the rest of the deep-learning environment installed.

1. Prepare the packages you will need: tensorrtx (a third-party TensorRT implementation of popular networks), the yolov5 source code, and your trained yolov5 weights. Note that the yolov5 source version must match the tensorrtx version.
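For example, a matching v5.0 pair can be fetched as below (a sketch; the tensorrtx tag name is an assumption, so check that repository's tags for the one matching your yolov5 version):

git clone -b v5.0 https://github.com/ultralytics/yolov5.git yolov5-5.0
git clone -b yolov5-v5.0 https://github.com/wang-xinyu/tensorrtx.git tensorrtx-yolov5-v5.0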
2. Check whether CUDA is installed.

Open a terminal and run nvcc -V. If it prints compiler version information like the sample below, CUDA is already installed.
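The exact text depends on your JetPack/CUDA version; on a stock CUDA 10.2 install it looks roughly like this:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Cuda compilation tools, release 10.2, V10.2.300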

If nvcc is not found, reconfigure the CUDA paths as follows:

2.1 Open a terminal and edit your shell profile:

vi ~/.bashrc

2.2 Scroll to the bottom and append the following (mind your own install path and CUDA version; on a stock JetPack install these usually need no changes):

export PATH=/usr/local/cuda-10.2/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export CUDA_ROOT=/usr/local/cuda

2.3 Reload the profile:

source ~/.bashrc

2.4 Run nvcc -V again to verify that the configuration worked.

3. Open a terminal and activate your deep-learning environment. This step assumes anyone following the tutorial already has anaconda, python, pytorch, etc. installed; if not, look up an installation guide first.

Activate the environment with conda activate <your environment name>.
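For example, with a hypothetical environment named yolov5_env:

conda activate yolov5_env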

4. Put the downloaded weights file, yolov5s.pt, in the root of the yolov5-5.0 directory.

5. In your deep-learning environment, i.e. the same terminal as step 3, run the command below from the yolov5-5.0 directory. It takes a while; when it finishes, a new file, yolov5s.wts, appears in the yolov5-5.0 folder.

python3 gen_wts.py -w yolov5s.pt

(The exact flag spelling varies between tensorrtx versions; check the script's help if this form is rejected.)
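Note that gen_wts.py ships with tensorrtx rather than yolov5, so copy it over first. A sketch of the whole step, assuming both repositories sit in your home directory (adjust the paths to your layout):

cp ~/tensorrtx-yolov5-v5.0/yolov5/gen_wts.py ~/yolov5-5.0/
cd ~/yolov5-5.0
python3 gen_wts.py -w yolov5s.pt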

6. Open the tensorrtx folder and copy the newly generated yolov5s.wts into its yolov5 subfolder (tensorrtx-yolov5-v5.0/yolov5).

 

7. In the same directory, find and open yololayer.h; there you can adjust parameters including but not limited to the number of classes, the input image size, and so on.
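For orientation, the relevant constants near the top of yololayer.h look roughly like this sketch (values are from a stock copy and may differ in yours; CLASS_NUM is the one that must match your dataset):

namespace Yolo
{
    static constexpr int CLASS_NUM = 80;               // change to your number of classes
    static constexpr int INPUT_H = 608;                // network input height
    static constexpr int INPUT_W = 608;                // network input width
    static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; // cap on boxes per image
}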

 

8. Then, still inside tensorrtx-yolov5-v5.0/yolov5, right-click to open a terminal there and run:

mkdir build   (create a folder to hold all the build artifacts)

cd build   (enter the build folder)

cmake ..   (configure the project)

make   (compile)
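If the compile succeeds, the build folder should contain the yolov5 executable and, in this tensorrtx version, a libmyplugins.so plugin library; a quick sanity check:

ls
(expect to see yolov5 and libmyplugins.so among the CMake outputs)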

9. Check the build output.

If the build runs cleanly, everything is in order. If instead it fails with errors about missing header files, TensorRT itself is probably not installed. Install it with:

sudo apt install nvidia-jetpack

Once the installation finishes, run the build again.

10. When the compile finishes, serialize the engine with the command below (-s means serialize the .wts weights into an engine, and the trailing s selects the yolov5s model variant):

sudo ./yolov5 -s yolov5s.wts yolov5s.engine s
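For reference, this tensorrtx version's serialization usage is roughly the following (recalled from the project's help text; check your copy if it differs):

sudo ./yolov5 -s [.wts] [.engine] [s/m/l/x/s6/m6/l6/x6 or c/c6 gd gw]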

 

11. Next, create a folder named Testing inside tensorrtx-yolov5-v5.0/yolov5 and put an image you want to test into it. Then run, from the build folder:

sudo ./yolov5 -d yolov5s.engine ../Testing
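A minimal sketch of this whole step, assuming the repository sits in your home directory and using a placeholder image path:

cd ~/tensorrtx-yolov5-v5.0/yolov5
mkdir Testing
cp /path/to/your_image.jpg Testing/
cd build
sudo ./yolov5 -d yolov5s.engine ../Testing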

12. Run on a camera. Warning: before doing anything below, back up the yolov5.cpp file mentioned earlier by copying it to another folder; you will recompile yolov5 afterwards.

Open the file:

sudo gedit yolov5.cpp

Then replace the contents of yolov5.cpp with the following:

#include <iostream>
#include <chrono>
#include "cuda_utils.h"
#include "logging.h"
#include "common.hpp"
#include "utils.h"
#include "calibrator.h"

#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
#define DEVICE 0  // GPU id
#define NMS_THRESH 0.4
#define CONF_THRESH 0.5
#define BATCH_SIZE 1

// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
static const int CLASS_NUM = Yolo::CLASS_NUM;
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;

// change this list to your own class names
char *my_classes[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
         "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
         "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
         "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
         "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
         "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
         "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
         "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
         "hair drier", "toothbrush" };

static int get_width(int x, float gw, int divisor = 8) {
    //return math.ceil(x / divisor) * divisor
    if (int(x * gw) % divisor == 0) {
        return int(x * gw);
    }
    return (int(x * gw / divisor) + 1) * divisor;
}

static int get_depth(int x, float gd) {
    if (x == 1) {
        return 1;
    }
    else {
        return round(x * gd) > 1 ? round(x * gd) : 1;
    }
}

// create the engine and the network definition
ICudaEngine* build_engine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    /* ------ yolov5 backbone------ */
    auto focus0 = focus(network, weightMap, *data, 3, get_width(64, gw), 3, "model.0");
    auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
    auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
    auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
    auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(9, gd), true, 1, 0.5, "model.4");
    auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
    auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
    auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7");
    auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, 9, 13, "model.8");

    /* ------ yolov5 head ------ */
    auto bottleneck_csp9 = C3(network, weightMap, *spp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.9");
    auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10");

    auto upsample11 = network->addResize(*conv10->getOutput(0));
    assert(upsample11);
    upsample11->setResizeMode(ResizeMode::kNEAREST);
    upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions());

    ITensor* inputTensors12[] = { upsample11->getOutput(0), bottleneck_csp6->getOutput(0) };
    auto cat12 = network->addConcatenation(inputTensors12, 2);
    auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13");
    auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14");

    auto upsample15 = network->addResize(*conv14->getOutput(0));
    assert(upsample15);
    upsample15->setResizeMode(ResizeMode::kNEAREST);
    upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions());

    ITensor* inputTensors16[] = { upsample15->getOutput(0), bottleneck_csp4->getOutput(0) };
    auto cat16 = network->addConcatenation(inputTensors16, 2);

    auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17");

    // yolo layer 0
    IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
    auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18");
    ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
    auto cat19 = network->addConcatenation(inputTensors19, 2);
    auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20");
    // yolo layer 1
    IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
    auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21");
    ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
    auto cat22 = network->addConcatenation(inputTensors22, 2);
    auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23");
    IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);

    auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector<IConvolutionLayer*>{det0, det1, det2});
    yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
    }

    return engine;
}

ICudaEngine* build_engine_p6(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, float& gd, float& gw, std::string& wts_name) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
    assert(data);

    std::map<std::string, Weights> weightMap = loadWeights(wts_name);

    /* ------ yolov5 backbone------ */
    auto focus0 = focus(network, weightMap, *data, 3, get_width(64, gw), 3, "model.0");
    auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1");
    auto c3_2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2");
    auto conv3 = convBlock(network, weightMap, *c3_2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3");
    auto c3_4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(9, gd), true, 1, 0.5, "model.4");
    auto conv5 = convBlock(network, weightMap, *c3_4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5");
    auto c3_6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6");
    auto conv7 = convBlock(network, weightMap, *c3_6->getOutput(0), get_width(768, gw), 3, 2, 1, "model.7");
    auto c3_8 = C3(network, weightMap, *conv7->getOutput(0), get_width(768, gw), get_width(768, gw), get_depth(3, gd), true, 1, 0.5, "model.8");
    auto conv9 = convBlock(network, weightMap, *c3_8->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.9");
    auto spp10 = SPP(network, weightMap, *conv9->getOutput(0), get_width(1024, gw), get_width(1024, gw), 3, 5, 7, "model.10");
    auto c3_11 = C3(network, weightMap, *spp10->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.11");

    /* ------ yolov5 head ------ */
    auto conv12 = convBlock(network, weightMap, *c3_11->getOutput(0), get_width(768, gw), 1, 1, 1, "model.12");
    auto upsample13 = network->addResize(*conv12->getOutput(0));
    assert(upsample13);
    upsample13->setResizeMode(ResizeMode::kNEAREST);
    upsample13->setOutputDimensions(c3_8->getOutput(0)->getDimensions());
    ITensor* inputTensors14[] = { upsample13->getOutput(0), c3_8->getOutput(0) };
    auto cat14 = network->addConcatenation(inputTensors14, 2);
    auto c3_15 = C3(network, weightMap, *cat14->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.15");

    auto conv16 = convBlock(network, weightMap, *c3_15->getOutput(0), get_width(512, gw), 1, 1, 1, "model.16");
    auto upsample17 = network->addResize(*conv16->getOutput(0));
    assert(upsample17);
    upsample17->setResizeMode(ResizeMode::kNEAREST);
    upsample17->setOutputDimensions(c3_6->getOutput(0)->getDimensions());
    ITensor* inputTensors18[] = { upsample17->getOutput(0), c3_6->getOutput(0) };
    auto cat18 = network->addConcatenation(inputTensors18, 2);
    auto c3_19 = C3(network, weightMap, *cat18->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.19");

    auto conv20 = convBlock(network, weightMap, *c3_19->getOutput(0), get_width(256, gw), 1, 1, 1, "model.20");
    auto upsample21 = network->addResize(*conv20->getOutput(0));
    assert(upsample21);
    upsample21->setResizeMode(ResizeMode::kNEAREST);
    upsample21->setOutputDimensions(c3_4->getOutput(0)->getDimensions());
    ITensor* inputTensors21[] = { upsample21->getOutput(0), c3_4->getOutput(0) };
    auto cat22 = network->addConcatenation(inputTensors21, 2);
    auto c3_23 = C3(network, weightMap, *cat22->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.23");

    auto conv24 = convBlock(network, weightMap, *c3_23->getOutput(0), get_width(256, gw), 3, 2, 1, "model.24");
    ITensor* inputTensors25[] = { conv24->getOutput(0), conv20->getOutput(0) };
    auto cat25 = network->addConcatenation(inputTensors25, 2);
    auto c3_26 = C3(network, weightMap, *cat25->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.26");

    auto conv27 = convBlock(network, weightMap, *c3_26->getOutput(0), get_width(512, gw), 3, 2, 1, "model.27");
    ITensor* inputTensors28[] = { conv27->getOutput(0), conv16->getOutput(0) };
    auto cat28 = network->addConcatenation(inputTensors28, 2);
    auto c3_29 = C3(network, weightMap, *cat28->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.29");

    auto conv30 = convBlock(network, weightMap, *c3_29->getOutput(0), get_width(768, gw), 3, 2, 1, "model.30");
    ITensor* inputTensors31[] = { conv30->getOutput(0), conv12->getOutput(0) };
    auto cat31 = network->addConcatenation(inputTensors31, 2);
    auto c3_32 = C3(network, weightMap, *cat31->getOutput(0), get_width(2048, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.32");

    /* ------ detect ------ */
    IConvolutionLayer* det0 = network->addConvolutionNd(*c3_23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.0.weight"], weightMap["model.33.m.0.bias"]);
    IConvolutionLayer* det1 = network->addConvolutionNd(*c3_26->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.1.weight"], weightMap["model.33.m.1.bias"]);
    IConvolutionLayer* det2 = network->addConvolutionNd(*c3_29->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.2.weight"], weightMap["model.33.m.2.bias"]);
    IConvolutionLayer* det3 = network->addConvolutionNd(*c3_32->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.33.m.3.weight"], weightMap["model.33.m.3.bias"]);

    auto yolo = addYoLoLayer(network, weightMap, "model.33", std::vector<IConvolutionLayer*>{det0, det1, det2, det3});
    yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*yolo->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#if defined(USE_FP16)
    config->setFlag(BuilderFlag::kFP16);
#elif defined(USE_INT8)
    std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl;
    assert(builder->platformHasFastInt8());
    config->setFlag(BuilderFlag::kINT8);
    Int8EntropyCalibrator2* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_BLOB_NAME);
    config->setInt8Calibrator(calibrator);
#endif

    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    std::cout << "Build engine successfully!" << std::endl;

    // Don't need the network any more
    network->destroy();

    // Release host memory
    for (auto& mem : weightMap)
    {
        free((void*)(mem.second.values));
    }

    return engine;
}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, float& gd, float& gw, std::string& wts_name) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    ICudaEngine* engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down
    engine->destroy();
    builder->destroy();
    config->destroy();
}

void doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output, int batchSize) {
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}

bool parse_args(int argc, char** argv, std::string& engine) {
    if (argc < 3) return false;
    if (std::string(argv[1]) == "-v" && argc == 3) {
        engine = std::string(argv[2]);
    }
    else {
        return false;
    }
    return true;
}

int main(int argc, char** argv) {
    cudaSetDevice(DEVICE);

    //std::string wts_name = "";
    std::string engine_name = "";
    //float gd = 0.0f, gw = 0.0f;
    //std::string img_dir;

    if (!parse_args(argc, argv, engine_name)) {
        std::cerr << "arguments not right!" << std::endl;
        std::cerr << "./yolov5 -v [.engine] // run inference with camera" << std::endl;
        return -1;
    }

    std::ifstream file(engine_name, std::ios::binary);
    if (!file.good()) {
        std::cerr << " read " << engine_name << " error! " << std::endl;
        return -1;
    }
    char* trtModelStream{ nullptr };
    size_t size = 0;
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();

    // prepare input data ---------------------------
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
    //    data[i] = 1.0;
    static float prob[BATCH_SIZE * OUTPUT_SIZE];
    IRuntime* runtime = createInferRuntime(gLogger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;
    assert(engine->getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
    CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // to read a local video file instead, use:
    //cv::VideoCapture capture("/home/nano/Videos/video.mp4");
    // open the local USB camera; index 1 works on my device, change it to 0 if 1 errors out
    cv::VideoCapture capture(1);
    if (!capture.isOpened()) {
        std::cout << "Error opening video stream or file" << std::endl;
        return -1;
    }

    int key;
    int fcount = 0;
    while (1)
    {
        cv::Mat frame;
        capture >> frame;
        if (frame.empty())
        {
            std::cout << "Fail to read image from camera!" << std::endl;
            break;
        }
        fcount++;
        //if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue;
        for (int b = 0; b < fcount; b++) {
            //cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
            cv::Mat img = frame;
            if (img.empty()) continue;
            cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox BGR to RGB
            int i = 0;
            for (int row = 0; row < INPUT_H; ++row) {
                uchar* uc_pixel = pr_img.data + row * pr_img.step;
                for (int col = 0; col < INPUT_W; ++col) {
                    data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
                    data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
                    data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
                    uc_pixel += 3;
                    ++i;
                }
            }
        }

        // Run inference
        auto start = std::chrono::system_clock::now();  // inference start time
        doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
        auto end = std::chrono::system_clock::now();  // inference end time
        //std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
        int fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
        std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
        }
        for (int b = 0; b < fcount; b++) {
            auto& res = batch_res[b];
            //std::cout << res.size() << std::endl;
            //cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]);
            for (size_t j = 0; j < res.size(); j++) {
                cv::Rect r = get_rect(frame, res[j].bbox);
                cv::rectangle(frame, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
                std::string label = my_classes[(int)res[j].class_id];
                cv::putText(frame, label, cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
                std::string jetson_fps = "FPS: " + std::to_string(fps);
                cv::putText(frame, jetson_fps, cv::Point(11, 80), cv::FONT_HERSHEY_PLAIN, 3, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
            }
            //cv::imwrite("_" + file_names[f - fcount + 1 + b], img);
        }
        cv::imshow("yolov5", frame);
        key = cv::waitKey(1);
        if (key == 'q') {
            break;
        }
        fcount = 0;
    }

    capture.release();
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CUDA_CHECK(cudaFree(buffers[inputIndex]));
    CUDA_CHECK(cudaFree(buffers[outputIndex]));
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}

After editing the code as the comments indicate (in general only these two spots, the class list and the camera index, need changes), delete the marker comments, taking care not to delete any code along with them.

Final step:

Open a terminal in tensorrtx-yolov5-v5.0/yolov5 and rebuild:

cd build

make

Finally, launch camera inference with:

sudo ./yolov5 -v yolov5s.engine

If it errors out, check that the camera is plugged in properly and that the camera index set in yolov5.cpp is correct.
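To see which camera indices exist on the system, a quick check (index N corresponds to /dev/videoN):

ls /dev/video*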

Note: after every change to a code file you must recompile following the steps above. Also, build and run the original image-detection version before building the camera version; skipping ahead will cause errors.
