pytorch转TensorRT 模型转换（以retinaface为例）

highoooo

已于 2022-03-07 16:12:45 修改

阅读量1.7k

点赞数 1

分类专栏： TensorRT AI C++ 文章标签： python c++

于 2022-03-07 15:51:40 首次发布

本文链接：https://blog.csdn.net/highoooo/article/details/123331447

版权

C++ 同时被 3 个专栏收录

35 篇文章 0 订阅

订阅专栏

27 篇文章 0 订阅

订阅专栏

TensorRT

11 篇文章 0 订阅

订阅专栏

Trail 1 ：.pth->.wts->.engine

1.1 pth文件转为key value的字典文件便于c++解析（.pth->.wts）

def main():
    """Dump the weights of ``retinaface.pth`` into the text file ``retinaface.wts``.

    The model is loaded, moved to ``cuda:0``, switched to eval mode and run once
    on an all-ones ``(1, 3, 384, 640)`` tensor, printing the model, the input and
    the output. If ``retinaface.wts`` already exists nothing is written.

    File layout written here:
      * first line: the number of entries in the state dict;
      * one line per entry: ``<key> <element count>`` followed by one
        space-separated hex string per element, each being the big-endian
        IEEE-754 float32 encoding of that element.
    """
    print('cuda device count: ', torch.cuda.device_count())
    device = 'cuda:0'
    # NOTE(review): torch.load unpickles the whole model object; only use it on
    # checkpoints from a trusted source.
    net = torch.load('retinaface.pth')
    net = net.to(device)
    net.eval()
    print('model: ', net)
    #print('state dict: ', net.state_dict().keys())
    tmp = torch.ones(1, 3, 384, 640).to(device)
    print('input: ', tmp)
    out = net(tmp)
    print('output:', out)

    if os.path.exists('retinaface.wts'):
        return

    # Build the state dict once instead of once for the count and once for the loop.
    state = net.state_dict()
    # The context manager guarantees the file is flushed and closed, even if
    # writing fails part-way through.
    with open("retinaface.wts", 'w') as f:
        f.write("{}\n".format(len(state)))
        for k, v in state.items():
            print('key: ', k)
            print('value: ', v.shape)
            # Flatten to 1-D on the host so the values can be iterated one by one.
            vr = v.reshape(-1).cpu().numpy()
            f.write("{} {}".format(k, len(vr)))
            for vv in vr:
                f.write(" ")
                f.write(struct.pack(">f", float(vv)).hex())
            f.write("\n")

1.2 wts文件转为engine文件

1.2.1 输出流文件.engine 将wts内容序列化后写入（核心函数APIToModel）

IHostMemory* modelStream{nullptr};
APIToModel(BATCH_SIZE, &modelStream); ★★★★★
assert(modelStream != nullptr);
std::ofstream p("arcface-mobilefacenet.engine", std::ios::binary);
if (!p) {
    std::cerr << "could not open plan output file" << std::endl;
    return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
modelStream->destroy();

1.2.2 APIToModel（核心函数createEngine）

/// Builds the engine via createEngine() and hands back its serialized form.
///
/// @param maxBatchSize  Forwarded unchanged to createEngine().
/// @param modelStream   Out-parameter; receives the result of engine->serialize().
///                      The caller is responsible for destroying it.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
    // Create builder
    IBuilder* builder = createInferBuilder(gLogger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network, then set the outputs and create an engine
    // (key step: createEngine does the actual network construction).
    ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
    assert(engine != nullptr);

    // Serialize the engine
    (*modelStream) = engine->serialize();

    // Close everything down; the builder config is released as well so it is not leaked.
    engine->destroy();
    config->destroy();
    builder->destroy();
}

1.2.3 createEngine (核心函数 loadWeights)

/// Defines the network layer by layer, loads its weights from a .wts file and
/// builds a CUDA engine from it.
///
/// @param maxBatchSize  Passed to builder->setMaxBatchSize().
/// @param builder       Builder used to create the network and build the engine.
/// @param config        Builder config; workspace size (and FP16 when USE_FP16 is
///                      defined) are set on it here.
/// @param dt            Data type of the network input tensor.
/// @return The built engine, or nullptr when the build fails.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
    INetworkDefinition* network = builder->createNetworkV2(0U);

    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
    assert(data);

    // Key step: read the weights exported to the .wts file (path is hard-coded).
    std::map<std::string, Weights> weightMap = loadWeights("/home/nvidia/Desktop/projects/qys/face_infer/wts_model/arcface-m.wts");

    // Chain the layers; each one consumes output 0 of the previous layer.
    auto conv_1 = conv_bn_relu(network, weightMap, *data, "conv_1", 64, 3, 1, 2);
    auto conv_2_dw = conv_bn_relu(network, weightMap, *conv_1->getOutput(0), "conv_2_dw", 64, 3, 1, 1, 64);
    auto conv_23 = DepthWise(network, weightMap, *conv_2_dw->getOutput(0), "dconv_23", 64, 64, 128, 2);
    auto res_3_block0 = DWResidual(network, weightMap, *conv_23->getOutput(0), "res_3_block0", 64, 64, 128, 1);
    auto res_3_block1 = DWResidual(network, weightMap, *res_3_block0->getOutput(0), "res_3_block1", 64, 64, 128, 1);
    auto res_3_block2 = DWResidual(network, weightMap, *res_3_block1->getOutput(0), "res_3_block2", 64, 64, 128, 1);
    auto res_3_block3 = DWResidual(network, weightMap, *res_3_block2->getOutput(0), "res_3_block3", 64, 64, 128, 1);
    auto conv_34 = DepthWise(network, weightMap, *res_3_block3->getOutput(0), "dconv_34", 64, 128, 256, 2);
    auto res_4_block0 = DWResidual(network, weightMap, *conv_34->getOutput(0), "res_4_block0", 128, 128, 256, 1);
    auto res_4_block1 = DWResidual(network, weightMap, *res_4_block0->getOutput(0), "res_4_block1", 128, 128, 256, 1);
    auto res_4_block2 = DWResidual(network, weightMap, *res_4_block1->getOutput(0), "res_4_block2", 128, 128, 256, 1);
    auto res_4_block3 = DWResidual(network, weightMap, *res_4_block2->getOutput(0), "res_4_block3", 128, 128, 256, 1);
    auto res_4_block4 = DWResidual(network, weightMap, *res_4_block3->getOutput(0), "res_4_block4", 128, 128, 256, 1);
    auto res_4_block5 = DWResidual(network, weightMap, *res_4_block4->getOutput(0), "res_4_block5", 128, 128, 256, 1);
    auto conv_45 = DepthWise(network, weightMap, *res_4_block5->getOutput(0), "dconv_45", 128, 128, 512, 2);
    auto res_5_block0 = DWResidual(network, weightMap, *conv_45->getOutput(0), "res_5_block0", 128, 128, 256, 1);
    auto res_5_block1 = DWResidual(network, weightMap, *res_5_block0->getOutput(0), "res_5_block1", 128, 128, 256, 1);
    auto conv_6_sep = conv_bn_relu(network, weightMap, *res_5_block1->getOutput(0), "conv_6sep", 512, 1, 0, 1);
    auto conv_6dw7_7 = conv_bn(network, weightMap, *conv_6_sep->getOutput(0), "conv_6dw7_7", 512, 7, 0, 1, 512);
    IFullyConnectedLayer* fc1 = network->addFullyConnected(*conv_6dw7_7->getOutput(0), 128, weightMap["fc1_weight"], weightMap["pre_fc1_bias"]);
    assert(fc1);
    auto bn1 = addBatchNorm2d(network, weightMap, *fc1->getOutput(0), "fc1", 2e-5);
    assert(bn1);
    bn1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
    network->markOutput(*bn1->getOutput(0));

    // Build engine
    builder->setMaxBatchSize(maxBatchSize);
    config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
#ifdef USE_FP16
    config->setFlag(BuilderFlag::kFP16);
#endif
    std::cout << "Building engine, please wait for a while..." << std::endl;
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    // Only report success when an engine was actually produced.
    if (engine != nullptr) {
        std::cout << "Build engine successfully!" << std::endl;
    } else {
        std::cerr << "Build engine failed!" << std::endl;
    }

    // Don't need the network any more
    network->destroy();

    // Release host memory: the weight buffers come from malloc() in loadWeights(),
    // so they are released with free(). This happens after the build above.
    for (auto& mem : weightMap)
    {
        free(const_cast<void*>(mem.second.values));
    }

    return engine;
}

1.2.4 loadWeights（读入wts文件）

// TensorRT weight files have a simple space delimited format:
//   first token:        <number of blobs>
//   then, for each blob: <name> <size> <size values, each a hex-encoded 32-bit word>
//
// Reads every blob of `file` into a malloc()-allocated buffer and returns a map
// from blob name to a Weights descriptor (type kFLOAT, values = buffer,
// count = number of 32-bit words). The buffers are not released here; the
// caller must free() them.
std::map<std::string, Weights> loadWeights(const std::string file) {
    std::cout << "Loading weights: " << file << std::endl;
    std::map<std::string, Weights> weightMap;

    // Open weights file
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");

    // Read number of weight blobs
    int32_t count = 0;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");

    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t size = 0;

        // Read name and element count of blob (the count is decimal)
        std::string name;
        input >> name >> std::dec >> size;

        // Load blob. Allocate one 32-bit word per element; sizing by the pointer
        // variable itself would use the pointer width and over-allocate.
        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
        for (uint32_t x = 0, y = size; x < y; ++x)
        {
            input >> std::hex >> val[x];
        }
        // A failed stream here means the file was truncated or malformed.
        assert(!input.fail() && "Malformed weight file.");
        wt.values = val;

        wt.count = size;
        weightMap[name] = wt;
    }

    return weightMap;
}

1.2.5 各个module的实现

在这里插入图片描述

Trail 2 ：.pth->.onnx（->.engine）

2.1 pth转onnx

import onnx
import torch
from models.retinaface import RetinaFace

# Alternative checkpoint location, kept for reference only:
# pth_path = "./retinaface_m0.25.pth"
pth_path = "weights/mobilenet0.25_Final.pth"
onnx_path = "onnxs/mobilenet0.25_Final.onnx"
# Random (1, 3, 384, 640) tensor on the GPU, passed to torch.onnx.export as the
# example input. The name shadows the builtin `input` but is kept because
# pth2onnx() reads this module-level variable.
input = torch.randn(1, 3, 384, 640).cuda()

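# cfg_mnet = { }
# NOTE: `cfg_mnet` (the configuration passed to RetinaFace in load_pth) is not
# defined in this snippet; define or import it before calling load_pth().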

def load_pth(pth_path):
    """Create a RetinaFace model from ``cfg_mnet``, load the state dict stored
    at ``pth_path`` into it and return the model moved to the GPU."""
    net = RetinaFace(cfg_mnet)
    weights = torch.load(pth_path)
    net.load_state_dict(weights)
    net = net.cuda()
    return net

def pth2onnx(pth_model, onnx_path):
    """Export ``pth_model`` to an ONNX file at ``onnx_path``.

    The module-level ``input`` tensor is used as the example input for the
    export; the graph input is named 'input' and the output 'output'.
    """
    export_options = {
        "verbose": True,
        "input_names": ['input'],
        "output_names": ['output'],
    }
    torch.onnx.export(pth_model, input, onnx_path, **export_options)


# Script entry point: load the checkpoint at pth_path into a model, then export
# that model to the ONNX file at onnx_path.
if __name__ == '__main__':
    pth_model = load_pth(pth_path)
    pth2onnx(pth_model, onnx_path)

highoooo

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
pytorch转TensorRT 模型转换（以retinaface为例）

Trail 1 ：.pth->.wts->.engine1.1 pth文件转为key value的字典文件便于c++解析（.pth->.wts）def main(): print('cuda device count: ', torch.cuda.device_count()) device = 'cuda:0' net = torch.load('retinaface.pth') net = net.to(device) net.eval()
复制链接

扫一扫