【Pytorch FastestDet模型寒武纪200移植分享】

最新推荐文章于 2024-05-04 10:58:55 发布

玉米研究所

最新推荐文章于 2024-05-04 10:58:55 发布

阅读量780

点赞数

分类专栏：深度学习 文章标签：python 人工智能 深度学习 Powered by 金山文档

本文链接：https://blog.csdn.net/weixin_44112313/article/details/128563980

版权

深度学习专栏收录该内容

4 篇文章 0 订阅

订阅专栏

环境信息

设备：MLU270

网络：FastestDet

模型链接：https://github.com/dog-qiuqiu/FastestDet

参考文档：PyTorch 框架的 Yolov5 移植 – 寒武纪开发者社区 (cambricon.com)

环境准备

下载源码

git clone https://github.com/dog-qiuqiu/FastestDet

安装系统依赖

参看requirements.txt 文件，由于python版本和torch版本差异，软件包对应的版本会有些区别，供参考：

onnx                1.6.0
onnxruntime         1.10.0
onnxsim             0.4.10
opencv-python       3.4.2.17
tqdm                4.64.1
pycocotools         2.0.6

代码修改

修改module/shufflenetv2.py 代码

diff --git a/module/shufflenetv2.py b/module/shufflenetv2.py
index 89d0eb7..80cd875 100644
--- a/module/shufflenetv2.py
+++ b/module/shufflenetv2.py
@@ -54,7 +54,8 @@ class ShuffleV2Block(nn.Module):
             return torch.cat((self.branch_proj(x_proj), self.branch_main(x)), 1)
 
     def channel_shuffle(self, x):
-        batchsize, num_channels, height, width = x.data.size()
+        #batchsize, num_channels, height, width = x.data.size()
+        batchsize, num_channels, height, width = x.detach().size()
         assert (num_channels % 4 == 0)
         x = x.reshape(batchsize * num_channels // 2, 2, height * width)
         x = x.permute(1, 0, 2)

备注：.data 是早期版本的用法，新版本中已改用 detach() 代替。

修改utils/tool.py 文件后处理代码

在 MLU 环境使用的低版本 PyTorch（1.3）中，torchvision.ops.batched_nms 对应的接口是 torchvision.ops.boxes.batched_nms，需要改用后者。

测试发现 gy、gx 的 dtype 存在问题（torch.arange 生成的是整型张量），会导致结果错误，因此需要用 float() 转成浮点型。

diff --git a/utils/tool.py b/utils/tool.py
old mode 100644
new mode 100755
index bc7f07a..9cb945c
--- a/utils/tool.py
+++ b/utils/tool.py
@@ -77,6 +77,9 @@ def handle_preds(preds, device, conf_thresh=0.25, nms_thresh=0.45):
 
     # 检测框的坐标
     gy, gx = torch.meshgrid([torch.arange(H), torch.arange(W)])
+    gy = gy.float()
+    gx = gx.float()
+    
     bw, bh = preg[..., 2].sigmoid(), preg[..., 3].sigmoid() 
     bcx = (preg[..., 0].tanh() + gx.to(device)) / W
     bcy = (preg[..., 1].tanh() + gy.to(device)) / H
@@ -113,7 +116,8 @@ def handle_preds(preds, device, conf_thresh=0.25, nms_thresh=0.45):
             b = torch.Tensor(b).to(device)
             c = torch.Tensor(c).squeeze(1).to(device)
             s = torch.Tensor(s).squeeze(1).to(device)
-            keep = torchvision.ops.batched_nms(b, s, c, nms_thresh)
+            # keep = torchvision.ops.batched_nms(b, s, c, nms_thresh)
+            keep =  torchvision.ops.boxes.batched_nms(b, s, c, nms_thresh)
             for i in keep:
                 output.append(temp[i])
         output_bboxes.append(torch.Tensor(output))

添加模型转换代码mlu/gen_unzipmodel.py

该程序的作用是：把 PyTorch 1.6 及以上版本保存的 pt 文件（zip 格式）重新保存为非 zip 的旧格式，以便低版本 PyTorch 能够加载。

import os
import argparse

import sys
prj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print(prj_dir)
sys.path.append(prj_dir)

import torch
from utils.tool import *
from module.detector import Detector

if __name__ == '__main__':
    # Re-save a FastestDet checkpoint with the legacy (non-zipfile)
    # serialization so that a PyTorch release predating the zipfile format
    # can load it.
    parser = argparse.ArgumentParser()
    # Both inputs are mandatory: the former defaults ("" and None) are not
    # usable paths, and torch.load(None) only failed later with an obscure
    # traceback. argparse now reports the missing option directly.
    parser.add_argument('--yaml', type=str, required=True, help='.yaml config')
    parser.add_argument('--weight', type=str, required=True, help='.weight config')
    # Destination file; the default is the name that used to be hard-coded.
    parser.add_argument('--output', type=str, default="FastestDet_unzip.pt",
                        help='output path of the legacy-format checkpoint')
    opt = parser.parse_args()

    print("run on cpu...")
    device = torch.device("cpu")

    # Parse the yaml configuration file
    cfg = LoadYaml(opt.yaml)
    print(cfg)

    # Build the model and load the trained weights onto the CPU
    print("load weight from:%s"%opt.weight)
    model = Detector(cfg.category_num, True).to(device)
    model.load_state_dict(torch.load(opt.weight, map_location=device))
    # Put the module in eval mode
    model.eval()

    pt_file = opt.output

    print("save no zipfile ...")
    # _use_new_zipfile_serialization=False makes torch.save write the
    # pre-zipfile format instead of the newer zip archive.
    torch.save(model.state_dict(), pt_file, _use_new_zipfile_serialization=False)
    print("save %s end ..."%pt_file)

添加量化模型代码mlu/mlu_quant.py

基于test.py 进行修改，添加量化配置参数等：

---
#Added command-line argument: --qua_weight (default 'FastestDet_intx.pth')
parser.add_argument('--qua_weight', type=str,default='FastestDet_intx.pth', help='model.pt path(s)')

---
#Quantization parameter configuration
    # Import the MLU extensions and build the quantization config dict
    import torch_mlu
    import torch_mlu.core.mlu_model as ct
    import torch_mlu.core.mlu_quantize as mlu_quantize
    qconfig={'use_avg':False, 'data_scale':1.0, 'firstconv':False, 'per_channel': False}
    # Call the quantization interface on the loaded model (dtype 'int16', gen_quant=True)
    quantized_net = mlu_quantize.quantize_dynamic_mlu(model,qconfig_spec=qconfig, dtype='int16', gen_quant=True)
    # Switch to inference mode and cast to float
    quantized_net = quantized_net.eval().float()

    # Rebind `model` so the code that follows operates on the quantized network
    model = quantized_net

---
#Saving the quantized model
    # Write the model's state_dict to the path given by opt.qua_weight
    print("\n")
    qua_weight = opt.qua_weight
    print("SAVE quantize model:",qua_weight)
    torch.save(model.state_dict(),qua_weight)

添加MLU 运行代码mlu/mlu_test.py

基于test.py 进行修改，主要修改模型加载部分：

---
#Model loading
#Comment out the original weight loading:
# model.load_state_dict(torch.load(opt.weight, map_location=device))

    # Configure the number of MLU cores
    ct.set_core_number(opt.core_number)
    # Set the channel order of the input image, which decides the channel order
    # used to pad the 3-channel input of the first convolution. Default is RGBA order
    #ct.set_input_format(0)
    # Configure the MLU core type
    ct.set_core_version(opt.mcore)
    torch.set_grad_enabled(False)

    if opt.fake_device:
        print("fake device mode")
        ct.set_device(-1)

    mlu_device = ct.mlu_device()
    print("run on %s ..."%mlu_device)
    # Load the quantized model: wrap the network, then load the weights from opt.weight
    weight = opt.weight
    quantized_net = torch_mlu.core.mlu_quantize.quantize_dynamic_mlu(model)
    print('weight:',weight)
    state_dict = torch.load(weight)
    # strict=False: keys that do not match between state_dict and the module are tolerated
    quantized_net.load_state_dict(state_dict, strict=False)
    # Switch to inference mode, cast to float and move to the MLU device
    quantized_net = quantized_net.eval().float()
    quantized_net.to(mlu_device)

    model = quantized_net

    # Enable online fusion (JIT) mode
    if opt.jit:
        if opt.save:
            ct.save_as_cambricon(opt.mname)

        # `example` uses the requested batch size; `trace_input` always uses batch size 1
        example = torch.randn(opt.batch_size, 3, cfg.input_height, cfg.input_width,dtype=torch.float)
        trace_input = torch.randn(1, 3, cfg.input_height, cfg.input_width,dtype=torch.float)

        if opt.half_input:
            print('half_input ')
            trace_input = trace_input.type(torch.HalfTensor)
            example = example.type(torch.HalfTensor)

        print("jit trace example shape",example.shape)
        model = torch.jit.trace(model,trace_input.to(mlu_device),check_trace=False)

        # NOTE(review): this branch also runs for opt.fake_device without opt.save,
        # but save_as_cambricon(opt.mname) above is only called when opt.save is
        # set - confirm whether the two conditions are meant to differ.
        if opt.save or opt.fake_device:
            print("save offline model mname: ",opt.mname)
            # Run the traced model once on `example`, close the save, then exit
            model(example.to(mlu_device))
            ct.save_as_cambricon('')
            exit(0)
            
---
#Data handling
#The input data must be copied to the MLU device
img = img.to(mlu_device)

---
#Output handling
#Move the predictions back to the CPU; when opt.half_input is set they are also cast to FloatTensor
preds = preds.cpu().type(torch.FloatTensor) if opt.half_input else preds.cpu()

测试验证

转换成no zip 版本模型

 python3 mlu/gen_unzipmodel.py --yaml configs/coco.yaml --weight weights/weight_AP05\:0.253207_280-epoch.pth
 mv FastestDet_unzip.pt mlu/weight/

备注：需要在 1.6 及以上版本的 PyTorch 环境中进行转换

模型验证（CPU）

验证unzip 模型

python3 test.py --yaml configs/coco.yaml --weight mlu/weight/FastestDet_unzip.pt --img data/3.jpg

备注：可以在高版本容器内验证也可以在mlu容器内进行验证

模型验证（MLU）

模型量化

python3 mlu/mlu_quant.py --yaml configs/coco.yaml --weight mlu/weight/FastestDet_unzip.pt --img data/3.jpg
mv FastestDet_intx.pth mlu/weight/

模型验证

#逐层运行
python3 mlu/mlu_test.py  --yaml configs/coco.yaml --weight mlu/weight/FastestDet_intx.pth --img data/3.jpg

#融合模式运行
python3 mlu/mlu_test.py  --yaml configs/coco.yaml --weight mlu/weight/FastestDet_intx.pth --img data/3.jpg --jit

#生成270离线模型 4batch
python3 mlu/mlu_test.py  --yaml configs/coco.yaml --weight mlu/weight/FastestDet_intx.pth --img data/3.jpg --batch 4 --core 4 --mcore MLU270 --save --jit --mname mlu270_4b4c

注意：基于MLU需要运行量化后的模型。