[Cambricon MLU220 Model Porting] (1) Object Detection with YOLOv8

Preface: object detection is probably what most people use, so let me finish this article first. For exporting YOLOv5 you can refer to the official documentation. Also, I am a bit lazy: having already been through this once, I don't want to verify everything a second time. If a mistake or omission in the steps causes problems along the way, try debugging it yourself first; the overall workflow is as described here.

I. Modifying the Source Code

First, pull the latest YOLOv8 source code (the latest commit is fine) and download a model of any size:

git clone https://github.com/ultralytics/ultralytics.git
wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt

Since the MLU220 supports PyTorch 1.3 at most, some of the libraries YOLOv8 depends on are unavailable and have to be trimmed out of the source code.
Also, PyTorch 1.3 cannot load models saved with the zipfile serialization that torch.save uses by default from PyTorch 1.4 onward, so the checkpoint has to be loaded once (in a modern environment) and re-saved in the legacy format:

# Run this in a modern PyTorch + ultralytics environment. Re-saving the whole
# checkpoint dict (rather than a bare state_dict) keeps ckpt["model"] available
# for the loading code in step (4) below.
ckpt = torch.load("yolov8n.pt", map_location="cpu")
torch.save(ckpt, "yolov8n_unzip.pt", _use_new_zipfile_serialization=False)
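As a quick check, the re-saved file should now load in the PyTorch 1.3 environment (a minimal sketch; it assumes the trimmed ultralytics source from the steps below is already importable there, since the checkpoint pickles the model classes):

import torch
ckpt = torch.load("yolov8n_unzip.pt", map_location="cpu")
print(type(ckpt), list(ckpt.keys()))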
(1) GELU is not supported
  1. Edit the /torch/venv3/pytorch/lib/python3.6/site-packages/torch/nn/modules/activation.py file:
import math

class GELU(Module):
    @staticmethod
    def forward(x):
        # tanh approximation of GELU; torch.pi does not exist in PyTorch 1.3,
        # so use math.sqrt / math.pi on the Python-scalar part
        return x * 0.5 * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
  2. Edit the /torch/venv3/pytorch/lib/python3.6/site-packages/torch/nn/modules/__init__.py file:
from .activation import Threshold, ReLU, ..., GELU
__all__ = [
    'SiLU', 'Hardswish', "GELU", 'Module', ...]
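To double-check the replacement, the tanh approximation can be compared against PyTorch's exact GELU in a modern environment; a quick sketch (the maximum difference should be small, well under 1e-2):

import math
import torch
import torch.nn.functional as F

x = torch.linspace(-4, 4, steps=1000)
approx = x * 0.5 * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x.pow(3))))
print((approx - F.gelu(x)).abs().max())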
(2) ModuleNotFoundError: No module named 'importlib.metadata'
In ultralytics/utils/__init__.py, replace importlib.metadata with importlib_metadata.
In ultralytics/utils/checks.py, replace from importlib import metadata with import importlib_metadata.
In ultralytics/utils/checks.py, replace metadata with importlib_metadata.
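The root cause is that importlib.metadata only exists from Python 3.8 onward, while this toolchain runs on Python 3.6, so the backport package is needed (pip install importlib_metadata). An alternative to the blanket replace is a small compatibility shim, sketched here:

try:
    from importlib import metadata  # Python 3.8+
except ImportError:
    import importlib_metadata as metadata  # backport for Python 3.6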
(3) Rename ultralytics/hub to ultralytics/hub_bak
In ultralytics/engine/model.py, comment out from ultralytics.hub.utils import HUB_WEB_ROOT.
In ultralytics/engine/model.py, comment out the function calls that reference HUB_WEB_ROOT.
(4) It is recommended to build the model directly with the DetectionModel class; segmentation, classification, etc. work the same way:
import torch
from ultralytics.nn.tasks import yaml_model_load, DetectionModel

model = DetectionModel(cfg=yaml_model_load("yolov8n.yaml"))
ckpt = torch.load("./weights/yolov8n_unzip.pt", map_location="cpu")
model.load_state_dict(ckpt["model"].state_dict())
(5) The post-processing layers are not supported

Edit the ultralytics/ultralytics/nn/modules/head.py file.

Find the def _inference(self, x): function and change its tail to the following (the anchor/stride decoding is dropped; the network now returns the raw DFL box distances and sigmoid class scores, and the decoding moves to the host-side post-processing in Part III):
        # if self.dynamic or self.shape != shape:
        #     self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
        #     self.shape = shape

        box = x_cat[:, : self.reg_max * 4]
        cls = x_cat[:, self.reg_max * 4 :]
        return self.dfl(box), cls.sigmoid()
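After this change, a 640x640 input produces two outputs: DFL box distances of shape (1, 4, 8400) and class scores of shape (1, 80, 8400), where 8400 = 80*80 + 40*40 + 20*20 cells for strides 8/16/32. A quick sanity check using the model built in step (4) (a sketch; depending on the ultralytics version, eval-mode forward may also return the raw feature maps alongside this tuple):

model.eval()
with torch.no_grad():
    y = model(torch.zeros(1, 3, 640, 640))
# unwrap (boxes, scores) whether or not the feature maps are attached
boxes, scores = y[0] if isinstance(y[0], (tuple, list)) else y
print(boxes.shape, scores.shape)  # expect (1, 4, 8400) and (1, 80, 8400)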
(6) When quantizing and when exporting the cambricon model, Cambricon requires operator layers to be moved onto the matching device

How to understand this: during quantization the operators are torch.nn.Conv2d; during export they are torch_mlu.nn.Conv2d. Although building the model involves a model.to(devices) call, some operators inside the modules are not covered by it and still have to be moved by hand, which is why the file below is modified to add explicit device moves.
Edit the ultralytics/ultralytics/nn/modules/block.py file.

  1. At the top of the file, add:
devices = 'cpu'
  2. In the forward that ends with return self.cv3(self.cv2(self.upsample(self.cv1(x)))), move self.upsample to the target device first (see the sketch after this step for flipping the flag between the two passes):
def forward(self, x):
    """Performs a forward pass through layers using an upsampled input image."""
    self.upsample = self.upsample.to(devices)
    return self.cv3(self.cv2(self.upsample(self.cv1(x))))
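Since devices is a plain module-level variable, it can also be flipped from the export script instead of editing the file between the two passes; a minimal sketch, assuming the devices variable added above:

import ultralytics.nn.modules.block as block

block.devices = 'cpu'   # quantization pass (python export.py --cfg qua)
# block.devices = 'mlu' # export pass (python export.py --cfg mlu)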

II. Export Code

import torch
import argparse

import torch_mlu
import torch_mlu.core.mlu_quantize as mlu_quantize
import torch_mlu.core.mlu_model as ct

from ultralytics.nn.tasks import yaml_model_load, DetectionModel

import os
import cv2
import numpy as np

img_data = []
def load_data(file, img_size):
    global img_data
    if os.path.isfile(file):
        _, suffix = os.path.splitext(file)
        if suffix in [".jpg", ".jepg", ".bmp", ".png"]:
            img = cv2.imread(file)
            if img.shape[1] != img_size[1] or img.shape[0] != img_size[0]:
                img = cv2.resize(img, (img_size[0], img_size[1]))

            img = img.astype(np.float32)
            img = img / 255.0
            img = img.transpose((2, 0, 1))
            image = img[np.newaxis, :, :, :]
            image = np.array(image, dtype=np.float32)
            img_data.append(image)
    else:
        for f in os.listdir(file):
            load_data(os.path.join(file, f), img_size)

def model_qua():
    model = DetectionModel(cfg=yaml_model_load("yolov8s.yaml"))
    ckpt = torch.load("./weights/yolov8s_unzip.pt", map_location="cpu")
    model.load_state_dict(ckpt["model"].state_dict())

    # data_scale scales the input activations during calibration; per-channel
    # quantization and range averaging are disabled here
    qconfig = {
        'data_scale': 1.0,
        'perchannel': False,
        'use_avg': False
    }
    # gen_quant=True runs the model in calibration mode to collect int16 params
    quantized_model = mlu_quantize.quantize_dynamic_mlu(model, qconfig, dtype='int16', gen_quant=True)

    # forward passes over the calibration images collect the activation ranges
    for img in img_data:
        img = torch.from_numpy(img).to("cpu")
        pred_1 = quantized_model(img)
    torch.save(quantized_model.state_dict(), r'./weights/yolov8s_unzip_int16.pt')
    print('run qua')

def convert_mlu():
    if opt.fake_device:
        # fake-device mode: generate the MLU220 offline model without a physical board
        ct.set_device(-1)
        ct.set_core_number(0)
        ct.set_core_version('MLU220')

    model = DetectionModel(cfg=yaml_model_load("yolov8s.yaml"))
    # print(model)

    quantized_net = torch_mlu.core.mlu_quantize.quantize_dynamic_mlu(model)
    state_dict = torch.load(r'./weights/yolov8s_unzip_int16.pt')
    quantized_net.load_state_dict(state_dict, strict=False)
    quantized_net.eval()
    quantized_net.to(ct.mlu_device())

    if opt.jit:
        print("### jit")
        # the fused graph is dumped as yolov8s_torch1.3.cambricon
        ct.save_as_cambricon('yolov8s_torch1.3')
        torch.set_grad_enabled(False)
        ct.set_core_number(4)
    trace_input = torch.randn((1, 3, 640, 640), dtype=torch.float)
    input_mlu_data = trace_input.type(torch.HalfTensor).to(ct.mlu_device())
    quantized_net = torch.jit.trace(quantized_net, input_mlu_data, check_trace = False)

    with torch.no_grad():
        for img in img_data:
            img = torch.from_numpy(img).type(torch.HalfTensor).to(ct.mlu_device())
            pred = quantized_net(img)
            print('run mlu')
            # one forward pass is enough to trigger fusion and write the offline model
            return


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default='yolov8s.pt', help='model.pt path(s)')
    parser.add_argument('--source', type=str, default='../data/2', help='source')  # file/folder, 0 for webcam
    parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--cfg', default='qua', help="'qua' to quantize, 'mlu' to export")
    parser.add_argument('--fake_device', type=bool, default=True)
    parser.add_argument('--jit', type=bool, default=True)
    opt = parser.parse_args()

    load_data(opt.source, img_size=[opt.img_size, opt.img_size])
    print("img_data=", len(img_data))

    # compare_output()

    with torch.no_grad():
        # the PyTorch 1.3 re-save step happens in the modern environment
        # (see Part I), so only the quantize and export passes run here
        if opt.cfg == "qua":
            model_qua()
        elif opt.cfg == "mlu":
            convert_mlu()

Note: set devices = 'cpu' when quantizing and devices = 'mlu' when exporting, then run the two passes:

python export.py --cfg qua
python export.py --cfg mlu

III. Post-Processing

Note that the exported YOLOv8 has no post-processing module, as shown below:

[Figure: the exported YOLOv8 graph, which ends at the DFL box distances and class scores with no decode layers]

Of course, this is easy to solve: the decode is just a handful of sub, add, and div operations, which can run on the CPU. Below is the complete C++ implementation of the YOLOv8 post-processing:

#ifndef _YOLOV8_H
#define _YOLOV8_H

#include "XRModelAPI.h"
#include "algorithm_struct_base.h"

class YoLoV8 : public ModelAPI
{
public:
    YoLoV8(MODEL_INFO_S& info);
    virtual ~YoLoV8() {}

    ZR_ErrorCode PostProcess(std::vector<ObjectResult>& results);

public:
    void getWidth() {this->model_width = modelWidth_;}
    void getHeight() {this->model_height = modelHeight_;}

    uint32_t model_width;
    uint32_t model_height;

protected:
    ZR_ErrorCode get_anchor_centers(uint8_t stride);
    void get_bbox_key(uint8_t stride, float* detection, float* confi, std::vector<ObjectResult>& results);

private:
    std::map<uint8_t, std::vector<POINT_2D_S>> centers_points;
    uint8_t num_class = 80;
    uint8_t stride[3] = {8, 16, 32};
    // uint8_t stride[3] = {32, 16, 8};
    float scoreThreshold = 0.10f;
    float nmsThreshold = 0.35f;
    uint8_t downsample = 3;
};

#endif  // _YOLOV8_H

// ---- yolov8.cpp ----
#include "yolov8.h"

YoLoV8::YoLoV8(MODEL_INFO_S& info) : ModelAPI(info)
{
    for (int i = 0; i < this->downsample; ++i) {
        this->get_anchor_centers(stride[i]);
    }
}

namespace {
    static char* class_labels[80] = {(char*)"person", (char*)"bicycle", (char*)"car", (char*)"motorcycle",
            (char*)"airplane", (char*)"bus", (char*)"train", (char*)"truck", (char*)"boat", (char*)"traffic light",
            (char*)"fire hydrant", (char*)"stop sign", (char*)"parking meter", (char*)"bench", (char*)"bird",
            (char*)"cat", (char*)"dog", (char*)"horse", (char*)"sheep", (char*)"cow", (char*)"elephant",
            (char*)"bear", (char*)"zebra", (char*)"giraffe", (char*)"backpack", (char*)"umbrella", (char*)"handbag",
            (char*)"tie", (char*)"suitcase", (char*)"frisbee", (char*)"skis", (char*)"snowboard",
            (char*)"sports ball", (char*)"kite", (char*)"baseball bat", (char*)"baseball glove",
            (char*)"skateboard", (char*)"surfboard", (char*)"tennis racket", (char*)"bottle", (char*)"wine glass",
            (char*)"cup", (char*)"fork", (char*)"knife", (char*)"spoon", (char*)"bowl", (char*)"banana",
            (char*)"apple", (char*)"sandwich", (char*)"orange", (char*)"broccoli", (char*)"carrot",
            (char*)"hot dog", (char*)"pizza", (char*)"donut", (char*)"cake", (char*)"chair", (char*)"couch",
            (char*)"potted plant", (char*)"bed", (char*)"dining table", (char*)"toilet", (char*)"tv/monitor",
            (char*)"laptop", (char*)"mouse", (char*)"remote", (char*)"keyboard", (char*)"cell phone",
            (char*)"microwave", (char*)"oven", (char*)"toaster", (char*)"sink", (char*)"refrigerator",
            (char*)"book", (char*)"clock", (char*)"vase", (char*)"scissors", (char*)"teddy bear",
            (char*)"hair drier", (char*)"toothbrush"};
};

ZR_ErrorCode YoLoV8::get_anchor_centers(uint8_t stride)
{
    int net_grid_w = modelWidth_ / stride;
    int net_grid_h = modelHeight_ / stride;
    
    for (int i = 0; i < net_grid_h; ++i) {
        for (int j = 0; j < net_grid_w; ++j) {
            POINT_2D_S center;  // anchor-free cell center on the stride grid
            center.cx = j + 0.5;
            center.cy = i + 0.5;
            centers_points[stride].push_back(center);
        }
    }

    return ZR_AI_OK;
}

ZR_ErrorCode YoLoV8::PostProcess(std::vector<ObjectResult>& results)
{
    uint32_t dataSize = 0;
#if defined(BUILD_WITH_CAMBIRCON)
    float* detection = (float* )model->GetInferenceOutputItem(0, dataSize);
    spdlog::debug("dataSize={} ", dataSize);
    float* confi = (float* )model->GetInferenceOutputItem(1, dataSize);
    spdlog::debug("dataSize={} ", dataSize);
#else
    float* detection = (float* )model->GetInferenceOutputItem(1, dataSize);
    spdlog::debug("dataSize={} ", dataSize);
    float* confi = (float* )model->GetInferenceOutputItem(0, dataSize);
    spdlog::debug("dataSize={} ", dataSize);
#endif

    if (detection == nullptr || confi == nullptr) {
        return ZR_AI_OK;  // nothing to decode
    }

    // the three output levels are laid out back to back: stride 8, then 16, then 32
    std::vector<ObjectResult> object;
    for (int i = 0; i < this->downsample; ++i) {
        this->get_bbox_key(stride[i], detection, confi, object);

        int cells = (modelWidth_ / stride[i]) * (modelHeight_ / stride[i]);
        detection += cells * 4;
        confi += cells * num_class;
    }

    if (object.size() != 0) {
        Utils::nms(object, results, nmsThreshold);
        object.clear();
    }

    return ZR_AI_OK;
}

void YoLoV8::get_bbox_key(uint8_t stride, float* detection, float* confi, std::vector<ObjectResult>& results)
{
    int net_grid_w = modelWidth_ / stride;
    int net_grid_h = modelHeight_ / stride;

    std::vector<POINT_2D_S> all_points = centers_points[stride];
    for (int i = 0; i < net_grid_w * net_grid_h; ++i) {
        const float* temp_d = detection + i * 4;
        const float* temp_c = confi + i * num_class;

        uint8_t topClass = 0;
        for (uint8_t j = 0; j < num_class; ++j) {
            if (temp_c[j] > temp_c[topClass]) {
                topClass = j;
            }
        }

        if (temp_c[topClass] < scoreThreshold) continue;

        // matches the decode that was removed from the network head in Part I, step (5)
        float sub_x = all_points[i].cx - temp_d[0];
        float sub_y = all_points[i].cy - temp_d[1];
        float add_w = temp_d[2] + all_points[i].cx;
        float add_h = temp_d[3] + all_points[i].cy;

        float x_center = ((sub_x + add_w) * 0.5f) * stride;
        float y_center = ((sub_y + add_h) * 0.5f) * stride;
        float w = (add_w - sub_x) * stride;
        float h = (add_h - sub_y) * stride;

        if (w < 5.f || h < 5.f) continue;

        ObjectResult obj;
        obj.confidence = temp_c[topClass];
        obj.name = class_labels[topClass];
        obj.class_index = topClass;
        obj.bbox.x_min = std::max((x_center - w * 0.5f), 0.f);
        obj.bbox.y_min = std::max((y_center - h * 0.5f), 0.f);
        obj.bbox.x_max = std::min((x_center + w * 0.5f), float(modelWidth_));
        obj.bbox.y_max = std::min((y_center + h * 0.5f), float(modelHeight_));

        results.push_back(obj);
    }
}
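For cross-checking the C++ above, here is a NumPy reference of the same per-level decode (a sketch: it assumes the level's outputs are already cell-major, i.e. dist is an (N, 4) array of (l, t, r, b) distances in grid units and cls an (N, 80) array of sigmoid scores, matching the indexing in get_bbox_key):

import numpy as np

def decode_level(dist, cls, stride, in_w=640, in_h=640, score_thr=0.10):
    # anchor centers at (j + 0.5, i + 0.5) on the stride grid, row-major
    gh, gw = in_h // stride, in_w // stride
    ys, xs = np.meshgrid(np.arange(gh), np.arange(gw), indexing="ij")
    cx = xs.reshape(-1) + 0.5
    cy = ys.reshape(-1) + 0.5

    # distances to the four box sides -> corners, scaled back to input pixels
    x1 = (cx - dist[:, 0]) * stride
    y1 = (cy - dist[:, 1]) * stride
    x2 = (cx + dist[:, 2]) * stride
    y2 = (cy + dist[:, 3]) * stride

    best = cls.argmax(axis=1)                  # top class per cell
    conf = cls[np.arange(cls.shape[0]), best]  # its score
    keep = conf >= score_thr
    return np.stack([x1, y1, x2, y2], axis=1)[keep], best[keep], conf[keep]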