Preface: I spent a day reading the paper and the code to understand the model, and an evening getting the ONNX deployment to work. It finally succeeded, so here is a record of the deployment process and some takeaways.
Task: study the latest depth estimation model DepthAnything and, following the earlier post "OnnxRuntime----Lite-Mono monocular depth estimation ONNX inference", implement DepthAnything inference and integrate it into the existing ONNX model series.
Paper: Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data
Source code: Depth-Anything GitHub
Reference code: Depth-Anything-ONNX
Analysis:
1) Understand the basic principle of DepthAnything and read through its code.
2) Convert the model to the more convenient and efficient ONNX format, then run (and verify) inference in OnnxRuntime.
- Real-world test results (the disparity outputs of the Python model (Pymodel) and the C++ model (Cmodel) have been cross-verified):
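The disparity cross-check mentioned above can be scripted. Below is a minimal sketch (the dump file names are hypothetical), assuming both the Python pipeline and the C++ pipeline write their raw 518x518 float32 disparity (before color mapping) to binary files:
import numpy as np
# Hypothetical dumps: e.g. disparity.astype(np.float32).tofile(...) on the Python side
# and fwrite() of the raw float buffer on the C++ side.
py_disp = np.fromfile("py_disparity.bin", dtype=np.float32).reshape(518, 518)
c_disp = np.fromfile("c_disparity.bin", dtype=np.float32).reshape(518, 518)
# Small differences are expected from resize interpolation and float accumulation order.
print("max abs diff :", np.abs(py_disp - c_disp).max())
print("mean abs diff:", np.abs(py_disp - c_disp).mean())
print("allclose     :", np.allclose(py_disp, c_disp, atol=1e-3))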
PyTorch to ONNX model export (fixed 518x518 size; inference test results)
import argparse
import torch
from onnx import load_model, save_model
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference

from depth_anything.dpt import DPT_DINOv2
from depth_anything.util.transform import load_image


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        choices=["s", "b", "l"],
        required=True,
        help="Model size variant. Available options: 's', 'b', 'l'.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        required=False,
        help="Path to save the ONNX model.",
    )
    return parser.parse_args()


def export_onnx(model: str, output: str = None):
    # Handle args
    if output is None:
        output = f"weights/depth_anything_vit{model}14_ori.onnx"

    # Device for tracing (use whichever has enough free memory)
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    # Sample input for tracing (fixed at 518x518, since dynamic axes are disabled below)
    # image, _ = load_image("assets/sacre_coeur1.jpg")
    image = torch.rand(1, 3, 518, 518).to(device)
    # image = torch.from_numpy(image).to(device)
    # Load model params
    if model == "s":
        depth_anything = DPT_DINOv2(
            encoder="vits", features=64, out_channels=[48, 96, 192, 384]
        )
    elif model == "b":
        depth_anything = DPT_DINOv2(
            encoder="vitb", features=128, out_channels=[96, 192, 384, 768]
        )
    else:  # model == "l"
        depth_anything = DPT_DINOv2(
            encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024]
        )

    depth_anything.to(device).load_state_dict(
        torch.hub.load_state_dict_from_url(
            f"https://hf-mirror.com/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vit{model}14.pth",
            map_location="cpu",
        ),
        strict=True,
    )
    depth_anything.eval()

    torch.onnx.export(
        depth_anything,
        image,
        output,
        input_names=["image"],
        output_names=["depth"],
        opset_version=17,
        # dynamic_axes={
        #     "image": {2: "height", 3: "width"},
        #     "depth": {2: "height", 3: "width"},
        # },
    )

    save_model(
        SymbolicShapeInference.infer_shapes(load_model(output), auto_merge=True),
        output,
    )


if __name__ == "__main__":
    args = parse_args()
    export_onnx(**vars(args))

# python export.py --model s
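After the export it is worth checking that the ONNX graph reproduces the PyTorch output before moving on to C++. A minimal verification sketch, assuming the ViT-S variant was exported to weights/depth_anything_vits14_ori.onnx as above (the PyTorch model is rebuilt the same way as in export_onnx):
import numpy as np
import onnxruntime as ort
import torch
from depth_anything.dpt import DPT_DINOv2

# Fixed 518x518 dummy input, same shape as used for tracing.
x = torch.rand(1, 3, 518, 518)

# PyTorch reference (ViT-S variant, weights loaded as in export_onnx).
model = DPT_DINOv2(encoder="vits", features=64, out_channels=[48, 96, 192, 384])
model.load_state_dict(
    torch.hub.load_state_dict_from_url(
        "https://hf-mirror.com/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vits14.pth",
        map_location="cpu",
    )
)
model.eval()
with torch.no_grad():
    ref = model(x).numpy()

# ONNX Runtime output for the same input.
sess = ort.InferenceSession(
    "weights/depth_anything_vits14_ori.onnx",
    providers=["CPUExecutionProvider"],
)
out = sess.run(["depth"], {"image": x.numpy()})[0]

# The two should agree up to small numerical differences from graph optimization.
print("max abs diff:", np.abs(ref.squeeze() - out.squeeze()).max())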
- OnnxRuntime Cmodel (C++ inference):
#include <assert.h>
#include <vector>
#include <ctime>
#include <iostream>
#include <onnxruntime_cxx_api.h>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/videoio.hpp>
#include "utils.cpp"
using namespace cv;
using namespace std;
class depthAnything
{
public:
    depthAnything(const wchar_t* onnx_model_path);
    std::vector<float> predict(std::vector<float>& input_data, int batch_size = 1, int index = 0);
    cv::Mat predict(cv::Mat& input_tensor, int batch_size = 1, int index = 0);

private:
    Ort::Env env;
    Ort::Session session;
    Ort::AllocatorWithDefaultOptions allocator;
    std::vector<const char*> input_node_names = {"image"};
    std::vector<const char*> output_node_names = {"depth"};
    std::vector<int64_t> input_node_dims;
    std::vector<int64_t> output_node_dims;
};
depthAnything::depthAnything(const wchar_t* onnx_model_path) : session(nullptr), env(nullptr)
{
    // init env
    this->env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "depthAnything_mono");
    // init session options
    Ort::SessionOptions session_options;
    // session_options.SetInterOpNumThreads(1);
    // session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    // create session and load the model into memory
    this->session = Ort::Session(env, onnx_model_path, session_options);
    // query input/output node counts and tensor shapes
    size_t num_input_nodes = session.GetInputCount();
    size_t num_output_nodes = session.GetOutputCount();
    for (size_t i = 0; i < num_input_nodes; i++)
    {
        Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        this->input_node_dims = tensor_info.GetShape();
    }
    for (size_t i = 0; i < num_output_nodes; i++)
    {
        Ort::TypeInfo type_info = session.GetOutputTypeInfo(i);
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        this->output_node_dims = tensor_info.GetShape();
    }
}
std::vector<float> depthAnything::predict(std::vector<float>& input_tensor_values, int batch_size, int index)
{
    this->input_node_dims[0] = batch_size;
    this->output_node_dims[0] = batch_size;
    std::vector<float> results;
    try
    {
        std::vector<const char*> output_node_names;
        if (index != -1)
        {
            output_node_names = { this->output_node_names[index] };
        }
        else
        {
            output_node_names = this->output_node_names;
        }
        auto input_tensor_size = input_tensor_values.size();
        auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
        Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
        auto output_tensors = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
        assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
        // copy the output while the Ort::Value (and its underlying buffer) is still alive
        const float* floatarr = output_tensors[0].GetTensorMutableData<float>();
        int64_t output_tensor_size = 1;
        for (auto& it : this->output_node_dims)
        {
            output_tensor_size *= it;
        }
        results.assign(floatarr, floatarr + output_tensor_size);
    }
    catch (Ort::Exception& e)
    {
        throw e;
    }
    return results;
}
cv::Mat depthAnything::predict(cv::Mat& input_tensor, int batch_size, int index)
{
    int input_tensor_size = input_tensor.cols * input_tensor.rows * 3;
    std::size_t counter = 0;
    std::vector<float> input_data(input_tensor_size);
    std::vector<float> output_data;
    try
    {
        // repack the HWC uint8 image into CHW float, scaled to [0, 1]
        for (int k = 0; k < 3; k++)
        {
            for (int i = 0; i < input_tensor.rows; i++)
            {
                for (int j = 0; j < input_tensor.cols; j++)
                {
                    input_data[counter++] = static_cast<float>(input_tensor.at<cv::Vec3b>(i, j)[k]) / 255.0f;
                }
            }
        }
    }
    catch (cv::Exception& e)
    {
        printf("%s\n", e.what());
    }
    try
    {
        output_data = this->predict(input_data);
    }
    catch (Ort::Exception& e)
    {
        throw e;
    }
    // reshape the flat output back to the 518x518 disparity map
    cv::Mat output_tensor(output_data);
    output_tensor = output_tensor.reshape(1, { 518, 518 });
    // min-max normalize to [0, 255] and apply a color map for visualization
    double minVal, maxVal;
    cv::minMaxLoc(output_tensor, &minVal, &maxVal);
    if (minVal != maxVal) {
        output_tensor = (output_tensor - minVal) / (maxVal - minVal);
    }
    output_tensor *= 255.0;
    output_tensor.convertTo(output_tensor, CV_8UC1);
    cv::applyColorMap(output_tensor, output_tensor, cv::COLORMAP_JET);
    return output_tensor;
}
int main(int argc, char* argv[])
{
    // Single-image test path:
    // const wchar_t* model_path = L"model/simvit.onnx";
    // depthAnything model(model_path);
    // cv::Mat image = cv::imread("inference/DSC_0410.jpg");
    // auto ori_w = image.cols;
    // auto ori_h = image.rows;
    // cv::imshow("image", image);
    // cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
    // cv::resize(image, image, {518, 518}, 0.0, 0.0, cv::INTER_CUBIC);
    // auto result = model.predict(image);
    // cv::resize(result, result, {ori_w, ori_h}, 0.0, 0.0, cv::INTER_CUBIC);
    // cv::imwrite("da.png", result);
    // // cv::imshow("result", result);
    // cv::waitKey(0);
    // cv::destroyAllWindows();

    // Webcam test path:
    const wchar_t* model_path = L"model/simvit.onnx";
    depthAnything model(model_path);
    string kWinName = "Deep learning depth estimation DepthAnything in OpenCV";
    VideoCapture capture(1);
    Mat frame, temp;
    while (true) {
        capture >> frame;
        if (frame.empty())
            break;
        int ori_w = frame.cols;
        int ori_h = frame.rows;
        // resize to the fixed 518x518 network input and convert BGR -> RGB
        resize(frame, temp, Size(518, 518), 0, 0, INTER_LINEAR);
        cv::cvtColor(temp, temp, cv::COLOR_BGR2RGB);
        Mat depthMap = model.predict(temp);
        // resize the color-mapped depth back to the original frame size
        cv::resize(depthMap, depthMap, Size(ori_w, ori_h), 0.0, 0.0, cv::INTER_CUBIC);
        Mat res = viewer({ frame, depthMap });
        imshow(kWinName, res);
        if (waitKey(10) == 'q') {
            capture.release();
            break;
        }
    }
    return 0;
}
- Summary
Model integration and framework migration:
1) As the latest SOTA, DepthAnything also estimates depth very well in real scenes; it performed well on most scenes and videos I tested, runs reasonably fast, and the model itself is not too large, so a demo deployment on Android looks feasible later on;
2) Pay attention to the ONNX export process and result verification: since the DepthAnything Python code pre- and post-processes the depth, each step has to be verified in turn so that the final outputs can be compared for consistency (a verification sketch is given after the export script above);
3) DepthAnything opens the door to other applications, such as portrait bokeh (mobile photography) and distance measurement (autonomous driving), which may be worth extending and exploring later;
4) Once again impressed by the power of open source. Transformers are the present tense and very likely the future tense; models keep getting lighter and results more impressive. Keep following the cutting edge and broaden your horizons.