Preface: I spent a day reading the paper and the code to understand the model, and an evening getting the ONNX deployment to work. It finally succeeded, so here is a record of the deployment process and some takeaways.
Task: study the latest depth estimation model DepthAnything and, following the earlier post "OnnxRuntime----Lite-Mono monocular depth estimation ONNX inference", implement DepthAnything inference and integrate it into the existing ONNX model series.
Paper: Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data
Source code: Depth-Anything GitHub
Reference code: Depth-Anything-ONNX
Analysis:
1) Understand the basic principle of DepthAnything and read through its code.
2) Convert the model to the more convenient and efficient ONNX format, then run (and verify) inference in OnnxRuntime.
- Real-world test results (the disparity outputs of the Python model (Pymodel) and the C++ model (Cmodel) have been cross-verified):
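The disparity cross-check mentioned above can be scripted. Below is a minimal sketch (the dump file names are hypothetical), assuming both the Python pipeline and the C++ pipeline write their raw 518x518 float32 disparity (before color mapping) to binary files:
import numpy as np
# Hypothetical dumps: e.g. disparity.astype(np.float32).tofile(...) on the Python side
# and fwrite() of the raw float buffer on the C++ side.
py_disp = np.fromfile("py_disparity.bin", dtype=np.float32).reshape(518, 518)
c_disp = np.fromfile("c_disparity.bin", dtype=np.float32).reshape(518, 518)
# Small differences are expected from resize interpolation and float accumulation order.
print("max abs diff :", np.abs(py_disp - c_disp).max())
print("mean abs diff:", np.abs(py_disp - c_disp).mean())
print("allclose     :", np.allclose(py_disp, c_disp, atol=1e-3))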
PyTorch to ONNX model export (fixed 518x518 size; inference test results)
import argparse
import torch
from onnx import load_model, save_model
from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference

from depth_anything.dpt import DPT_DINOv2
from depth_anything.util.transform import load_image


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        choices=["s", "b", "l"],
        required=True,
        help="Model size variant. Available options: 's', 'b', 'l'.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        required=False,
        help="Path to save the ONNX model.",
    )
    return parser.parse_args()


def export_onnx(model: str, output: str = None):
    # Handle args
    if output is None:
        output = f"weights/depth_anything_vit{model}14_ori.onnx"

    # Device for tracing (use whichever has enough free memory)
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    # Sample input for tracing (fixed at 518x518, since dynamic axes are disabled below)
    # image, _ = load_image("assets/sacre_coeur1.jpg")
    image = torch.rand(1, 3, 518, 518).to(device)
    # image = torch.from_numpy(image).to(device)
    # Load model params
    if model == "s":
        depth_anything = DPT_DINOv2(
            encoder="vits", features=64, out_channels=[48, 96, 192, 384]
        )
    elif model == "b":
        depth_anything = DPT_DINOv2(
            encoder="vitb", features=128, out_channels=[96, 192, 384, 768]
        )
    else:  # model == "l"
        depth_anything = DPT_DINOv2(
            encoder="vitl", features=256, out_channels=[256, 512, 1024, 1024]
        )

    depth_anything.to(device).load_state_dict(
        torch.hub.load_state_dict_from_url(
            f"https://hf-mirror.com/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vit{model}14.pth",
            map_location="cpu",
        ),
        strict=True,
    )
    depth_anything.eval()

    torch.onnx.export(
        depth_anything,
        image,
        output,
        input_names=["image"],
        output_names=["depth"],
        opset_version=17,
        # dynamic_axes={
        #     "image": {2: "height", 3: "width"},
        #     "depth": {2: "height", 3: "width"},
        # },
    )

    save_model(
        SymbolicShapeInference.infer_shapes(load_model(output), auto_merge=True),
        output,
    )


if __name__ == "__main__":
    args = parse_args()
    export_onnx(**vars(args))

# python export.py --model s
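After the export it is worth checking that the ONNX graph reproduces the PyTorch output before moving on to C++. A minimal verification sketch, assuming the ViT-S variant was exported to weights/depth_anything_vits14_ori.onnx as above (the PyTorch model is rebuilt the same way as in export_onnx):
import numpy as np
import onnxruntime as ort
import torch
from depth_anything.dpt import DPT_DINOv2

# Fixed 518x518 dummy input, same shape as used for tracing.
x = torch.rand(1, 3, 518, 518)

# PyTorch reference (ViT-S variant, weights loaded as in export_onnx).
model = DPT_DINOv2(encoder="vits", features=64, out_channels=[48, 96, 192, 384])
model.load_state_dict(
    torch.hub.load_state_dict_from_url(
        "https://hf-mirror.com/spaces/LiheYoung/Depth-Anything/resolve/main/checkpoints/depth_anything_vits14.pth",
        map_location="cpu",
    )
)
model.eval()
with torch.no_grad():
    ref = model(x).numpy()

# ONNX Runtime output for the same input.
sess = ort.InferenceSession(
    "weights/depth_anything_vits14_ori.onnx",
    providers=["CPUExecutionProvider"],
)
out = sess.run(["depth"], {"image": x.numpy()})[0]

# The two should agree up to small numerical differences from graph optimization.
print("max abs diff:", np.abs(ref.squeeze() - out.squeeze()).max())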
- OnnxRuntime Cmodel (C++ inference):
#include <assert.h>
#include <vector>
#include <ctime>
#include <iostream>
#include <onnxruntime_cxx_api.h>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/videoio.hpp>
#include "utils.cpp"
using namespace cv;
using namespace std;
class depthAnything
{
public:
    depthAnything(const wchar_t* onnx_model_path);
    std::vector<float> predict(std::vector<float>& input_data, int batch_size = 1, int index = 0);
    cv::Mat predict(cv::Mat& input_tensor, int batch_size = 1, int index = 0);

private:
    Ort::Env env;
    Ort::Session session;
    Ort::AllocatorWithDefaultOptions allocator;
    std::vector<const char*> input_node_names = {"image"};
    std::vector<const char*> output_node_names = {"depth"};
    std::vector<int64_t> input_node_dims;
    std::vector<int64_t> output_node_dims;
};
depthAnything::depthAnything(const wchar_t* onnx_model_path) : session(nullptr), env(nullptr)
{
    // init env
    this->env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "depthAnything_mono");
    // init session options
    Ort::SessionOptions session_options;
    // session_options.SetInterOpNumThreads(1);
    // session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    // create session and load the model into memory
    this->session = Ort::Session(env, onnx_model_path, session_options);
    // query input/output node counts and tensor shapes
    size_t num_input_nodes = session.GetInputCount();
    size_t num_output_nodes = session.GetOutputCount();
    for (size_t i = 0; i < num_input_nodes; i++)
    {
        Ort::TypeInfo type_info = session.GetInputTypeInfo(i);
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        this->input_node_dims = tensor_info.GetShape();
    }
    for (size_t i = 0; i < num_output_nodes; i++)
    {
        Ort::TypeInfo type_info = session.GetOutputTypeInfo(i);
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        this->output_node_dims = tensor_info.GetShape();
    }
}
std::vector<float> depthAnything::predict(std::vector<float>& input_tensor_values, int batch_size, int index)
{
    this->input_node_dims[0] = batch_size;
    this->output_node_dims[0] = batch_size;
    std::vector<float> results;
    try
    {
        std::vector<const char*> output_node_names;
        if (index != -1)
        {
            output_node_names = { this->output_node_names[index] };
        }
        else
        {
            output_node_names = this->output_node_names;
        }
        auto input_tensor_size = input_tensor_values.size();
        auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
        Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), 4);
        auto output_tensors = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), &input_tensor, 1, output_node_names.data(), 1);
        assert(output_tensors.size() == 1 && output_tensors.front().IsTensor());
        // copy the output while the Ort::Value (and its underlying buffer) is still alive
        const float* floatarr = output_tensors[0].GetTensorMutableData<float>();
        int64_t output_tensor_size = 1;
        for (auto& it : this->output_node_dims)
        {
            output_tensor_size *= it;
        }
        results.assign(floatarr, floatarr + output_tensor_size);
    }
    catch (Ort::Exception& e)
    {
        throw e;
    }
    return results;
}
cv::Mat depthAnything::predict(cv::Mat& input_tensor, int batch_size, int index)
{
    int input_tensor_size = input_tensor.cols * input_tensor.rows * 3;
    std::size_t counter = 0;
    std::vector<float> input_data(input_tensor_size);
    std::vector<float> output_data;
    try
    {
        // repack the HWC uint8 image into CHW float, scaled to [0, 1]
        for (int k = 0; k < 3; k++)
        {
            for (int i = 0; i < input_tensor.rows; i++)
            {
                for (int j = 0; j < input_tensor.cols; j++)
                {
                    input_data[counter++] = static_cast<float>(input_tensor.at<cv::Vec3b>(i, j)[k]) / 255.0f;
                }
            }
        }
    }
    catch (cv::Exception& e)
    {
        printf("%s\n", e.what());
    }
    try
    {
        output_data = this->predict(input_data);
    }
    catch (Ort::Exception& e)
    {
        throw e;
    }
    // reshape the flat output back to the 518x518 disparity map
    cv::Mat output_tensor(output_data);
    output_tensor = output_tensor.reshape(1, { 518, 518 });
    // min-max normalize to [0, 255] and apply a color map for visualization
    double minVal, maxVal;
    cv::minMaxLoc(output_tensor, &minVal, &maxVal);
    if (minVal != maxVal) {
        output_tensor = (output_tensor - minVal) / (maxVal - minVal);
    }
    output_tensor *= 255.0;
    output_tensor.convertTo(output_tensor, CV_8UC1);
    cv::applyColorMap(output_tensor, output_tensor, cv::COLORMAP_JET);
    return output_tensor;
}
int main(int argc, char* argv[])
{
    // Single-image test path:
    // const wchar_t* model_path = L"model/simvit.onnx";
    // depthAnything model(model_path);
    // cv::Mat image = cv::imread("inference/DSC_0410.jpg");
    // auto ori_w = image.cols;
    // auto ori_h = image.rows;
    // cv::imshow("image", image);
    // cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
    // cv::resize(image, image, {518, 518}, 0.0, 0.0, cv::INTER_CUBIC);
    // auto result = model.predict(image);
    // cv::resize(result, result, {ori_w, ori_h}, 0.0, 0.0, cv::INTER_CUBIC);
    // cv::imwrite("da.png", result);
    // // cv::imshow("result", result);
    // cv::waitKey(0);
    // cv::destroyAllWindows();

    // Webcam test path:
    const wchar_t* model_path = L"model/simvit.onnx";
    depthAnything model(model_path);
    string kWinName = "Deep learning depth estimation DepthAnything in OpenCV";
    VideoCapture capture(1);
    Mat frame, temp;
    while (true) {
        capture >> frame;
        if (frame.empty())
            break;
        int ori_w = frame.cols;
        int ori_h = frame.rows;
        // resize to the fixed 518x518 network input and convert BGR -> RGB
        resize(frame, temp, Size(518, 518), 0, 0, INTER_LINEAR);
        cv::cvtColor(temp, temp, cv::COLOR_BGR2RGB);
        Mat depthMap = model.predict(temp);
        // resize the color-mapped depth back to the original frame size
        cv::resize(depthMap, depthMap, Size(ori_w, ori_h), 0.0, 0.0, cv::INTER_CUBIC);
        Mat res = viewer({ frame, depthMap });
        imshow(kWinName, res);
        if (waitKey(10) == 'q') {
            capture.release();
            break;
        }
    }
    return 0;
}
- Summary
Model integration and framework migration:
1) As the latest SOTA, DepthAnything also estimates depth very well in real scenes; it performed well on most scenes and videos I tested, runs reasonably fast, and the model itself is not too large, so a demo deployment on Android looks feasible later on;
2) Pay attention to the ONNX export process and result verification: since the DepthAnything Python code pre- and post-processes the depth, each step has to be verified in turn so that the final outputs can be compared for consistency (a verification sketch is given after the export script above);
3) DepthAnything opens the door to other applications, such as portrait bokeh (mobile photography) and distance measurement (autonomous driving), which may be worth extending and exploring later;
4) Once again impressed by the power of open source. Transformers are the present tense and very likely the future tense; models keep getting lighter and results more impressive. Keep following the cutting edge and broaden your horizons.