记录BEVFormer推理所需要的输入数据。
evaluate_trt.py
evaluate_trt.py
的推理过程的代码以及注释如下所示:
import pycuda.autoinit
import tensorrt as trt
import pycuda.driver as cuda
import argparse
import torch
import mmcv
import copy
import numpy as np
from mmcv import Config
from mmdeploy.backend.tensorrt import load_tensorrt_plugin
import sys
sys.path.append(".")
from det2trt.utils.tensorrt import (
get_logger,
create_engine_context,
allocate_buffers,
do_inference,
)
from third_party.bev_mmdet3d.models.builder import build_model
from third_party.bev_mmdet3d.datasets.builder import build_dataloader, build_dataset
def parse_args(argv=None):
    """Parse command-line arguments for the TensorRT evaluation script.

    Args:
        argv: Optional list of argument strings; defaults to ``sys.argv[1:]``.
            Exposed mainly so the function can be driven programmatically.

    Returns:
        argparse.Namespace with ``config`` (test config file path) and
        ``trt_model`` (TensorRT engine checkpoint path) attributes.
    """
    parser = argparse.ArgumentParser(description="MMDet test (and eval) a model")
    # nargs="?" makes the positionals optional so the declared defaults are
    # actually honored; without it argparse silently ignores `default=` on
    # positional arguments and still requires them on the command line.
    parser.add_argument(
        "config",
        nargs="?",
        default="configs/bevformer/bevformer_tiny_trt.py",
        help="test config file path",
    )
    parser.add_argument(
        "trt_model",
        nargs="?",
        default="checkpoints/tensorrt/bevformer_tiny_epoch_24.trt",
        help="checkpoint file",
    )
    args = parser.parse_args(argv)
    return args
def main():
args = parse_args()
load_tensorrt_plugin()
trt_model = args.trt_model
config_file = args.config
TRT_LOGGER = get_logger(trt.Logger.INTERNAL_ERROR)
engine, context = create_engine_context(trt_model, TRT_LOGGER)
stream = cuda.Stream()
config = Config.fromfile(config_file)
if hasattr(config, "plugin"):
import importlib
import sys
sys.path.append(".")
if isinstance(config.plugin, list):
for plu in config.plugin:
importlib.import_module(plu)
else:
importlib.import_module(config.plugin)
output_shapes = config.output_shapes
# {'bev_embed': ['bev_h*bev_w', 'batch_size', 'dim'],
# 'outputs_classes': ['cameras', 'batch_size', 'num_query', 'num_classes'],
# 'outputs_coords': ['cameras', 'batch_size', 'num_query', 'code_size']}
input_shapes = config.input_shapes
# input_shapes:
# {'image': ['batch_size', 'cameras', 3, 'img_h', 'img_w'],
# 'prev_bev': ['bev_h*bev_w', 'batch_size', 'dim'],
# 'use_prev_bev': [1],
# 'can_bus': [18],
# 'lidar2img': ['batch_size', 'cameras', 4, 4]}
default_shapes = config.default_shapes
# default_shapes:
# {'batch_size': 1, 'img_h': 480, 'img_w': 800, 'bev_h': 50, 'bev_w': 50, 'dim': 256,
# 'num_query': 900, 'num_classes': 10, 'code_size': 10, 'cameras': 6}
for key in default_shapes:
if key in locals():
raise RuntimeError(f"Variable {key} has been defined.")
locals()[key] = default_shapes[key]
dataset = build_dataset(cfg=config.data.val)
loader = build_dataloader(
dataset, samples_per_gpu=1, workers_per_gpu=6, shuffle=False, dist=False
)
pth_model = build_model(config.model, test_cfg=config.get("test_cfg")) # 根据配置文件中的设置来构建一个模型实例,以便进行后续的测试或推理任务
ts = []
bbox_results = []
prog_bar = mmcv.ProgressBar(len(dataset)) # len(dataset)=81
prev_bev = np.random.randn(config.bev_h_ * config.bev_w_, 1, config._dim_) # 初始的前BEV特征是随机生成的,大小即为BEV特征的shape ['bev_h*bev_w', 'batch_size', 'dim']
prev_frame_info = { # 初始前BEV特征的信息(也就是说BEV特征是输出/需要这些信息的)
"scene_token": None,
"prev_pos": 0,
"prev_angle": 0,
}
for data in loader: #总共81
img = data["img"][0].data[0].numpy() # shape:(1, 6, 3, 480, 800)
img_metas = data["img_metas"][0].data[0]
use_prev_bev = np.array([1.0]) # 1.
if img_metas[0]["scene_token"] != prev_frame_info["scene_token"]: # 场景不一致,则不使用前BEV
use_prev_bev = np.array([0.0])
prev_frame_info["scene_token"] = img_metas[0]["scene_token"] # 把当前帧的场景token做记录为前一帧的场景token
tmp_pos = copy.deepcopy(img_metas[0]["can_bus"][:3]) # can_bus的前三个值:[ 600.12021379 1647.49077628 0. ](第一个data) 第三个值(z)一直是0
tmp_angle = copy.deepcopy(img_metas[0]["can_bus"][-1]) # can_bus最后一个值:331.2586436188191(第一个data)
if use_prev_bev[0] == 1:
img_metas[0]["can_bus"][:3] -= prev_frame_info["prev_pos"] # 计算当前帧与前一帧的位置变化差值,并赋值给can_bus
img_metas[0]["can_bus"][-1] -= prev_frame_info["prev_angle"] # 计算当前帧与前一帧的角度变化差值,并赋值给can_bus
else:
img_metas[0]["can_bus"][-1] = 0 # 定为初始 第一帧的前三个值和最后一个值被赋给了0
img_metas[0]["can_bus"][:3] = 0 # 定为初始
can_bus = img_metas[0]["can_bus"]
lidar2img = np.stack(img_metas[0]["lidar2img"], axis=0) # shape:(6,4,4)
batch_size, cameras, _, img_h, img_w = img.shape
output_shapes_ = {}
for key in output_shapes.keys():
shape = output_shapes[key][:]
for shape_i in range(len(shape)):
if isinstance(shape[shape_i], str):
shape[shape_i] = eval(shape[shape_i])
output_shapes_[key] = shape
# output_shapes_ : {'bev_embed': [2500, 1, 256], 'outputs_classes': [6, 1, 900, 10], 'outputs_coords': [6, 1, 900, 10]}
input_shapes_ = {}
for key in input_shapes.keys():
shape = input_shapes[key][:]
for shape_i in range(len(shape)):
if isinstance(shape[shape_i], str):
shape[shape_i] = eval(shape[shape_i])
input_shapes_[key] = shape
# input_shapes_:{'image': [1, 6, 3, 480, 800], 'prev_bev': [2500, 1, 256], 'use_prev_bev': [1], 'can_bus': [18], 'lidar2img': [1, 6, 4, 4]}
inputs, outputs, bindings = allocate_buffers(
engine, context, input_shapes=input_shapes_, output_shapes=output_shapes_
) # 分配内存初始化
for inp in inputs: # 对inputs进行赋值,用于推理
if inp.name == "image":
inp.host = img.reshape(-1).astype(np.float32)
elif inp.name == "prev_bev":
inp.host = prev_bev.reshape(-1).astype(np.float32)
elif inp.name == "use_prev_bev":
inp.host = use_prev_bev.reshape(-1).astype(np.float32)
elif inp.name == "can_bus":
inp.host = can_bus.reshape(-1).astype(np.float32)
elif inp.name == "lidar2img":
inp.host = lidar2img.reshape(-1).astype(np.float32)
else:
raise RuntimeError(f"Cannot find input name {inp.name}.")
trt_outputs, t = do_inference(
context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream
) # t:时间
trt_outputs = { # reshape
out.name: out.host.reshape(*output_shapes_[out.name]) for out in trt_outputs
} # trt_outputs:{'bev_embed':array...,dtype=float32,outputs_classes': array, dtypefloat32,'outputs_coords':array...,dtype=float32)}
prev_bev = trt_outputs.pop("bev_embed") # 将当前帧计算得出的bev特征作为下一帧的前一帧的bev特征
prev_frame_info["prev_pos"] = tmp_pos # 将当前帧位置信息作为下一帧的前一帧的位置信息
prev_frame_info["prev_angle"] = tmp_angle # 将当前帧方向盘转角作为下一帧的前一帧的方向盘转角
trt_outputs = {k: torch.from_numpy(v) for k, v in trt_outputs.items()} # 将输出转为Tensor
bbox_results.extend(pth_model.post_process(**trt_outputs, img_metas=img_metas)) # [{'pts_bbox': {'boxes_3d': 'scores_3d':'labels_3d': ]
ts.append(t)
for _ in range(len(img)):
prog_bar.update()
metric = dataset.evaluate(bbox_results)
# summary
print("*" * 50 + " SUMMARY " + "*" * 50)
for key in metric.keys():
if key == "pts_bbox_NuScenes/NDS":
print(f"NDS: {round(metric[key], 3)}")
elif key == "pts_bbox_NuScenes/mAP":
print(f"mAP: {round(metric[key], 3)}")
latency = round(sum(ts[1:-1]) / len(ts[1:-1]) * 1000, 2)
print(f"Latency: {latency}ms")
print(f"FPS: {1000 / latency}")
# Script entry point: run the TensorRT evaluation when executed directly.
if __name__ == "__main__":
    main()
第一帧的前BEV特征的产生
通过随机产生BEV特征大小的shape:['bev_h*bev_w', 'batch_size', 'dim']。
prev_bev = np.random.randn(config.bev_h_ * config.bev_w_, 1, config._dim_) # 初始的前BEV特征是随机生成的,大小即为BEV特征的shape ['bev_h*bev_w', 'batch_size', 'dim']
prev_frame_info = { # 初始前BEV特征的信息(也就是说BEV特征是输出/需要这些信息的)
"scene_token": None, #场景token
"prev_pos": 0, #
"prev_angle": 0,
}
进行模型推理时需要用到的数据信息
img, prev_bev, use_prev_bev, can_bus, lidar2img
for inp in inputs: # 对inputs进行赋值,用于推理
if inp.name == "image":
inp.host = img.reshape(-1).astype(np.float32)
elif inp.name == "prev_bev":
inp.host = prev_bev.reshape(-1).astype(np.float32)
elif inp.name == "use_prev_bev":
inp.host = use_prev_bev.reshape(-1).astype(np.float32)
elif inp.name == "can_bus":
inp.host = can_bus.reshape(-1).astype(np.float32)
elif inp.name == "lidar2img":
inp.host = lidar2img.reshape(-1).astype(np.float32)
else:
raise RuntimeError(f"Cannot find input name {inp.name}.")