Confirm that you have the required environment installed.
My environment: torch 2.0.0, torchvision 0.15.1, TensorRT 8.5.2, CUDA 11.4, onnxruntime-gpu 1.12.1.
First, prepare your model and dataset:
# Restore the trained ResNet-18 weights onto the GPU for inference.
device = torch.device('cuda')
model = ResNet18().eval().cuda()
check_point = torch.load('models/model-epoch109.pt', map_location=device)
model.load_state_dict(check_point)
print('ok')

# CIFAR-10 test split; images are only converted to tensors here
# (no normalization), matching what the exported model expects.
test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

batch = 100  # NOTE(review): assumes 100 evenly divides the 10,000 test images
testloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch,
    shuffle=False,
    num_workers=2,
)
Next, convert your PyTorch model to an ONNX model:
def to_onnx(model):
dummy_input = torch.randn(1,3,32,32) # input sample
input_names=['input']
output_names=['output']
dynamic_axes = {'input':{0:"batch_size"},'output':{0:"batch_size"}}
torch.onnx.export(model, dummy_input, 'res18Z.onnx', export_params=True, opset_version=11, do_constant_folding=True,input_names=input_names,output_names=output_names,dynamic_axes=dynamic_axes)
Then, use trtexec to convert the ONNX model to a TensorRT engine file.
Change into the directory that contains trtexec:
cd /usr/src/tensorrt/bin
Then execute the following command:
./trtexec --onnx=your_onnx_file_path/res18Z.onnx --saveEngine=your_save_path/res18Z.trt --workspace=4096 --shapes=input:100x3x32x32
ATTENTION: --shapes must match your model's input layout exactly (e.g. BxCxHxW) and must not be omitted.
Finally, use the .trt engine file to run inference with TensorRT:
def load_engine(trt_runtime, eng_path):
    """Read a serialized TensorRT engine from disk and deserialize it."""
    with open(eng_path, 'rb') as engine_file:
        serialized_engine = engine_file.read()
    return trt_runtime.deserialize_cuda_engine(serialized_engine)
def test_trt():
    """Run the serialized TensorRT engine over the CIFAR-10 test loader
    and print top-1 accuracy.

    Assumes the engine was built with a fixed input shape of
    (batch, 3, 32, 32) — see the trtexec --shapes command above.
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt_runtime = trt.Runtime(TRT_LOGGER)
    engine = load_engine(trt_runtime, 'res18Z.trt')
    context = engine.create_execution_context()

    # Allocate device buffers once, sized for the fixed engine batch.
    input_shape = (batch, 3, 32, 32)
    output_shape = (batch, 10)
    ins = int(np.prod(input_shape) * np.dtype(np.float32).itemsize)
    outs = int(np.prod(output_shape) * np.dtype(np.float32).itemsize)
    print(ins, outs)
    d_input = cuda.mem_alloc(ins)
    d_output = cuda.mem_alloc(outs)

    acc = 0
    total = 0  # count samples actually scored instead of hard-coding 10000
    for images, labels in tqdm(testloader):
        input_data = images.numpy()
        labels = labels.numpy()
        # A partial final batch would only partially fill the fixed-size
        # device buffer, and the extra output rows would score stale
        # garbage — skip it explicitly rather than mis-score it.
        if input_data.shape[0] != batch:
            continue
        input_data = np.ascontiguousarray(input_data)
        output_data = np.empty(output_shape, dtype=np.float32)
        cuda.memcpy_htod(d_input, input_data)    # host -> device
        # Bindings follow engine order: input first, then output
        # (presumably — confirm against engine.get_binding_index).
        context.execute_v2(bindings=[int(d_input), int(d_output)])
        cuda.memcpy_dtoh(output_data, d_output)  # device -> host
        pred = output_data.argmax(axis=1)
        acc += int(np.sum(pred == labels))
        total += labels.shape[0]
    print(acc / total)
That's all. The biggest hurdle is still environment configuration. In case you wonder why I wrote this in English: I edit this blog on a Jetson AGX, which has no Chinese input method installed.