Neural Network Deployment
Fundamentals
The standards are proposed by the major players, but every company's implementation follows its own approach.
Neural Networks
Benefits of and recommendations for deployment-oriented network improvements
onnxruntime inference example
Accelerate your network with onnxruntime
import onnxruntime
import numpy as np
# -------------------------------------------------------------------
# Onnxruntime needs a feed dict and a list of output names to run inference.
# The feed dict maps each input name to its input data ({input name: data}).
# If you do not know the input and output names, open the onnx file with a visualization tool and you will see them.
# -------------------------------------------------------------------
MODEL = 'model.onnx'
FEED_DICT = {'input name': np.zeros(shape=[1, 3, 224, 224], dtype=np.float32)}  # most models expect float32 inputs
OUTPUT_NAMES = ['output name']
session = onnxruntime.InferenceSession(MODEL, providers=['CUDAExecutionProvider'])
result = session.run(OUTPUT_NAMES, FEED_DICT)
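Instead of opening the file in a viewer, onnxruntime can also report the input and output names at runtime. A minimal sketch using the session created above; the printed names and shapes are whatever the loaded model declares:

# Inspect the model's declared inputs and outputs programmatically.
for tensor in session.get_inputs():
    print('input :', tensor.name, tensor.shape, tensor.type)
for tensor in session.get_outputs():
    print('output:', tensor.name, tensor.shape, tensor.type)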
Quantization-aware training and ONNX export (pytorch_quantization)
import torch
import torch.utils.data
import torchvision
from absl import logging
# Install the pytorch_quantization package below first (part of NVIDIA's TensorRT tools)
from pytorch_quantization import nn as quant_nn
logging.set_verbosity(logging.FATAL)  # Disable logging as it is too noisy in a notebook
from pytorch_quantization import quant_modules
# Call quant_modules.initialize() so that standard layers are replaced by their quantized counterparts,
# then just train as usual ...
quant_modules.initialize()
model = torchvision.models.resnet50()
model.cuda()
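Before the fine-tuning discussed below, each TensorQuantizer has to be calibrated on a small amount of data so that it knows the dynamic range (amax) to quantize against. A minimal sketch of that step, modelled on the workflow of the NVIDIA notebook linked below; `calibrate_model`, `train_loader`, and the batch count are illustrative names, not part of the original code:

def calibrate_model(model, data_loader, num_batches=4):
    # Put every TensorQuantizer into calibration mode: collect statistics, do not quantize yet.
    for _, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()
    # Run a few batches through the network to collect activation statistics.
    with torch.no_grad():
        for i, (image, _) in enumerate(data_loader):
            model(image.cuda())
            if i + 1 >= num_batches:
                break
    # Compute amax from the collected statistics and switch quantization back on.
    for _, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.load_calib_amax()  # histogram calibrators need e.g. method='percentile', percentile=99.99
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()

# calibrate_model(model, train_loader)  # train_loader: your own DataLoader of (image, label) batches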
# Quantization Aware Training is based on the Straight Through Estimator (STE) derivative approximation.
# It is sometimes known as "quantization aware training".
# We don't use the name because it doesn't reflect the underlying assumption.
# If anything, it makes training "unaware" of quantization because of the STE approximation.
# After calibration is done, Quantization Aware Training simply selects a training schedule and continues training the calibrated model.
# Usually, it doesn't need to fine-tune very long. We usually use around 10% of the original training schedule,
# starting at 1% of the initial training learning rate,
# and a cosine annealing learning rate schedule that follows the decreasing half of a cosine period,
# down to 1% of the initial fine-tuning learning rate (0.01% of the initial training learning rate).
# Quantization Aware Training (essentially a discrete numerical optimization problem) is not a mathematically solved problem.
# Based on our experience, here are some recommendations:
# For the STE approximation to work well, it is better to use a small learning rate.
# A large learning rate is more likely to enlarge the variance introduced by the STE approximation and destroy the trained network.
# Do not change the quantization representation (scale) during training, at least not too frequently.
# Changing the scale every step is effectively like changing the data format (e8m7, e5m10, e3m4, etc.) every step,
# which will easily affect convergence.
# https://github.com/NVIDIA/TensorRT/blob/main/tools/pytorch-quantization/examples/finetune_quant_resnet50.ipynb
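A minimal sketch of the fine-tuning schedule described above (about 10% of the original epochs, starting at 1% of the original learning rate, cosine-annealed down to 1% of the fine-tuning learning rate). The epoch count, base learning rate, and the `train_one_epoch` helper are assumptions for illustration, not part of the notebook:

ORIGINAL_EPOCHS = 90   # assumed original ResNet-50 training schedule
ORIGINAL_LR = 0.1      # assumed original initial learning rate

finetune_epochs = max(1, ORIGINAL_EPOCHS // 10)   # ~10% of the original schedule
finetune_lr = 0.01 * ORIGINAL_LR                  # 1% of the original learning rate

optimizer = torch.optim.SGD(model.parameters(), lr=finetune_lr, momentum=0.9)
# Cosine annealing over the fine-tuning run, ending at 1% of the fine-tuning learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=finetune_epochs, eta_min=0.01 * finetune_lr)

for epoch in range(finetune_epochs):
    train_one_epoch(model, optimizer)  # placeholder for your usual supervised training loop
    scheduler.step()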
def export_onnx(model, onnx_filename, batch_onnx):
    model.eval()
    quant_nn.TensorQuantizer.use_fb_fake_quant = True  # We have to shift to pytorch's fake quant ops before exporting the model to ONNX
    opset_version = 13
    # Export ONNX for multiple batch sizes
    print("Creating ONNX file: " + onnx_filename)
    dummy_input = torch.randn(batch_onnx, 3, 224, 224, device='cuda')  # TODO: switch input dims by model
    # Note: enable_onnx_checker was removed from torch.onnx.export in recent PyTorch releases; drop it there.
    torch.onnx.export(model, dummy_input, onnx_filename, verbose=False, opset_version=opset_version,
                      enable_onnx_checker=False, do_constant_folding=True)
    return True
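A hypothetical call to the helper above, assuming the fine-tuned model and a batch size of 1 (the file name is arbitrary):

export_onnx(model, 'quant_resnet50.onnx', batch_onnx=1)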
onnxruntime performance comparison
# ---------------------------------------------------------------
# This script shows how to run inference with Onnxruntime on a model exported by PPQ.
# Onnxruntime provides a series of execution providers for running neural networks on different hardware:
# CPUExecutionProvider and CUDAExecutionProvider are provided by Onnxruntime itself,
# TensorrtExecutionProvider is provided by Nvidia.
# Different providers have different requirements on the model format; PPQ exports models in the
# CPUExecutionProvider format.
# Onnxruntime has no CUDA implementation of its INT8 operators, so if you deploy your model with
# Onnxruntime on the CUDAExecutionProvider, do not expect any speedup from quantization.
# ---------------------------------------------------------------
import torchvision
import torch
import ppq
import ppq.api as API
# Random calibration data; replace it with real samples from your dataset.
calibration_dataloader = [torch.rand(size=[1, 3, 224, 224]).cuda() for _ in range(32)]
model = torchvision.models.shufflenet_v2_x1_0().cuda()
with API.ENABLE_CUDA_KERNEL():
    quantized = API.quantize_torch_model(
        model=model, calib_dataloader=calibration_dataloader,
        calib_steps=8, input_shape=[1, 3, 224, 224], platform=ppq.TargetPlatform.ONNXRUNTIME)
    API.export_ppq_graph(
        quantized, platform=ppq.TargetPlatform.ONNXRUNTIME,
        graph_save_to='Quantized.onnx')
    API.export_ppq_graph(
        quantized, platform=ppq.TargetPlatform.ONNX,
        graph_save_to='FP32.onnx')
from ppq.utils.OnnxruntimeUtil import Benchmark, Profile
Benchmark('FP32.onnx', providers=['CPUExecutionProvider'])
Benchmark('Quantized.onnx', providers=['CPUExecutionProvider'])
Profile('FP32.onnx', providers=['CPUExecutionProvider'])
Profile('Quantized.onnx', providers=['CPUExecutionProvider'])
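To sanity-check the exported graphs, you can run both through onnxruntime on the CPUExecutionProvider and compare the outputs. A minimal sketch, assuming a single-input model; the input name is read from the graph rather than hard-coded:

import numpy as np
import onnxruntime

sample = np.random.rand(1, 3, 224, 224).astype(np.float32)
outputs = {}
for path in ['FP32.onnx', 'Quantized.onnx']:
    sess = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'])
    input_name = sess.get_inputs()[0].name
    outputs[path] = sess.run(None, {input_name: sample})[0]

# A rough accuracy check: the quantized output should stay close to the FP32 output.
print('max abs diff:', np.abs(outputs['FP32.onnx'] - outputs['Quantized.onnx']).max())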