# 自定义量化优化过程,如何手动调用优化过程
from typing import Iterable,Callable
import torch
import torchvision
# QuantizationOptimizationPipeline:位于ppq/quantization/optim/base.py
from ppq import (BaseGraph,QuantizationOptimizationPass,
QuantizationOptimizationPipeline,QuantizationSetting,
TargetPlatform,TorchExecutor)
from ppq.api import ENABLE_CUDA_KERNEL
from ppq.executor.torch import TorchExecutor
from ppq.IR.quantize import QuantableOperation
from ppq.IR.search import SearchableGraph
from ppq.quantization.optim import (ParameterQuantizePass,
PassiveParameterQuantizePass,
QuantAlignmentPass,
QuantizeSimplifyPass,
RuntimeCalibrationPass)
from ppq.quantization.quantizer import TensorRTQuantizer
# Calibration/tracing geometry: one batch of 32 ImageNet-sized RGB images.
BATCHSIZE= 32
INPUT_SHAPE = [BATCHSIZE,3,224,224]
# NOTE(review): assumes a CUDA device is available — confirm before running.
DEVICE = 'cuda'
# Target deployment platform: TensorRT INT8.
PLATFORM = TargetPlatform.TRT_INT8
# Build the calibration data (random here, for demonstration only).
def load_calibration_dataset() -> Iterable:
    """Return 32 random batches shaped like the network input (INPUT_SHAPE)."""
    batches = []
    for _ in range(32):
        batches.append(torch.rand(size=INPUT_SHAPE))
    return batches
# Materialize the calibration batches once, up front.
CALIBRATION = load_calibration_dataset()
def collate_fn(batch: torch.Tensor) -> torch.Tensor:
    """Move a single calibration batch onto the target device."""
    on_device = batch.to(DEVICE)
    return on_device
# Load a pretrained MobileNetV2 as the network to be quantized.
# NOTE(review): `pretrained=True` is deprecated in newer torchvision in
# favor of the `weights=` argument — confirm against the installed version.
model = torchvision.models.mobilenet.mobilenet_v2(pretrained=True)
model = model.to(DEVICE)
# Below we demonstrate a custom graph-fusion process.
# Graph fusion changes the quantization scheme; PPQ describes the concrete
# fusion rules with Tensor Quantization Configs, implemented underneath
# with a union-find structure.
# This pass fuses Conv-Clip pairs, but — unlike the usual fusion — it
# disables the quantization point AFTER the Clip while keeping the one
# BETWEEN Conv and Clip.
# For more complex pattern matching, see
# ppq.quantization.optim.refine.SwishFusionPass.
class MyFusion(QuantizationOptimizationPass):
    def optimize(self, graph: BaseGraph, dataloader: Iterable,
                 collate_fn: Callable, executor: TorchExecutor, **kwargs) -> None:
        """Find every exclusive Conv->Clip pair and overlap the Clip's quant configs."""
        # Graph fusion typically begins with pattern matching; build the engine.
        matcher = SearchableGraph(graph=graph)
        for conv, clip in matcher.pattern_matching(
                patterns=['Conv', 'Clip'], edges=[[0, 1]], exclusive=True):
            # A Conv-Clip pair was matched; now turn off redundant quant points.
            # Both operators must be quantable and sit on the same platform.
            if not isinstance(conv, QuantableOperation):
                continue
            if not isinstance(clip, QuantableOperation):
                continue
            if conv.platform != clip.platform:
                continue
            # Redirect the Clip's input/output quantization to the Conv output.
            # Assigning dominated_by makes PPQ mark clip.input_quant_config[0]
            # and clip.output_quant_config[0] as OVERLAPPED, so they no longer
            # take effect in subsequent computation.
            master_cfg = conv.output_quant_config[0]
            clip.input_quant_config[0].dominated_by = master_cfg
            clip.output_quant_config[0].dominated_by = master_cfg
# A custom fusion pass interferes with the quantizer's own logic, so the
# quantizer must be rebuilt. We inherit from the TensorRT quantizer so each
# operator's quantization logic follows TensorRT's configuration, but we
# override pipeline construction to place the custom fusion pass at the
# right position. Once overridden, QuantizationSetting no longer applies.
class MyQuantizer(TensorRTQuantizer):
    # QuantizationOptimizationPipeline: the quantizer invokes the passes in
    # this pipeline one by one to complete network quantization.
    def build_quant_pipeline(self, setting: QuantizationSetting) -> QuantizationOptimizationPipeline:
        """Build the custom optimization pipeline used to quantize the graph."""
        return QuantizationOptimizationPipeline([
            # Disable redundant quant configs based on graph connectivity;
            # their state becomes FP32 and they point to a parent config.
            QuantizeSimplifyPass(),
            # Parameter quantization: by default, every parameter still in
            # its initial state gets quantized.
            # BUG FIX: the original read `Parameter QuantizePass()` — a
            # syntax error; the intended pass is ParameterQuantizePass.
            ParameterQuantizePass(),
            MyFusion(name='My Optimization Procedure'),
            # Collect activation statistics to create scale/offset for the
            # network's activations; their state becomes activated.
            RuntimeCalibrationPass(),
            # Align input/output quantization for concat/add/averagepool
            # etc.; their state becomes PASSIVE, pointing to a parent config.
            QuantAlignmentPass(),
            # Generic passive-parameter quantization.
            PassiveParameterQuantizePass()])
# Register the custom quantizer under the EXTENSION platform, then quantize.
from ppq.api import quantize_torch_model,register_network_quantizer
register_network_quantizer(quantizer=MyQuantizer,platform=TargetPlatform.EXTENSION)
# ENABLE_CUDA_KERNEL lets PPQ use its CUDA kernels during calibration.
with ENABLE_CUDA_KERNEL():
    # Runs the full pipeline built by MyQuantizer and exports an ONNX model.
    quantized = quantize_torch_model(
        model=model,calib_dataloader=CALIBRATION,
        calib_steps=32,input_shape=INPUT_SHAPE,
        collate_fn=collate_fn,platform=TargetPlatform.EXTENSION,
        onnx_export_file='Output/model.onnx',device=DEVICE,verbose=0)
# Demo of using fusion in PPQ
# (blog footer: first published 2023-06-28 10:50:16)