An Introduction to Using torchTRT and Torch-MagicMind
1. torchTRT
1.1 C++ Interface
Interface list
// Compile a TorchScript module for NVIDIA GPUs using TensorRT
torch::jit::Module compile(const torch::jit::Module& module, CompileSpec info);

// Check to see if a module is fully supported by the compiler
bool check_method_operator_support(const torch::jit::Module& module, std::string method_name);

// Convert a selected method to a serialized TensorRT engine
std::string convert_method_to_trt_engine(const torch::jit::Module& module, std::string method_name, CompileSpec info);

// Take a previously created TensorRT engine and embed it in a TorchScript module
torch::jit::Module embed_engine_in_new_module(const std::string& engine, Device device, const std::vector<std::string>& input_binding_names = std::vector<std::string>(), const std::vector<std::string>& output_binding_names = std::vector<std::string>());

// Create an INT8 calibrator for post-training quantization
template <typename Algorithm = nvinfer1::IInt8EntropyCalibrator2, typename DataLoader>
inline Int8Calibrator<Algorithm, DataLoader> make_int8_calibrator(
    DataLoader dataloader,
    const std::string& cache_file_path,
    bool use_cache) {
  return Int8Calibrator<Algorithm, DataLoader>(std::move(dataloader), cache_file_path, use_cache);
}

// Create an INT8 calibrator that reads scales from an existing calibration cache
template <typename Algorithm = nvinfer1::IInt8EntropyCalibrator2>
inline Int8CacheCalibrator<Algorithm> make_int8_cache_calibrator(const std::string& cache_file_path) {
  return Int8CacheCalibrator<Algorithm>(cache_file_path);
}

// Other interfaces
std::string get_build_info();      // Returns a string describing the build
void dump_build_info();            // Prints build info (e.g. TensorRT version) to stdout
void set_device(const int gpu_id); // Selects the active GPU
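Taken together, a typical flow is: check operator support, then either compile in place or export a standalone engine. Below is a minimal sketch, assuming these free functions live in the torch_tensorrt::ts namespace (as the test case in 1.3 uses) and that a scripted module exists at a placeholder path:

#include <iostream>
#include <torch/script.h>
#include "torch_tensorrt/torch_tensorrt.h"

int main() {
  // Placeholder path; any scripted/traced TorchScript module works
  auto mod = torch::jit::load("model_scripted.jit.pt");
  mod.to(torch::kCUDA);
  mod.eval();

  // See whether forward() is fully convertible before compiling
  if (!torch_tensorrt::ts::check_method_operator_support(mod, "forward")) {
    std::cout << "forward() contains unsupported ops; Torch fallback will be used\n";
  }

  // Compile with a fixed input size
  torch_tensorrt::ts::CompileSpec spec({{1, 3, 224, 224}});
  auto trt_mod = torch_tensorrt::ts::compile(mod, spec);

  // Alternatively: serialize forward() to a raw TensorRT engine ...
  std::string engine = torch_tensorrt::ts::convert_method_to_trt_engine(mod, "forward", spec);
  // ... and wrap it back into a fresh TorchScript module
  auto engine_mod = torch_tensorrt::ts::embed_engine_in_new_module(engine, spec.device);
  return 0;
}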
1.2 Data Structures
struct CompileSpec {
  CompileSpec(std::vector<std::vector<int64_t>> fixed_sizes);
  CompileSpec(std::vector<c10::ArrayRef<int64_t>> fixed_sizes);
  CompileSpec(std::vector<Input> inputs);
  CompileSpec(torch::jit::IValue input_signature);
  GraphInputs graph_inputs;                                   // input specifications
  std::set<DataType> enabled_precisions = {DataType::kFloat}; // kernel precisions TensorRT may use
  bool disable_tf32 = false;
  bool sparse_weights = false;
  bool refit = false;
  bool debug = false;
  bool truncate_long_and_double = false;
  Device device;
  EngineCapability capability = EngineCapability::kSTANDARD;
  uint64_t num_avg_timing_iters = 1;
  uint64_t workspace_size = 0;
  uint64_t dla_sram_size = 1048576;
  uint64_t dla_local_dram_size = 1073741824;
  uint64_t dla_global_dram_size = 536870912;
  nvinfer1::IInt8Calibrator* ptq_calibrator = nullptr; // calibrator for post-training quantization (PTQ)
  bool require_full_compilation = false;               // require the whole graph to compile to TensorRT
  uint64_t min_block_size = 3;                         // minimum number of contiguous supported ops per TensorRT segment
  std::vector<std::string> torch_executed_ops;         // operators forced to fall back to Torch
  std::vector<std::string> torch_executed_modules;     // modules forced to fall back to Torch
};
struct GraphInputs {
  torch::jit::IValue input_signature; // nested Input, full input spec
  std::vector<Input> inputs;          // flattened input spec
};
struct Input : torch::CustomClassHolder {
  std::vector<int64_t> min_shape;
  std::vector<int64_t> opt_shape;
  std::vector<int64_t> max_shape;
  std::vector<int64_t> shape;
  DataType dtype;
  TensorFormat format;
  std::vector<double> tensor_domain;
  Input() {}
  TORCHTRT_API Input(std::vector<int64_t> shape, TensorFormat format = TensorFormat::kContiguous);
  TORCHTRT_API Input(
      std::vector<int64_t> shape,
      std::vector<double> tensor_domain,
      TensorFormat format = TensorFormat::kContiguous);
  ......
 private:
  friend TORCHTRT_API std::ostream& operator<<(std::ostream& os, const Input& input);
  bool input_is_dynamic;
};
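min_shape / opt_shape / max_shape describe a dynamic-shape input, while shape covers the static case. As a hedged sketch (assuming a min/opt/max constructor overload among those elided by "......" above), a dynamic-batch spec could be built like this:

// Batch may vary from 1 to 32 at runtime; TensorRT tunes for batch 8
auto dyn_in = torch_tensorrt::Input(
    /*min_shape=*/std::vector<int64_t>{1, 3, 224, 224},
    /*opt_shape=*/std::vector<int64_t>{8, 3, 224, 224},
    /*max_shape=*/std::vector<int64_t>{32, 3, 224, 224},
    torch_tensorrt::DataType::kFloat,
    torch_tensorrt::TensorFormat::kContiguous);
auto spec = torch_tensorrt::ts::CompileSpec({dyn_in});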
class TensorFormat {
 public:
  enum Value : int8_t {
    kContiguous,   /// Contiguous / NCHW / Linear
    kChannelsLast, /// Channels Last / NHWC
    kUnknown,      /// Sentinel value
  };
  TensorFormat() = default;
  constexpr TensorFormat(Value t) : value(t) {}
  ......
 private:
  friend TORCHTRT_API std::ostream& operator<<(std::ostream& os, const TensorFormat& format);
  Value value;
};
class DataType {
 public:
  enum Value : int8_t {
    kLong,   /// INT64
    kFloat,  /// FP32
    kHalf,   /// FP16
    kChar,   /// INT8
    kInt,    /// INT32
    kBool,   /// Bool
    kUnknown /// Sentinel value
  };
  constexpr DataType(Value t) : value(t) {}
  ......
 private:
  friend TORCHTRT_API std::ostream& operator<<(std::ostream& os, const DataType& dtype);
  Value value;
};
struct Device {
  class DeviceType {
   public:
    enum Value : int8_t {
      kGPU, /// Target GPU to run engine
      kDLA, /// Target DLA to run engine
    };
    DeviceType() = default;
    constexpr DeviceType(Value t) : value(t) {}
    ......
   private:
    Value value;
  };
  DeviceType device_type;
  int64_t gpu_id;
  int64_t dla_core;        // DLA core id (used when device_type is kDLA)
  bool allow_gpu_fallback;
  Device() : device_type(DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
};
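Putting these data structures together with the PTQ interfaces from 1.1, here is a hedged sketch of an INT8 CompileSpec targeting GPU 0. It assumes calib_dataloader is a torch::data dataloader prepared beforehand, that mod is an already-loaded TorchScript module, and that Int8Calibrator converts to nvinfer1::IInt8Calibrator* (the type of the ptq_calibrator field); in the released library the factory sits under torch_tensorrt::ptq:

// INT8 entropy calibrator; caches computed scales in the given file
// ("/tmp/vgg16_calibration.cache" is a placeholder path)
auto calibrator = torch_tensorrt::ptq::make_int8_calibrator(
    std::move(calib_dataloader), "/tmp/vgg16_calibration.cache", /*use_cache=*/false);

torch_tensorrt::ts::CompileSpec spec({{1, 3, 32, 32}});
spec.enabled_precisions = {torch_tensorrt::DataType::kFloat, torch_tensorrt::DataType::kChar}; // FP32 + INT8
spec.ptq_calibrator = calibrator;
spec.device.device_type = torch_tensorrt::Device::DeviceType::kGPU;
spec.device.gpu_id = 0;
spec.truncate_long_and_double = true;

auto trt_mod = torch_tensorrt::ts::compile(mod, spec); // mod: loaded TorchScript module (assumption)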
1.3 Simple Examples
C++
#include "cpp_api_test.h"

TEST_P(CppAPITests, CompiledModuleIsClose) {
  std::vector<torch::jit::IValue> jit_inputs_ivalues;
  std::vector<torch::jit::IValue> trt_inputs_ivalues;
  std::vector<torch_tensorrt::Input> shapes;
  for (uint64_t i = 0; i < input_shapes.size(); i++) {
    auto in = at::randint(5, input_shapes[i], {at::kCUDA}).to(input_types[i]);
    jit_inputs_ivalues.push_back(in.clone());
    trt_inputs_ivalues.push_back(in.clone());
    auto in_spec = torch_tensorrt::Input(input_shapes[i]);
    in_spec.dtype = input_types[i];
    shapes.push_back(in_spec);
    std::cout << in_spec << std::endl;
  }
  torch::jit::IValue jit_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod, jit_inputs_ivalues);
  std::vector<at::Tensor> jit_results;
  if (jit_results_ivalues.isTuple()) {
    auto tuple = jit_results_ivalues.toTuple();
    for (auto t : tuple->elements()) {
      jit_results.push_back(t.toTensor());
    }
  } else {
    jit_results.push_back(jit_results_ivalues.toTensor());
  }
  auto spec = torch_tensorrt::ts::CompileSpec(shapes); // build the CompileSpec
  spec.truncate_long_and_double = true;
  auto trt_mod = torch_tensorrt::ts::compile(mod, spec); // compile and optimize with TensorRT
  torch::jit::IValue trt_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(trt_mod, trt_inputs_ivalues);
  std::vector<at::Tensor> trt_results;
  if (trt_results_ivalues.isTuple()) {
    auto tuple = trt_results_ivalues.toTuple();
    for (auto t : tuple->elements()) {
      trt_results.push_back(t.toTensor());
    }
  } else {
    trt_results.push_back(trt_results_ivalues.toTensor());
  }
  for (size_t i = 0; i < trt_results.size(); i++) {
    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results[i], trt_results[i].reshape_as(jit_results[i])));
  }
}
#ifndef DISABLE_TEST_IN_CI
INSTANTIATE_TEST_SUITE_P(
    CompiledModuleForwardIsCloseSuite,
    CppAPITests,
    testing::Values(
        PathAndInput({"tests/modules/resnet18_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}}),
        PathAndInput({"tests/modules/mobilenet_v2_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}}),
        PathAndInput({"tests/modules/efficientnet_b0_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}}),
        PathAndInput({"tests/modules/bert_base_uncased_traced.jit.pt", {{1, 14}, {1, 14}}, {at::kInt, at::kInt}}),
        PathAndInput({"tests/modules/vit_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}})));
#endif
Python
class TestAccuracy(unittest.TestCase):
    def test_compile_script(self):
        self.model = (
            torch.jit.load(MODULE_DIR + "/trained_vgg16.jit.pt").eval().to("cuda")
        )
        self.input = torch.randn((1, 3, 32, 32)).to("cuda")
        self.testing_dataset = torchvision.datasets.CIFAR10(
            root="./data",
            train=False,
            download=True,
            transform=transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize(
                        (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
                    ),
                ]
            ),
        )
        # Dataloader over the test set
        self.testing_dataloader = torch.utils.data.DataLoader(
            self.testing_dataset, batch_size=1, shuffle=False, num_workers=1
        )
        # Test cases can assume using GPU id: 0
        self.calibrator = TRTEntropyCalibrator(self.testing_dataloader)
        fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model)
        log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc))
        compile_spec = {
            "inputs": [torchtrt.Input([1, 3, 32, 32])],
            "enabled_precisions": {torch.float, torch.int8},
            "calibrator": self.calibrator,
            "truncate_long_and_double": True,
            "device": {
                "device_type": torchtrt.DeviceType.GPU,
                "gpu_id": 0,
                "dla_core": 0,
                "allow_gpu_fallback": False,
            },
        }
        trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
        int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod)
        log(Level.Info, "[TRT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc))
        acc_diff = fp32_test_acc - int8_test_acc
        self.assertTrue(abs(acc_diff) < 3)
2. Cambricon PyTorch-MagicMind
2.1 Simple Example
import torch
import torch_mlu
import torchvision.models as models
from torchvision import transforms
from torchvision import datasets

torch.set_grad_enabled(False)

model = models.resnet50(pretrained=False).eval().float()
input_t = torch.randn(1, 3, 224, 224).float()

val_loader = torch.utils.data.DataLoader(
    datasets.FakeData(size=20, num_classes=1000,
                      transform=transforms.Compose([transforms.ToTensor()])),
    batch_size=1, shuffle=False,
    num_workers=4, pin_memory=True)

# Post-training quantization calibrator fed from the dataloader
calibrator = torch_mlu.ptq.DataLoaderCalibrator(
    val_loader,
    algo_type=torch_mlu.ptq.CalibrationAlgo.LINEAR_CALIBRATION,
    max_calibration_samples=5)

compile_spec = {
    "inputs": [torch_mlu.Input(input_t.shape, dtype=torch.float, format=torch.contiguous_format)],
    "device": {"mlu_id": 0},
    "enabled_precisions": {torch.float, torch.int8},
    "calibrator": calibrator
}

traced_model = torch.jit.trace(model, input_t, check_trace=False)
compiled_model = torch_mlu.ts.compile(traced_model, **compile_spec)
out_mlu = compiled_model(input_t.to('mlu'))