An Introduction to Using torchTRT and Torch-MagicMind
1. torchTRT
1.1 C++ Interface
Interface list
// Compile a TorchScript module for NVIDIA GPUs using TensorRT
torch::jit::Module compile(const torch::jit::Module& module, CompileSpec info);

// Check to see if a module is fully supported by the compiler
bool check_method_operator_support(const torch::jit::Module& module, std::string method_name);

// Convert a selected method to a serialized TensorRT engine
std::string convert_method_to_trt_engine(const torch::jit::Module& module, std::string method_name, CompileSpec info);

// Take a previously created TensorRT engine and embed it in a TorchScript module
torch::jit::Module embed_engine_in_new_module(const std::string& engine, Device device, const std::vector<std::string>& input_binding_names = std::vector<std::string>(), const std::vector<std::string>& output_binding_names = std::vector<std::string>());

// Create an INT8 calibrator for post-training quantization
template <typename Algorithm = nvinfer1::IInt8EntropyCalibrator2, typename DataLoader>
inline Int8Calibrator<Algorithm, DataLoader> make_int8_calibrator(
    DataLoader dataloader,
    const std::string& cache_file_path,
    bool use_cache) {
  return Int8Calibrator<Algorithm, DataLoader>(std::move(dataloader), cache_file_path, use_cache);
}

// Create an INT8 calibrator that reads scales from an existing calibration cache
template <typename Algorithm = nvinfer1::IInt8EntropyCalibrator2>
inline Int8CacheCalibrator<Algorithm> make_int8_cache_calibrator(const std::string& cache_file_path) {
  return Int8CacheCalibrator<Algorithm>(cache_file_path);
}

// Other interfaces
std::string get_build_info();      // Returns a string describing the build
void dump_build_info();            // Prints build info (e.g. TensorRT version) to stdout
void set_device(const int gpu_id); // Selects the active GPU
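Taken together, a typical flow is: check operator support, then either compile in place or export a standalone engine. Below is a minimal sketch, assuming these free functions live in the torch_tensorrt::ts namespace (as the test case in 1.3 uses) and that a scripted module exists at a placeholder path:

#include <iostream>
#include <torch/script.h>
#include "torch_tensorrt/torch_tensorrt.h"

int main() {
  // Placeholder path; any scripted/traced TorchScript module works
  auto mod = torch::jit::load("model_scripted.jit.pt");
  mod.to(torch::kCUDA);
  mod.eval();

  // See whether forward() is fully convertible before compiling
  if (!torch_tensorrt::ts::check_method_operator_support(mod, "forward")) {
    std::cout << "forward() contains unsupported ops; Torch fallback will be used\n";
  }

  // Compile with a fixed input size
  torch_tensorrt::ts::CompileSpec spec({{1, 3, 224, 224}});
  auto trt_mod = torch_tensorrt::ts::compile(mod, spec);

  // Alternatively: serialize forward() to a raw TensorRT engine ...
  std::string engine = torch_tensorrt::ts::convert_method_to_trt_engine(mod, "forward", spec);
  // ... and wrap it back into a fresh TorchScript module
  auto engine_mod = torch_tensorrt::ts::embed_engine_in_new_module(engine, spec.device);
  return 0;
}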
1.2 Data Structures
struct CompileSpec {
  CompileSpec(std::vector<std::vector<int64_t>> fixed_sizes);
  CompileSpec(std::vector<c10::ArrayRef<int64_t>> fixed_sizes);
  CompileSpec(std::vector<Input> inputs);
  CompileSpec(torch::jit::IValue input_signature);
  GraphInputs graph_inputs;                                   // input specifications
  std::set<DataType> enabled_precisions = {DataType::kFloat}; // kernel precisions TensorRT may use
  bool disable_tf32 = false;
  bool sparse_weights = false;
  bool refit = false;
  bool debug = false;
  bool truncate_long_and_double = false;
  Device device;
  EngineCapability capability = EngineCapability::kSTANDARD;
  uint64_t num_avg_timing_iters = 1;
  uint64_t workspace_size = 0;
  uint64_t dla_sram_size = 1048576;
  uint64_t dla_local_dram_size = 1073741824;
  uint64_t dla_global_dram_size = 536870912;
  nvinfer1::IInt8Calibrator* ptq_calibrator = nullptr; // calibrator for post-training quantization (PTQ)
  bool require_full_compilation = false;               // require the whole graph to compile to TensorRT
  uint64_t min_block_size = 3;                         // minimum number of contiguous supported ops per TensorRT segment
  std::vector<std::string> torch_executed_ops;         // operators forced to fall back to Torch
  std::vector<std::string> torch_executed_modules;     // modules forced to fall back to Torch
};
struct GraphInputs {
  torch::jit::IValue input_signature; // nested Input, full input spec
  std::vector<Input> inputs;          // flattened input spec
};
struct Input : torch::CustomClassHolder {
  std::vector<int64_t> min_shape;
  std::vector<int64_t> opt_shape;
  std::vector<int64_t> max_shape;
  std::vector<int64_t> shape;
  DataType dtype;
  TensorFormat format;
  std::vector<double> tensor_domain;
  Input() {}
  TORCHTRT_API Input(std::vector<int64_t> shape, TensorFormat format = TensorFormat::kContiguous);
  TORCHTRT_API Input(
      std::vector<int64_t> shape,
      std::vector<double> tensor_domain,
      TensorFormat format = TensorFormat::kContiguous);
  ......
 private:
  friend TORCHTRT_API std::ostream& operator<<(std::ostream& os, const Input& input);
  bool input_is_dynamic;
};
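min_shape / opt_shape / max_shape describe a dynamic-shape input, while shape covers the static case. As a hedged sketch (assuming a min/opt/max constructor overload among those elided by "......" above), a dynamic-batch spec could be built like this:

// Batch may vary from 1 to 32 at runtime; TensorRT tunes for batch 8
auto dyn_in = torch_tensorrt::Input(
    /*min_shape=*/std::vector<int64_t>{1, 3, 224, 224},
    /*opt_shape=*/std::vector<int64_t>{8, 3, 224, 224},
    /*max_shape=*/std::vector<int64_t>{32, 3, 224, 224},
    torch_tensorrt::DataType::kFloat,
    torch_tensorrt::TensorFormat::kContiguous);
auto spec = torch_tensorrt::ts::CompileSpec({dyn_in});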
class TensorFormat {
 public:
  enum Value : int8_t {
    kContiguous,   /// Contiguous / NCHW / Linear
    kChannelsLast, /// Channels Last / NHWC
    kUnknown,      /// Sentinel value
  };
  TensorFormat() = default;
  constexpr TensorFormat(Value t) : value(t) {}
  ......
 private:
  friend TORCHTRT_API std::ostream& operator<<(std::ostream& os, const TensorFormat& format);
  Value value;
};
class DataType {
 public:
  enum Value : int8_t {
    kLong,   /// INT64
    kFloat,  /// FP32
    kHalf,   /// FP16
    kChar,   /// INT8
    kInt,    /// INT32
    kBool,   /// Bool
    kUnknown /// Sentinel value
  };
  constexpr DataType(Value t) : value(t) {}
  ......
 private:
  friend TORCHTRT_API std::ostream& operator<<(std::ostream& os, const DataType& dtype);
  Value value;
};
struct Device {
  class DeviceType {
   public:
    enum Value : int8_t {
      kGPU, /// Target GPU to run engine
      kDLA, /// Target DLA to run engine
    };
    DeviceType() = default;
    constexpr DeviceType(Value t) : value(t) {}
    ......
   private:
    Value value;
  };
  DeviceType device_type;
  int64_t gpu_id;
  int64_t dla_core;        // DLA core id (used when device_type is kDLA)
  bool allow_gpu_fallback;
  Device() : device_type(DeviceType::kGPU), gpu_id(0), dla_core(0), allow_gpu_fallback(false) {}
};
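Putting these data structures together with the PTQ interfaces from 1.1, here is a hedged sketch of an INT8 CompileSpec targeting GPU 0. It assumes calib_dataloader is a torch::data dataloader prepared beforehand, that mod is an already-loaded TorchScript module, and that Int8Calibrator converts to nvinfer1::IInt8Calibrator* (the type of the ptq_calibrator field); in the released library the factory sits under torch_tensorrt::ptq:

// INT8 entropy calibrator; caches computed scales in the given file
// ("/tmp/vgg16_calibration.cache" is a placeholder path)
auto calibrator = torch_tensorrt::ptq::make_int8_calibrator(
    std::move(calib_dataloader), "/tmp/vgg16_calibration.cache", /*use_cache=*/false);

torch_tensorrt::ts::CompileSpec spec({{1, 3, 32, 32}});
spec.enabled_precisions = {torch_tensorrt::DataType::kFloat, torch_tensorrt::DataType::kChar}; // FP32 + INT8
spec.ptq_calibrator = calibrator;
spec.device.device_type = torch_tensorrt::Device::DeviceType::kGPU;
spec.device.gpu_id = 0;
spec.truncate_long_and_double = true;

auto trt_mod = torch_tensorrt::ts::compile(mod, spec); // mod: loaded TorchScript module (assumption)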
1.3 Simple Examples
C++
#include "cpp_api_test.h"

TEST_P(CppAPITests, CompiledModuleIsClose) {
  std::vector<torch::jit::IValue> jit_inputs_ivalues;
  std::vector<torch::jit::IValue> trt_inputs_ivalues;
  std::vector<torch_tensorrt::Input> shapes;
  for (uint64_t i = 0; i < input_shapes.size(); i++) {
    auto in = at::randint(5, input_shapes[i], {at::kCUDA}).to(input_types[i]);
    jit_inputs_ivalues.push_back(in.clone());
    trt_inputs_ivalues.push_back(in.clone());
    auto in_spec = torch_tensorrt::Input(input_shapes[i]);
    in_spec.dtype = input_types[i];
    shapes.push_back(in_spec);
    std::cout << in_spec << std::endl;
  }
  torch::jit::IValue jit_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod, jit_inputs_ivalues);
  std::vector<at::Tensor> jit_results;
  if (jit_results_ivalues.isTuple()) {
    auto tuple = jit_results_ivalues.toTuple();
    for (auto t : tuple->elements()) {
      jit_results.push_back(t.toTensor());
    }
  } else {
    jit_results.push_back(jit_results_ivalues.toTensor());
  }
  auto spec = torch_tensorrt::ts::CompileSpec(shapes); // build the CompileSpec
  spec.truncate_long_and_double = true;
  auto trt_mod = torch_tensorrt::ts::compile(mod, spec); // compile and optimize with TensorRT
  torch::jit::IValue trt_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(trt_mod, trt_inputs_ivalues);
  std::vector<at::Tensor> trt_results;
  if (trt_results_ivalues.isTuple()) {
    auto tuple = trt_results_ivalues.toTuple();
    for (auto t : tuple->elements()) {
      trt_results.push_back(t.toTensor());
    }
  } else {
    trt_results.push_back(trt_results_ivalues.toTensor());
  }
  for (size_t i = 0; i < trt_results.size(); i++) {
    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results[i], trt_results[i].reshape_as(jit_results[i])));
  }
}
#ifndef DISABLE_TEST_IN_CI
INSTANTIATE_TEST_SUITE_P(
    CompiledModuleForwardIsCloseSuite,
    CppAPITests,
    testing::Values(
        PathAndInput({"tests/modules/resnet18_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}}),
        PathAndInput({"tests/modules/mobilenet_v2_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}}),
        PathAndInput({"tests/modules/efficientnet_b0_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}}),
        PathAndInput({"tests/modules/bert_base_uncased_traced.jit.pt", {{1, 14}, {1, 14}}, {at::kInt, at::kInt}}),
        PathAndInput({"tests/modules/vit_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}})));
#endif
Python
class TestAccuracy(unittest.TestCase):
    def test_compile_script(self):
        self.model = (
            torch.jit.load(MODULE_DIR + "/trained_vgg16.jit.pt").eval().to("cuda")
        )
        self.input = torch.randn((1, 3, 32, 32)).to("cuda")
        self.testing_dataset = torchvision.datasets.CIFAR10(
            root="./data",
            train=False,
            download=True,
            transform=transforms.Compose(
                [
                    transforms.ToTensor(),
                    transforms.Normalize(
                        (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
                    ),
                ]
            ),
        )
        # Dataloader over the test set
        self.testing_dataloader = torch.utils.data.DataLoader(
            self.testing_dataset, batch_size=1, shuffle=False, num_workers=1
        )
        # Test cases can assume using GPU id: 0
        self.calibrator = TRTEntropyCalibrator(self.testing_dataloader)
        fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model)
        log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc))
        compile_spec = {
            "inputs": [torchtrt.Input([1, 3, 32, 32])],
            "enabled_precisions": {torch.float, torch.int8},
            "calibrator": self.calibrator,
            "truncate_long_and_double": True,
            "device": {
                "device_type": torchtrt.DeviceType.GPU,
                "gpu_id": 0,
                "dla_core": 0,
                "allow_gpu_fallback": False,
            },
        }
        trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
        int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod)
        log(Level.Info, "[TRT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc))
        acc_diff = fp32_test_acc - int8_test_acc
        self.assertTrue(abs(acc_diff) < 3)
2. Cambricon PyTorch-MagicMind
2.1 Simple Example
import torch
import torch_mlu
import torchvision.models as models
from torchvision import transforms
from torchvision import datasets

torch.set_grad_enabled(False)

model = models.resnet50(pretrained=False).eval().float()
input_t = torch.randn(1, 3, 224, 224).float()

val_loader = torch.utils.data.DataLoader(
    datasets.FakeData(size=20, num_classes=1000,
                      transform=transforms.Compose([transforms.ToTensor()])),
    batch_size=1, shuffle=False,
    num_workers=4, pin_memory=True)

# Post-training quantization calibrator fed from the dataloader
calibrator = torch_mlu.ptq.DataLoaderCalibrator(
    val_loader,
    algo_type=torch_mlu.ptq.CalibrationAlgo.LINEAR_CALIBRATION,
    max_calibration_samples=5)

compile_spec = {
    "inputs": [torch_mlu.Input(input_t.shape, dtype=torch.float, format=torch.contiguous_format)],
    "device": {"mlu_id": 0},
    "enabled_precisions": {torch.float, torch.int8},
    "calibrator": calibrator
}

traced_model = torch.jit.trace(model, input_t, check_trace=False)
compiled_model = torch_mlu.ts.compile(traced_model, **compile_spec)
out_mlu = compiled_model(input_t.to('mlu'))