While deploying with onnxruntime I noticed that whether or not contiguous() is called when slicing a tensor makes a huge difference to the results. After digging into it, the discrepancy comes down to how the memory layout is handled.
When you slice a tensor in torch, the resulting tensor is often not a contiguous block of memory. It is a view of the original tensor whose slice layout is expressed through strides; I think of it as observing the data through view() without changing the underlying arrangement of the data.
After slicing, transposing, or similar operations, if you skip contiguous() and pass the non-contiguous tensor to a function that expects contiguous memory (for example certain ONNX inference operations; ONNX Runtime and other inference engines generally assume their input buffers are contiguous), the exported model can hit a memory-layout mismatch, producing incorrect results at inference time or degraded efficiency.
Calling .contiguous() guarantees that the tensor is stored in one contiguous block with the standard memory layout (row-major order in PyTorch).
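Here is a minimal libtorch sketch of the behaviour described above (illustrative only, not part of the deployment code; the shapes are made up):

#include <torch/torch.h>
#include <iostream>

int main() {
    // A 4x6 row-major tensor; slicing its columns yields a strided view.
    torch::Tensor t = torch::arange(24).reshape({4, 6});
    torch::Tensor s = t.slice(1, 0, 3);               // first 3 columns of every row
    std::cout << s.is_contiguous() << std::endl;      // 0: a view with strides {6, 1}
    // Copying numel() elements starting at s.data_ptr() here would walk the
    // underlying buffer of t, not the logical layout of s.
    torch::Tensor c = s.contiguous();                 // materializes a compact row-major copy
    std::cout << c.is_contiguous() << std::endl;      // 1: safe to hand to ONNX Runtime
    return 0;
}

This is exactly why every sliced patch below is passed through contiguous() before its raw buffer is copied into the ONNX Runtime input tensor.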
While I'm at it, here is a record of the sliding-window prediction implemented with onnxruntime + C++.
// Required headers; the helpers GetInputInfo, GetOutputInfo, vectorProduct and
// Compute_steps_for_sliding_window are assumed to be defined elsewhere in the project.
#include <onnxruntime_cxx_api.h>
#include <torch/torch.h>
#include "itkSize.h"
#include <chrono>
#include <iostream>
#include <numeric>
#include <vector>

torch::Tensor OnnxInference(const ORTCHAR_T* onnx_path, const itk::Size<3>& image_size, const torch::Tensor& image_tensor, float step_size = 0.9) {
// Construct the ORT session
Ort::Env env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, "test");
Ort::SessionOptions session_options;
OrtCUDAProviderOptions cuda_options{};
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
session_options.AppendExecutionProvider_CUDA(cuda_options);
std::cout << "CUDA Provider has been appended." << std::endl;
Ort::Session session(env, onnx_path, session_options);
Ort::AllocatorWithDefaultOptions allocator;
// Query input/output metadata
auto [numInputNodes, inputNameStr, inputType, inputDims] = GetInputInfo(session, allocator);
auto [numOutputNodes, outputNameStr, outputType, outputDims] = GetOutputInfo(session, allocator);
// Input/output buffers
size_t inputTensorSize = vectorProduct(inputDims);
std::cout << "Size of inputTensorSize: " << inputTensorSize << std::endl;
std::vector<const char*> inputNames{ "input" };
size_t outputTensorSize = vectorProduct(outputDims);
std::vector<float> outputTensorValues(outputTensorSize);
std::vector<const char*> outputNames{ "output" };
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
//Warm up
auto startt = std::chrono::high_resolution_clock::now();
std::cout << "Starting warm-up..." << std::endl;
std::vector<float> dummyInput(inputTensorSize, 0.0f);
std::vector<Ort::Value> warmupInputTensors;
warmupInputTensors.push_back(Ort::Value::CreateTensor<float>(
memoryInfo, dummyInput.data(), inputTensorSize,
inputDims.data(), inputDims.size()));
std::vector<float> dummyOutput(outputTensorSize, 0.0f);
std::vector<Ort::Value> warmupOutputTensors;
warmupOutputTensors.push_back(Ort::Value::CreateTensor<float>(
memoryInfo, dummyOutput.data(), outputTensorSize,
outputDims.data(), outputDims.size()));
for (int i = 0; i < 8; ++i) {
session.Run(Ort::RunOptions{ nullptr }, inputNames.data(),
warmupInputTensors.data(), warmupInputTensors.size(),
outputNames.data(), warmupOutputTensors.data(), warmupOutputTensors.size());
}
auto endt = std::chrono::high_resolution_clock::now();
std::cout << "Warm-up time: " << duration_cast<milliseconds>(endt - startt).count() / 1000.0 << std::endl;
std::cout << "Warm-up completed." << std::endl;
// Compute sliding-window step positions
std::vector<int64_t> patch_size(inputDims.begin() + 2, inputDims.end()); // e.g. 128, 192, 192
std::vector<std::vector<int64_t>> steps = Compute_steps_for_sliding_window(patch_size, image_size, step_size);
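// steps[d] holds the start coordinates of the patches along spatial dimension d;
// the triple loop below visits every combination (x, y, z) of those starts.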
// Print steps (debug)
//std::cout << "Computing steps" << std::endl;
//for (int dim = 0; dim < 3; ++dim) {
// std::cout << "Dimension " << dim << " steps: ";
// for (int i = 0; i < steps[dim].size(); ++i) {
// std::cout << steps[dim][i] << " ";
// }
// std::cout << std::endl;
//}
int num_tiles = steps[0].size() * steps[1].size() * steps[2].size();
std::cout << "Num_tiles :" << num_tiles << std::endl;
std::cout << std::endl;
torch::Tensor aggregated_results = torch::empty({ 1, outputDims[1], static_cast<long long>(image_size[0]), static_cast<long long>(image_size[1]),
static_cast<long long>(image_size[2])}, torch::kFloat32);
std::vector<long long> durations;
for (int x : steps[0]) {
int lb_x = x;
int ub_x = x + patch_size[0];
for (int y : steps[1]) {
int lb_y = y;
int ub_y = y + patch_size[1];
for (int z : steps[2]) {
int lb_z = z;
int ub_z = z + patch_size[2];
auto start = std::chrono::high_resolution_clock::now();
torch::Tensor sliced_tensor = image_tensor.slice(2, lb_x, ub_x)
.slice(3, lb_y, ub_y)
.slice(4, lb_z, ub_z)
.contiguous()
.to(torch::kFloat32);
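// Because of contiguous() above, the raw buffer of sliced_tensor matches its logical
// layout, so the flat copy below picks up exactly the voxels of this patch rather than
// striding through the full image_tensor volume.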
std::vector<float> inputTensorValues(sliced_tensor.data_ptr<float>(),
sliced_tensor.data_ptr<float>() + sliced_tensor.numel());
std::vector<Ort::Value> inputTensors;
inputTensors.push_back(Ort::Value::CreateTensor<float>(
memoryInfo, inputTensorValues.data(), inputTensorSize,
inputDims.data(), inputDims.size()));
std::vector<Ort::Value> outputTensors;
outputTensors.push_back(Ort::Value::CreateTensor<float>(
memoryInfo, outputTensorValues.data(), outputTensorSize,
outputDims.data(), outputDims.size()));
session.Run(Ort::RunOptions{ nullptr }, inputNames.data(),
inputTensors.data(), inputTensors.size(),
outputNames.data(), outputTensors.data(), outputTensors.size());
float* predicted_patch = outputTensors[0].GetTensorMutableData<float>(); //1,2,128,192,192
torch::Tensor pred = torch::from_blob(predicted_patch, outputDims, torch::kFloat32).contiguous().clone();
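// Overlapping regions are simply overwritten by the most recent patch here;
// the float16 version below blends overlaps with a weighting map instead.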
aggregated_results
.slice(2, lb_x, ub_x)
.slice(3, lb_y, ub_y)
.slice(4, lb_z, ub_z) = pred;
auto end = std::chrono::high_resolution_clock::now();
std::cout << "one tile: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() / 1000.0 << std::endl;
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
durations.push_back(duration);
}
}
}
// Calculate average duration
long long sum = std::accumulate(durations.begin(), durations.end(), 0LL);
double average_duration = sum / static_cast<double>(num_tiles);
std::cout << "All prediction time over " << num_tiles << " runs: " << sum / 1000.0 << " seconds." << std::endl;
std::cout << "Average prediction time over " << num_tiles << " runs: " << average_duration / 1000.0 << " seconds." << std::endl;
auto start_arg = std::chrono::high_resolution_clock::now();
auto argmax_tensor = torch::argmax(aggregated_results.to(torch::kCUDA), 1, true).squeeze();
std::cout << argmax_tensor.sizes() << std::endl;
auto end_arg = std::chrono::high_resolution_clock::now();
std::cout << "Argmax time: " << duration_cast<std::chrono::milliseconds>(end_arg - start_arg).count() / 1000.0 << std::endl;
std::cout << "Prediction done" << std::endl;
return argmax_tensor.to(torch::kCPU).to(torch::kFloat32);
}
The float16 version (overlapping patches are blended with the weighting map gmap, and the result is resampled back to the original image size):
torch::Tensor OnnxInference_f16(const ORTCHAR_T* onnx_path, const itk::Size<3>& image_size, const at::IntArrayRef& ori_dims, const torch::Tensor& image_tensor, const torch::Tensor& gmap, float step_size) {
auto start1 = std::chrono::high_resolution_clock::now();
Ort::Env env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, "test");
Ort::SessionOptions session_options;
OrtCUDAProviderOptions cuda_options{};
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
session_options.AppendExecutionProvider_CUDA(cuda_options);
std::cout << "CUDA Provider has been appended." << std::endl;
Ort::Session session(env, onnx_path, session_options);
Ort::AllocatorWithDefaultOptions allocator;
//auto endt1 = std::chrono::high_resolution_clock::now();
//auto start1 = std::chrono::high_resolution_clock::now();
auto [numInputNodes, inputNameStr, inputType, inputDims] = GetInputInfo(session, allocator);
auto [numOutputNodes, outputNameStr, outputType, outputDims] = GetOutputInfo(session, allocator);
size_t inputTensorSize = vectorProduct(inputDims);
//std::cout << "Size of inputTensorSize: " << inputTensorSize << std::endl;
std::vector<const char*> inputNames{ "input" };
size_t outputTensorSize = vectorProduct(outputDims);
std::vector<Ort::Float16_t> outputTensorValues(outputTensorSize);
std::vector<const char*> outputNames{ "output" };
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
std::vector<int64_t> patch_size(inputDims.begin() + 2, inputDims.end());
std::vector<std::vector<int64_t>> steps = Compute_steps_for_sliding_window(patch_size, image_size, step_size);
int num_tiles = steps[0].size() * steps[1].size() * steps[2].size();
std::cout << "Num_tiles :" << num_tiles << std::endl;
std::cout << std::endl;
torch::Tensor aggregated_results = torch::empty({ 1, outputDims[1], static_cast<long long>(image_size[0]), static_cast<long long>(image_size[1]),
static_cast<long long>(image_size[2]) }, torch::kFloat16).to(torch::kCUDA);
auto endt1 = std::chrono::high_resolution_clock::now();
std::cout << "load model time: " << duration_cast<milliseconds>(endt1 - start1).count() / 1000.0 << std::endl;
std::cout << "model load completed." << std::endl;
std::cout << std::endl;
torch::Tensor aggregated_nb_of_predictions = torch::empty({ 1, outputDims[1], static_cast<long long>(image_size[0]), static_cast<long long>(image_size[1]),
static_cast<long long>(image_size[2]) }, torch::kFloat16).to(torch::kCUDA);
std::vector<long long> durations;
for (int x : steps[0]) {
int lb_x = x;
int ub_x = x + patch_size[0];
for (int y : steps[1]) {
int lb_y = y;
int ub_y = y + patch_size[1];
for (int z : steps[2]) {
int lb_z = z;
int ub_z = z + patch_size[2];
auto start_p = std::chrono::high_resolution_clock::now();
torch::Tensor sliced_tensor = image_tensor.slice(2, lb_x, ub_x)
.slice(3, lb_y, ub_y)
.slice(4, lb_z, ub_z)
.contiguous()
.to(torch::kFloat16);
std::vector<Ort::Float16_t> inputTensorValues(sliced_tensor.data_ptr<at::Half>(),
sliced_tensor.data_ptr<at::Half>() + sliced_tensor.numel());
std::vector<Ort::Value> inputTensors;
inputTensors.push_back(Ort::Value::CreateTensor<Ort::Float16_t>(
memoryInfo, inputTensorValues.data(), inputTensorSize,
inputDims.data(), inputDims.size()));
std::vector<Ort::Value> outputTensors;
outputTensors.push_back(Ort::Value::CreateTensor<Ort::Float16_t>(
memoryInfo, outputTensorValues.data(), outputTensorSize,
outputDims.data(), outputDims.size()));
session.Run(Ort::RunOptions{ nullptr }, inputNames.data(),
inputTensors.data(), inputTensors.size(),
outputNames.data(), outputTensors.data(), outputTensors.size());
auto predicted_patch = outputTensors[0].GetTensorMutableData<Ort::Float16_t>();
torch::Tensor pred = torch::from_blob(predicted_patch, outputDims, torch::kFloat16).contiguous().to(torch::kCUDA);
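// Blend overlapping patches by accumulating each prediction weighted by gmap
// (a per-voxel importance map supplied by the caller, typically Gaussian-shaped).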
aggregated_results
.slice(2, lb_x, ub_x)
.slice(3, lb_y, ub_y)
.slice(4, lb_z, ub_z)
.add_(pred * gmap);
//aggregated_nb_of_predictions
// .slice(2, lb_x, ub_x)
// .slice(3, lb_y, ub_y)
// .slice(4, lb_z, ub_z) += gmap;
auto end_p = std::chrono::high_resolution_clock::now();
std::cout << "one tile: " << std::chrono::duration_cast<std::chrono::milliseconds>(end_p - start_p).count() / 1000.0 << std::endl;
auto duration_p = std::chrono::duration_cast<std::chrono::milliseconds>(end_p - start_p).count();
durations.push_back(duration_p);
}
}
}
//aggregated_results /= aggregated_nb_of_predictions;
long long sum = std::accumulate(durations.begin(), durations.end(), 0LL);
std::cout << "All prediction time over " << num_tiles << " runs: " << sum / 1000.0 << " seconds." << std::endl;
namespace F = torch::nn::functional;
std::vector<int64_t> size_vec = { static_cast<int64_t>(ori_dims[0]),
static_cast<int64_t>(ori_dims[1]),
static_cast<int64_t>(ori_dims[2]) };
auto start_ = std::chrono::high_resolution_clock::now();
torch::Tensor result = torch::softmax(aggregated_results.to(torch::kCUDA), 1);
torch::Tensor resampled_tensor = F::interpolate(
result,
F::InterpolateFuncOptions().size(size_vec).mode(torch::kTrilinear).align_corners(false)
);
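// Resample back to the original image geometry before the final argmax.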
torch::cuda::synchronize();
auto end_ = std::chrono::high_resolution_clock::now();
std::cout << "r: " << std::chrono::duration_cast<std::chrono::milliseconds>(end_ - start_).count() / 1000.0 << std::endl;
auto s = std::chrono::high_resolution_clock::now();
auto argmax_tensor = torch::argmax(resampled_tensor, 1, true).squeeze();
torch::cuda::synchronize();
auto e = std::chrono::high_resolution_clock::now();
std::cout << "p1: " << std::chrono::duration_cast<std::chrono::milliseconds>(e - s).count() / 1000.0 << std::endl;
auto start_arg = std::chrono::high_resolution_clock::now();
torch::cuda::synchronize();
auto end_arg = std::chrono::high_resolution_clock::now();
std::cout << "p2: " << std::chrono::duration_cast<std::chrono::milliseconds>(end_arg - start_arg).count() / 1000.0 << std::endl;
std::cout << "Prediction done" << std::endl;
auto argmax_tensor_cpu = argmax_tensor.detach().cpu();
return argmax_tensor_cpu;
}