[笔记]TVM部署AirFace

最新推荐文章于 2024-07-12 17:12:57 发布

BokyLiu

最新推荐文章于 2024-07-12 17:12:57 发布

阅读量1k

点赞数 3

分类专栏：模型优化与部署

本文链接：https://blog.csdn.net/tfu259/article/details/103508400

版权

模型优化与部署专栏收录该内容

14 篇文章 1 订阅

订阅专栏

使用TVM在Tx2 Arm上部署AirFace c++

TVM的一个设计亮点在于它可以在PC端通过RPC优化网络，这大大加快了优化速度。
虽说PC端加速优化过程，但是在实际使用中发现优化速度还是很慢的，也是一个炼丹过程。而且极端依赖CPU性能，在TVM给出的例子都是用32线程服务器进行的优化。顺便说一句，TVM在自动优化的时候最大使用的线程数等于CPU的线程数。

根据FrozenGene说的，arm目前还不能用graph tune。
话不多说，上代码：

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Auto-tuning a convolutional network for ARM CPU
===============================================
**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, `Zhao Wu <https://github.com/FrozenGene>`_, `Eddie Yan <https://github.com/eqy>`_

Auto-tuning for a specific ARM device is critical for getting the best
performance. This is a tutorial about how to tune a whole convolutional
network.

The operator implementation for ARM CPU in TVM is written in template form.
The template has many tunable knobs (tile factor, vectorization, unrolling, etc).
We will tune all convolution and depthwise convolution operators
in the neural network. After tuning, we produce a log file which stores
the best knob values for all required operators. When the TVM compiler compiles
these operators, it will query this log file to get the best knob values.

We also released pre-tuned parameters for some arm devices. You can go to
`ARM CPU Benchmark <https://github.com/apache/incubator-tvm/wiki/Benchmark#arm-cpu>`_
to see the results.
"""

######################################################################

import os
import onnx
import numpy as np
import tvm
from tvm import autotvm
from tvm import relay
import tvm.relay.testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime
from tvm.contrib import util



# Base name of the exported ONNX model.
model_name = "face_load_weight"
# Full path of the ONNX file to load (a file path, despite the `_dir` suffix).
model_dir = '/home/bokyliu/dukto/fxp/AirFace/2d_facerecognition/20191119-1/test/{}.onnx'.format(model_name)
# Name of the model's input tensor.
input_name = "0"
#################################################################
# Define network
# --------------
# First we need to define the network in relay frontend API.
# We can load some pre-defined network from :code:`relay.testing`.
# We can also load models from MXNet, ONNX and TensorFlow.

def get_network(name, batch_size):
    """Build the Relay module and parameters for the requested network.

    Parameters
    ----------
    name : str
        Network selector: a name containing "resnet" or "vgg" followed by
        ``-<num_layers>``, or one of 'mobilenet', 'squeezenet_v1.1',
        'inception_v3', 'mxnet' or 'onnx'. 'onnx' loads the file at the
        module-level ``model_dir``.
    batch_size : int
        Leading dimension used for the input and output shapes.

    Returns
    -------
    (mod, params, input_shape, output_shape)
        The Relay module, its parameter dict and the shapes of the single
        input and output tensors.

    Raises
    ------
    ValueError
        If ``name`` does not match any supported network.
    """
    input_shape = (batch_size, 3, 224, 224)
    output_shape = (batch_size, 1000)

    if "resnet" in name:
        n_layer = int(name.split('-')[1])
        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
    elif "vgg" in name:
        n_layer = int(name.split('-')[1])
        mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
    elif name == 'mobilenet':
        mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == 'squeezenet_v1.1':
        mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
    elif name == 'inception_v3':
        # Keep the requested batch size; only the spatial size differs.
        input_shape = (batch_size, 3, 299, 299)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == 'mxnet':
        # an example for mxnet model
        from mxnet.gluon.model_zoo.vision import get_model
        block = get_model('resnet18_v1', pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype)
        net = mod["main"]
        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
        mod = relay.Module.from_expr(net)
    elif name == 'onnx':
        # The face model takes 112x112 RGB crops and emits a 512-d embedding.
        input_shape = (batch_size, 3, 112, 112)
        output_shape = (batch_size, 512)

        onnx_model = onnx.load(model_dir)
        # Declare the same shape to the frontend that is returned to the caller,
        # so the two cannot drift apart.
        shape_dict = {input_name: input_shape}
        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, dtype=dtype)
    else:
        raise ValueError("Unsupported network: " + name)

    return mod, params, input_shape, output_shape


#################################################################


###########################################
# Set Tuning Options
# ------------------
# Before tuning, we should apply some configurations. Here a Jetson TX2 board is used
# as the example. In your setting, you should modify the target and device_key accordingly.
# set :code:`use_android` to True if you use android phone.

#### DEVICE CONFIG ####

# Replace "aarch64-linux-gnu" with the correct target of your board.
# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
# Cross-compilation target for the board.
target = tvm.target.create('llvm -device=arm_cpu -target=aarch64-linux-gnu')

# Key under which the board is registered in the RPC tracker.
device_key = 'tx2'

# Switch to True when the device is an android phone.
use_android = False

#### TUNING OPTION ####
network = 'onnx'
log_file = "%s.%s.log" % (device_key, network)
dtype = 'float32'

# Builder and runner are constructed separately and combined below.
_measure_builder = autotvm.LocalBuilder(
    build_func='ndk' if use_android else 'default')
_measure_runner = autotvm.RPCRunner(
    device_key, host='0.0.0.0', port=9190,
    number=5,
    timeout=10,
)

# Keyword arguments forwarded verbatim to tune_tasks().
tuning_option = dict(
    log_filename=log_file,
    tuner='xgb',
    n_trial=1500,
    early_stopping=800,
    try_spatial_pack_depthwise=True,
    measure_option=autotvm.measure_option(
        builder=_measure_builder,
        runner=_measure_runner,
    ),
)

# Thread count exported through the TVM_NUM_THREADS environment variable.
num_threads = 4
os.environ["TVM_NUM_THREADS"] = str(num_threads)

####################################################################
#
# .. note:: How to set tuning options
#
#   In general, the default values provided here work well.
#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
#   which makes the tuning run longer.
#   If your device runs very slowly or your conv2d operators have many GFLOPs, consider
#   setting timeout larger.
#
#   If your model has depthwise convolution, you could consider setting
#   :code:`try_spatial_pack_depthwise` be :code:`True`, which perform better than default
#   optimization in general. For example, on ARM CPU A53 2.0GHz, we find it could boost 1.6x
#   performance of depthwise convolution on Mobilenet V1 model.

###################################################################
# Begin Tuning
# ------------
# Now we can extract tuning tasks from the network and begin tuning.
# Here, we provide a simple utility function to tune a list of tasks.
# This function is just an initial implementation which tunes them in sequential order.
# We will introduce a more sophisticated tuning scheduler in the future.

# You can skip the implementation of this function for this tutorial.
def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True,
               try_winograd=True,
               try_spatial_pack_depthwise=True):
    """Tune the given tasks sequentially and write the best records to a log.

    Parameters
    ----------
    tasks : list
        AutoTVM tasks. The list is modified in place when a task is replaced
        by its winograd or spatial-pack variant.
    measure_option : dict
        Measurement options passed to every tuner.
    tuner : str
        One of 'xgb', 'xgb-rank', 'xgb_knob', 'ga', 'random', 'gridsearch'.
        Overridden with 'xgb_knob' when ``try_spatial_pack_depthwise`` is set.
    n_trial : int
        Upper bound on trials per task; each task is additionally capped by
        the size of its own config space.
    early_stopping : int or None
        Passed through to the tuner.
    log_filename : str
        Destination of the best records. ``log_filename + ".tmp"`` is used as
        scratch space and removed at the end.
    use_transfer_learning : bool
        Seed each tuner with the records gathered so far.
    try_winograd : bool
        Try the 'winograd' template and keep it when the input channel count
        is at least 64.
    try_spatial_pack_depthwise : bool
        Use the 'contrib_spatial_pack' template for depthwise convolutions.

    Raises
    ------
    ValueError
        If ``tuner`` is not a supported name.
    """
    if try_winograd:
        for i, task in enumerate(tasks):
            try:  # try winograd template
                tsk = autotvm.task.create(task.name, task.args,
                                          task.target, task.target_host, 'winograd')
                input_channel = tsk.workload[1][1]
                if input_channel >= 64:
                    tasks[i] = tsk
            except Exception:
                # Best effort: if the winograd task cannot be created, keep the
                # task that is already in the list.
                pass

    # if we want to use spatial pack for depthwise convolution
    if try_spatial_pack_depthwise:
        tuner = 'xgb_knob'
        for i, task in enumerate(tasks):
            if task.name == 'topi_nn_depthwise_conv2d_nchw':
                tasks[i] = autotvm.task.create(task.name, task.args,
                                               task.target, task.target_host,
                                               'contrib_spatial_pack')

    # create tmp log file
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'xgb_knob':
            tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # do tuning
        # Use a per-task budget: rebinding n_trial here would let one small
        # config space cap the trial count of every task tuned after it.
        tsk_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(n_trial=tsk_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)


########################################################################
# Finally, we launch tuning jobs and evaluate the end-to-end performance.

def tune_and_evaluate(tuning_opt):
    """Tune the configured network, build it, and check and time it on the device.

    Steps: extract the conv2d tuning tasks, tune them, compile with the best
    records, save the library / graph JSON / parameters to the hard-coded
    output paths below, upload the library through the RPC tracker, compare
    the device output with a host-CPU reference build, and print the mean
    inference time measured on the device.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded to :func:`tune_tasks`.
    """
    # extract workloads from relay program
    print("Extract tasks...")
    mod, params, input_shape, _ = get_network(network, batch_size=1)
    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
                                              params=params,
                                              ops=(relay.op.nn.conv2d,))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # Artifacts are written to fixed, persistent paths so the tuning result
    # survives the end of the process.
    lib_dir = '/home/bokyliu/Project/TVM/%s_tune_lib-fp32.tar' % model_name
    graph_dir = '/home/bokyliu/Project/TVM/%s_tune_graph-fp32.json' % model_name
    params_dir = '/home/bokyliu/Project/TVM/%s_tune_params-fp32' % model_name

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=1):
            # Keep the original `params` intact; they are needed again for the
            # host reference build below.
            graph, lib, built_params = relay.build_module.build(
                mod, target=target, params=params)

    # export library
    if use_android:
        from tvm.contrib import ndk
        # Export to the path that is uploaded below (the NDK produces a .so).
        lib_dir = os.path.splitext(lib_dir)[0] + '.so'
        lib.export_library(lib_dir, ndk.create_shared)
    else:
        lib.export_library(lib_dir)

    with open(graph_dir, "w") as fo:
        fo.write(graph)
    with open(params_dir, "wb") as fo:
        fo.write(relay.save_param_dict(built_params))

    # upload module to device
    print("Upload...")
    remote = autotvm.measure.request_remote(device_key, '0.0.0.0', 9190,
                                            timeout=10000)
    remote.upload(lib_dir)
    # Load the file that was just uploaded: derive the remote name from the
    # uploaded path instead of spelling it a second time.
    rlib = remote.load_module(os.path.basename(lib_dir))

    # run once on the device
    ctx = remote.context(str(target), 0)
    module = runtime.create(graph, rlib, ctx)
    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
    module.set_input(input_name, data_tvm)
    module.set_input(**built_params)
    module.run()
    out_device = module.get_output(0).asnumpy()

    # Reference output: the cross-compiled library cannot run on this machine,
    # so build the same model for the host CPU and run that instead.
    with relay.build_config(opt_level=1):
        ref_graph, ref_lib, ref_params = relay.build_module.build(
            mod, target='llvm', params=params)
    ref_module = runtime.create(ref_graph, ref_lib, tvm.cpu(0))
    ref_module.set_input(input_name, data_tvm)
    ref_module.set_input(**ref_params)
    ref_module.run()
    out_host = ref_module.get_output(0).asnumpy()

    tvm.testing.assert_allclose(out_device, out_host, atol=1e-3)

    # evaluate on the device (remote module and remote context)
    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", ctx, number=12, repeat=10)
    prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
    print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
          (np.mean(prof_res), np.std(prof_res)))

# Only launch tuning when this file is executed as a script, so that importing
# the module does not start a tuning job.
if __name__ == "__main__":
    tune_and_evaluate(tuning_option)

######################################################################
# Sample Output
# -------------
# The tuning needs to compile many programs and extract feature from them.
# So a high performance CPU is recommended.
# One sample output is listed below.
# It takes about 2 hours on a 32T AMD Ryzen Threadripper.
#
# .. code-block:: bash
#
#    Extract tasks...
#    Tuning...
#    [Task  1/12]  Current/Best:   22.37/  52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
#    [Task  2/12]  Current/Best:    6.51/  18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
#    [Task  3/12]  Current/Best:    4.67/  24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done.
#    [Task  4/12]  Current/Best:   11.35/  46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done.
#    [Task  5/12]  Current/Best:    1.01/  19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done.
#    [Task  6/12]  Current/Best:    2.47/  23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done.
#    [Task  7/12]  Current/Best:   14.57/  33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done.
#    [Task  8/12]  Current/Best:    1.13/  17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
#    [Task  9/12]  Current/Best:   14.45/  22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
#    [Task 10/12]  Current/Best:    3.22/  15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
#    [Task 11/12]  Current/Best:   11.03/  32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
#    [Task 12/12]  Current/Best:    8.00/  21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
#    Compile...
#    Upload...
#    Evaluate inference time cost...
#    Mean inference time (std dev): 162.59 ms (0.06 ms)

######################################################################
#
# .. note:: **Experiencing Difficulties?**
#
#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
#   then there must be something wrong.
#
#   First, make sure you set the correct configuration of your device.
#   Then, you can print debug information by adding these lines in the beginning
#   of the script. It will print every measurement result, where you can find useful
#   error messages.
#
#   .. code-block:: python
#
#      import logging
#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
#
#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai

由于本来就没有把TVM完全吃透，这个代码就是直接在教程上修改而来。
修改的主要内容：

将try_spatial_pack_depthwise置true
修改n_trial和early_stopping
保存优化的结果（教程里面这点很坑，人家优化了几十个小时的结果就让他保存在/tmp/***/下面，程序一退出就自动删除了。）

终端测试

自动优化结束后，再将优化后的lib（tar）、graph（json）和params文件复制到Tx2上，用下面的脚本

import numpy as np
import tvm
from tvm.contrib import graph_runtime

# Artifacts copied over from the tuning machine.
path_lib = './100-net-fp16.tar'
path_graph = "./face_partial_tune_graph-fp16.json"
path_params = './face_partial_tune_params-fp16'

# Read the graph JSON and the serialized parameters; the `with` blocks close
# the files as soon as the contents are in memory.
with open(path_graph) as graph_file:
    loaded_json = graph_file.read()
loaded_lib = tvm.module.load(path_lib)
with open(path_params, 'rb') as params_file:
    loaded_params = bytearray(params_file.read())
input_data = tvm.nd.array(np.random.uniform(size=(1, 3, 112, 112)).astype('float32'))

input_name = "0"  # name of the graph input
ctx = tvm.cpu()
module = graph_runtime.create(loaded_json, loaded_lib, ctx)
module.set_input(input_name, input_data)
# The parameters are a serialized blob, so they go through load_params().
module.load_params(loaded_params)

# evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
      (np.mean(prof_res), np.std(prof_res)))

生成.so文件，并且计算一下推理耗时。
接下来建议看看优化后的结果是否跟原始模型有较大区别，这里我也提供一份代码：

import numpy as np
import tvm
import tvm.relay as relay
from tvm.contrib import graph_runtime
import torch
# import cv2 as cv
 
# Deployed artifacts: graph JSON, compiled library and serialized parameters.
test_json = '/home/face/tvm_cpp/modelFolder/face_partial_tune_graph-fp16-load.json'
test_lib = '/home/face/tvm_cpp/modelFolder/100-net-fp16-load.tar.so'
test_param = '/home/face/tvm_cpp/modelFolder/face_partial_tune_params-fp16-load'

# Use context managers so the file handles are closed right after reading.
with open(test_json) as graph_file:
    loaded_json = graph_file.read()
loaded_lib = tvm.module.load(test_lib)
with open(test_param, "rb") as params_file:
    loaded_params = bytearray(params_file.read())
 
def preprocess(img_src):
    """Turn a BGR image into a normalised float32 array with a leading batch axis.

    The image is converted to RGB, resized to 112x112, scaled by 1/255,
    rearranged from HWC to CHW and normalised with ``(x - 0.5) / 0.5``.

    Parameters
    ----------
    img_src : numpy.ndarray
        Image in BGR channel order (assumed 3-channel, e.g. as returned by
        ``cv.imread``).

    Returns
    -------
    numpy.ndarray
        Contiguous float32 array; shape (1, 3, 112, 112) for a 3-channel input.
    """
    # The module-level cv2 import is commented out, so import it where it is
    # used; otherwise calling this function raises NameError.
    import cv2 as cv

    rgb = cv.cvtColor(img_src, cv.COLOR_BGR2RGB)
    rgb = cv.resize(rgb, (112, 112))
    scaled = np.array(rgb).astype(np.float32) / 255.0
    chw = np.transpose(scaled, (2, 0, 1))
    # All three channels use the same mean/std, so normalise them in one step.
    chw = (chw - 0.5) / 0.5
    return chw[np.newaxis, :].copy()
 
# img = cv.imread("/home/face/anna/164_2.jpg")
# img_input = preprocess(img)
 
ctx = tvm.cpu(0)
module = graph_runtime.create(loaded_json, loaded_lib, ctx)
module.load_params(loaded_params)

# All-ones input, so the output can be compared with the torch model on the
# same tensor.
tempimg0 = torch.ones(1, 3, 112, 112)
# run the module; hand TVM an explicit numpy array instead of relying on the
# implicit conversion of a torch tensor
module.set_input("0", tempimg0.numpy())
module.run()
out_deploy = module.get_output(0).asnumpy()

print(out_deploy)

在这里没出意外，计算结果跟torch的误差很小，接下来可以着手c++部署了。在尝试c++部署的时候还是走了很多弯路的，主要是可参考的资料太少，话不多说上代码：
CMakeLists.txt

cmake_minimum_required(VERSION 2.8.12)

project(tvm_cpp)

set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_AUTOMOC ON)

# Root of the TVM checkout. CMake does not expand "~" in paths, so derive the
# default from $HOME; override with -DTVM_ROOT=/path/to/tvm if needed.
if(NOT DEFINED TVM_ROOT)
    set(TVM_ROOT "$ENV{HOME}/tvm")
endif()

# The target links Qt5::Core below, so fail at configure time if it is missing.
find_package(Qt5Core REQUIRED)
set(OpenCV_DIR /home/face/addition/opencv-3.4.2/build)
find_package (OpenCV REQUIRED)
if(OpenCV_FOUND)
    include_directories(${OpenCV_INCLUDE_DIRS})
    message(STATUS "OpenCV library status:")
    message(STATUS "    version: ${OpenCV_VERSION}")
    message(STATUS "    libraries: ${OpenCV_LIBS}")
    message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
endif()

# TVM headers and the bundled third-party headers they include.
include_directories("${TVM_ROOT}/include")
include_directories("${TVM_ROOT}/3rdparty/dlpack/include")
include_directories("${TVM_ROOT}/3rdparty/dmlc-core/include")

add_executable(${PROJECT_NAME} "main.cpp")

# NOTE(review): both libtvm.so and libtvm_runtime.so are linked, as in the
# original; presumably the runtime library alone is enough here - confirm.
target_link_libraries(${PROJECT_NAME}
    Qt5::Core
    "${TVM_ROOT}/build/libtvm.so"
    "${TVM_ROOT}/build/libtvm_runtime.so"
    ${OpenCV_LIBS}
)

main.cpp

#include <dirent.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>

#include <QCoreApplication>
#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/packed_func.h>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn/dnn.hpp>

// Appends the names of all entries of `dir_path` that do not start with '.'
// to `entries`. Returns false when `dir_path` cannot be opened as a directory
// (for example because it is a regular file).
static bool list_visible_entries(const std::string &dir_path, std::vector<std::string> &entries)
{
    DIR *dirp = opendir(dir_path.c_str());
    if (dirp == NULL)
        return false;

    struct dirent *dp;
    while ((dp = readdir(dirp)) != NULL)
    {
        // skip "." and ".." (and any other name starting with a dot)
        if (dp->d_name[0] == '.')
            continue;
        entries.push_back(dp->d_name);
    }
    (void)closedir(dirp);
    return true;
}

// Collects the paths of all entries located exactly three levels below
// `dir_name` (dir_name/<a>/<b>/<entry>) and appends them to `v`.
// Entries whose name starts with '.' are ignored at every level, and anything
// on the first two levels that is not a directory is skipped.
// Returns 0 on success and -1 when `dir_name` itself cannot be opened.
int find_dir_file(std::string dir_name, std::vector<std::string> &v) // directory path, output file list
{
    std::vector<std::string> first;
    if (!list_visible_entries(dir_name, first))
    {
        std::cerr << "find_dir_file: cannot open directory " << dir_name << std::endl;
        return -1;
    }
    std::cout << "first.size = " << first.size() << std::endl;

    // second level
    std::vector<std::string> sec;
    for (size_t i = 0; i < first.size(); i++)
    {
        const std::string second = dir_name + "/" + first[i];
        std::vector<std::string> names;
        if (!list_visible_entries(second, names))
            continue; // not a directory (or unreadable): nothing below it
        for (size_t j = 0; j < names.size(); j++)
            sec.push_back(second + "/" + names[j]);
    }
    std::cout << "sec.size = " << sec.size() << std::endl;

    // Print the first two second-level paths, but only if they exist; indexing
    // sec[0]/sec[1] unconditionally is out of bounds for small trees.
    for (size_t i = 0; i < sec.size() && i < 2; i++)
        std::cout << sec[i] << std::endl;

    // third level: these are the paths handed back to the caller
    for (size_t i = 0; i < sec.size(); i++)
    {
        std::vector<std::string> names;
        if (!list_visible_entries(sec[i], names))
            continue;
        for (size_t j = 0; j < names.size(); j++)
            v.push_back(sec[i] + "/" + names[j]);
    }

    return 0;
}

void Mat_to_CHW(float *data, cv::Mat &frame)
{
    assert(data && !frame.empty());
    unsigned int volChl = 112 * 112;

    for(int c = 0; c < 3; ++c)
    {
        for (unsigned j = 0; j < volChl; ++j)
            data[c*volChl + j] = static_cast<float>(float(frame.data[j * 3 + c]) / 255.0);
    }

}

int main(int argc, char *argv[])
{
    QCoreApplication a(argc, argv);

     std::vector<std::string> v;
     find_dir_file("/home/face/kaoqin_112/", v);
     int num = v.size();
     std::cout << "total img num = " << num << std::endl;


    // tvm module for compiled functions
    tvm::runtime::Module mod_syslib = tvm::runtime::Module::LoadFromFile("../modelFolder/100-net-fp16-load.tar.so");
    //load graph
    std::ifstream json_in("../modelFolder/face_partial_tune_graph-fp16-load.json");
    std::string json_data((std::istreambuf_iterator<char>(json_in)), std::istreambuf_iterator<char>());
    json_in.close();

    // parameters in binary
    std::ifstream params_in("../modelFolder/face_partial_tune_params-fp16-load", std::ios::binary);
    std::string params_data((std::istreambuf_iterator<char>(params_in)), std::istreambuf_iterator<char>());
    params_in.close();

    // parameters need to be TVMByteArray type to indicate the binary data
    TVMByteArray params_arr;
    params_arr.data = params_data.c_str();
    params_arr.size = params_data.length();

    int dtype_code = kDLFloat;
    int dtype_bits = 32;
    int dtype_lanes = 1;
    int device_type = kDLCPU;
    int device_id = 0;

    // get global function module for graph runtime
    tvm::runtime::Module mod = (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))(json_data, mod_syslib, device_type, device_id);

    DLTensor* x;
    int in_ndim = 4;
    int64_t in_shape[4] = {1, 3, 112, 112};
    TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);

    // create csv
    std::ofstream rgbData;
    rgbData.open("FeatureData.csv",std::ios::out | std::ios::trunc);

    // load image from cv mat
    float avg_time = 0;
    float totaltime = 0;
    for(int i=0; i<v.size(); i++)
    {
        cv::Mat tensor = cv::imread(v[i]);
        if(tensor.empty())
            continue;
        cv::cvtColor(tensor,tensor, cv::COLOR_BGR2RGB);
        float testinput[112*112*3];

        Mat_to_CHW(testinput, tensor);

        int size = sizeof(float32_t);
        memcpy(x->data, &testinput, 3 * 112 * 112 * size);

        // get the function from the module(set input data)
        tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input");
        set_input("0", x);

        // get the function from the module(load patameters)
        tvm::runtime::PackedFunc load_params = mod.GetFunction("load_params");
        load_params(params_arr);

        // get the function from the module(run it)
        tvm::runtime::PackedFunc run = mod.GetFunction("run");
        for(int j=0; j<1; j++)
        {
            double t = (double)cv::getTickCount();
            run();
            float timeuse = ((double)cv::getTickCount() - t)/ cv::getTickFrequency();
            if(i!=0)
            {
                totaltime+=timeuse;
                avg_time = totaltime/(float)i;
            }
            std::cout<<v[i]<<" time: "<< timeuse <<"averge time: "<<avg_time<<std::endl;
        }

        tvm::runtime::PackedFunc get_output = mod.GetFunction("get_output");
        tvm::runtime::NDArray res = get_output(0);
        float *p_res = (float *)res->data;
        std::vector<float> f1;
        float ssum=0;
        for(int j=0; j<512; j++)
        {
            ssum += p_res[j]*p_res[j];
        }
        ssum = sqrt(ssum);
        for(int j=0; j<512; j++)
        {
            f1.push_back(p_res[j]/ssum);
        }
        rgbData<<v[i]<<",";
        for(int j=0; j<512; j++)
        {
            rgbData<<f1[j]<<",";
        }
        rgbData<<std::endl;
    }

    rgbData.close();

    TVMArrayFree(x);

    return 0;
}