Ascend AI Native Innovation Operator Challenge (Season S1) Retrospective: The InstanceNorm Operator

  • Before the main text: thanks to everyone on the Ascend team; without your hard work there would be no progress on our side.
  • The intent of this post is to share how the competition's InstanceNorm operator was written.
  • The difficulty of this problem lies in implementing the algorithm's formulas and in the broadcast logic they involve.
  • This implementation is based on the code of the great 0xcccccccc, from which I benefited enormously; thanks and respect to 0xcccccccc.

  • First, what is InstanceNorm?
  • For the concept, see [各种归一化层(BatchNorm、LayerNorm、InstanceNorm、GroupNorm、Weight Standardization)及其Pytorch实现-CSDN博客](https://blog.csdn.net/qq_23981335/article/details/106572171)

  • BatchNorm: normalizes along the batch direction, averaging over N·H·W
  • LayerNorm: normalizes along the channel direction, averaging over C·H·W
  • InstanceNorm: normalizes within one channel, averaging over H·W
  • GroupNorm: splits the channels into groups and normalizes within each group, averaging over (C//G)·H·W

(figure: the dimensions each normalization scheme averages over)

  • The formula is as follows; it requires the mean and the variance:

$$\mu = \frac{1}{m}\sum_{i=1}^{m} x_i,\qquad \sigma^2 = \frac{1}{m}\sum_{i=1}^{m}\left(x_i-\mu\right)^2,\qquad y_i = \gamma\,\frac{x_i-\mu}{\sqrt{\sigma^2+\epsilon}}+\beta$$

  • Refined to a per-instance form (one (n, c) pair; NCHW shown), the formula becomes:

$$\mu_{nc} = \frac{1}{HW}\sum_{h=1}^{H}\sum_{w=1}^{W} x_{nchw},\qquad \sigma^2_{nc} = \frac{1}{HW}\sum_{h=1}^{H}\sum_{w=1}^{W}\left(x_{nchw}-\mu_{nc}\right)^2,\qquad y_{nchw} = \gamma_c\,\frac{x_{nchw}-\mu_{nc}}{\sqrt{\sigma^2_{nc}+\epsilon}}+\beta_c$$
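  • To make the refined formula concrete, here is a minimal CPU reference implementation (my own sketch for checking results, assuming an NCHW float input; InstanceNormRef and its plain-array interface are not part of the competition framework):

#include <cmath>
#include <cstddef>

// CPU reference for InstanceNorm, NCHW layout: each (n, c) pair is normalized
// over its H*W elements, then scaled by gamma[c] and shifted by beta[c].
void InstanceNormRef(const float *x, const float *gamma, const float *beta,
                     float *y, size_t N, size_t C, size_t H, size_t W, float eps) {
    const size_t HW = H * W;
    for (size_t n = 0; n < N; ++n) {
        for (size_t c = 0; c < C; ++c) {
            const float *in = x + (n * C + c) * HW;
            float *out = y + (n * C + c) * HW;
            float mean = 0.0f;
            for (size_t i = 0; i < HW; ++i) mean += in[i];
            mean /= HW;
            float var = 0.0f;
            for (size_t i = 0; i < HW; ++i) var += (in[i] - mean) * (in[i] - mean);
            var /= HW;
            const float inv = 1.0f / std::sqrt(var + eps);
            for (size_t i = 0; i < HW; ++i) {
                out[i] = gamma[c] * (in[i] - mean) * inv + beta[c];
            }
        }
    }
}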

  • How is the tiling split implemented for NDHWC, NCDHW, NHWC, NCHW, and ND?
  • In short, the loop indices i, j, k correspond to the three variables maxbatchSize, maxstepSize, and maxsquareSize, as laid out in the table below (total denotes the total element count of x; a host-side sketch follows the table):
| dataFormat | viewed as (batch · step · square) | maxbatchSize | maxstepSize | maxsquareSize |
| --- | --- | --- | --- | --- |
| NDHWC | N · C · DHW | dim[0] | dim[4] | dim[1] * dim[2] * dim[3] |
| NCDHW | NC · 1 · DHW | dim[0] * dim[1] | 1 | total / batchSize |
| NHWC | N · C · HW | dim[0] | dim[3] | dim[1] * dim[2] |
| NCHW | NC · 1 · HW | dim[0] * dim[1] | 1 | total / batchSize |
| ND | ND · 1 · rest | dim[0] * dim[1] | 1 | total / batchSize |
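  • A minimal host-side sketch of how the three variables can be derived from the input dims (ComputeTiling and the dims/total names are mine, for illustration; the kernel's actual tiling code differs):

#include <cstdint>
#include <string>
#include <vector>

// Derive (maxbatchSize, maxstepSize, maxsquareSize) from the data format.
// Channel-last layouts keep the channel as the innermost "step" axis;
// channel-first layouts (and ND) fold N and C into one batch axis, step = 1.
void ComputeTiling(const std::string &fmt, const std::vector<int64_t> &dims,
                   uint64_t &maxbatchSize, uint64_t &maxstepSize, uint64_t &maxsquareSize) {
    uint64_t total = 1;
    for (int64_t d : dims) total *= d;
    if (fmt == "NDHWC") {            // N, DHW, C
        maxbatchSize = dims[0];
        maxstepSize = dims[4];
        maxsquareSize = dims[1] * dims[2] * dims[3];
    } else if (fmt == "NHWC") {      // N, HW, C
        maxbatchSize = dims[0];
        maxstepSize = dims[3];
        maxsquareSize = dims[1] * dims[2];
    } else {                         // NCDHW, NCHW, ND
        maxbatchSize = dims[0] * dims[1];
        maxstepSize = 1;
        maxsquareSize = total / maxbatchSize;
    }
}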

  • The compute stage can therefore be expressed with the following code (code from 0xcccccccc, with some modifications).
  • Note: GlobalTensor::SetValue (e.g. Gm_variance.SetValue) raised an error when writing results, so the writes go through operator() instead; the SetValue variants are kept commented out.
    __aicore__ inline void Process() {
        // Pass 1: mean over the square axis for every (batch, step) pair.
        for (uint64_t i = 0; i < maxbatchSize; ++i) {
            for (uint64_t j = 0; j < maxstepSize; ++j) {
                float sum = 0.0;
                for (uint64_t k = 0; k < maxsquareSize; ++k) {
                    float val = Gm_x.GetValue(i * maxsquareSize * maxstepSize + k * maxstepSize + j);
                    sum += val;
                }
                float avg = sum / maxsquareSize;
                Gm_mean(i * maxstepSize + j) = (T)avg;
                // Gm_mean.SetValue(i * maxstepSize + j, (T)avg);
            }
        }
        // Pass 2: variance, reusing the mean computed above.
        for (uint64_t i = 0; i < maxbatchSize; ++i) {
            for (uint64_t j = 0; j < maxstepSize; ++j) {
                float avg = Gm_mean.GetValue(i * maxstepSize + j);
                float sum = 0.0;
                for (uint64_t k = 0; k < maxsquareSize; ++k) {
                    float val = Gm_x.GetValue(i * maxsquareSize * maxstepSize + k * maxstepSize + j);
                    sum += (val - avg) * (val - avg);
                }
                float var = sum / maxsquareSize;
                Gm_variance(i * maxstepSize + j) = (T)var;
                // Gm_variance.SetValue(i * maxstepSize + j, (T)var);
            }
        }
        // Pass 3: normalize, with gamma/beta broadcast via the modulo indices.
        for (uint64_t i = 0; i < maxbatchSize; ++i) {
            for (uint64_t j = 0; j < maxstepSize; ++j) {
                float mean = Gm_mean.GetValue(i * maxstepSize + j);
                float variance = Gm_variance.GetValue(i * maxstepSize + j);
                for (uint64_t k = 0; k < maxsquareSize; ++k) {
                    auto index = i * maxsquareSize * maxstepSize + k * maxstepSize + j;
                    float x = Gm_x.GetValue(index);
                    float gamma = Gm_gamma.GetValue(i % batchSize[1] * batchOffset[1] + k % squareSize[1] * stepSize[1] + j % stepSize[1]);
                    float beta = Gm_beta.GetValue(i % batchSize[2] * batchOffset[2] + k % squareSize[2] * stepSize[2] + j % stepSize[2]);
                    float result = gamma * ((x - mean) / sqrt(variance + epsilon)) + beta;
                    // Gm_y.SetValue(index, (T)result);
                    Gm_y(index) = (T)result;
                }
            }
        }
    }

  • Applying % to i, j, k implements the broadcast effect (a toy demonstration follows):
i % batchSize[2] * batchOffset[2] + k % squareSize[2] * stepSize[2] + j % stepSize[2]
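  • A standalone toy to see the broadcast concretely (the values batchSize[2] = 1, squareSize[2] = 1, stepSize[2] = 4 below are my assumed bookkeeping for a per-channel beta of length C = 4 under a channel-last layout):

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical shape bookkeeping for beta (slot [2] in the kernel).
    const uint64_t batchSize = 1, batchOffset = 4, squareSize = 1, stepSize = 4;
    for (uint64_t i = 0; i < 2; ++i)            // batch axis
        for (uint64_t k = 0; k < 2; ++k)        // square (spatial) axis
            for (uint64_t j = 0; j < 4; ++j) {  // step (channel) axis
                uint64_t idx = i % batchSize * batchOffset + k % squareSize * stepSize + j % stepSize;
                std::printf("i=%llu k=%llu j=%llu -> beta index %llu\n",
                            (unsigned long long)i, (unsigned long long)k,
                            (unsigned long long)j, (unsigned long long)idx);
            }
    return 0;  // every (i, k) maps to beta[j]: one value per channel, broadcast elsewhere
}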

  • Because the operator's OperatorDesc uses dataFormat and epsilon, the corresponding members have to be added in AclNNInvocation/inc/operator_desc.h:
struct OperatorDesc {
    /**
     * Constructor
     */
    explicit OperatorDesc();

    /**
     * Destructor
     */
    virtual ~OperatorDesc();

    /**
     * Add an input tensor description
     * @param [in] dataType: data type
     * @param [in] numDims: number of dims
     * @param [in] dims: dims
     * @param [in] format: format
     * @return OperatorDesc
     */
    OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);

    /**
     * Add an output tensor description
     * @param [in] dataType: data type
     * @param [in] numDims: number of dims
     * @param [in] dims: dims
     * @param [in] format: format
     * @return OperatorDesc
     */
    OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);

    std::string opType;
    char* dataFormat;
    double epsilon;
    std::vector<aclTensorDesc *> inputDesc;
    std::vector<aclTensorDesc *> outputDesc;
};

  • When invoking the operator, a timer built on std::chrono::steady_clock::time_point can report the runtime (a usage example follows the struct):
#include <chrono>
#include <iostream>

// RAII timer: starts on construction, prints the elapsed time on destruction.
struct Timer {
    std::chrono::steady_clock::time_point start;
    Timer() : start(std::chrono::steady_clock::now()) {}
    ~Timer() {
        auto finish = std::chrono::steady_clock::now();
        auto runtime = std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
        std::cerr << "\033[1;31mRuntime: " << runtime / 1e6 << "s\033[0m" << std::endl;
    }
};
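  • Usage is scope based, for example (RunInstanceNormOp is a hypothetical stand-in for whatever invocation should be timed; the Timer struct above is assumed to be in scope):

#include <thread>

void RunInstanceNormOp() {  // hypothetical placeholder for the real operator call
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
}

int main() {
    Timer t;                // timing starts at construction
    RunInstanceNormOp();
    return 0;               // Timer's destructor prints "Runtime: ...s" here
}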

  • In gen_data.py, the test configuration can be written to a file so that the operator invocation reads it back and identifies the operator's dtype and format automatically (a sample file follows the snippet).
  • This removes the need to hand-edit the input/output formats in main.cpp and streamlines the workflow.
    with open("./output/meta", "w") as fp:
        if dtype == np.float32:
            print("float32", file=fp)
        else:
            print("float16", file=fp)
        print(data_format, file=fp)
        for i in shape_x:
            print(i, file=fp)
        print("*", file=fp)
        for i in shape_gamma:
            print(i, file=fp)
        print("*", file=fp)
        for i in shape_beta:
            print(i, file=fp)
        print("*", file=fp)
        print(epsilon, file=fp)
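  • For example, a float16 NCHW case with shape_x = (2, 3, 4, 5), shape_gamma = shape_beta = (3,), and epsilon = 1e-5 (made-up values) yields a meta file like:

float16
NCHW
2
3
4
5
*
3
*
3
*
1e-05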

  • In main.cpp, the metadata is read back and the tensor descriptions are assembled from it:
    std::string dtype, dataFormat, length;  // declarations added for this excerpt
    std::fstream meta("../output/meta");
    OperatorDesc opDesc;
    meta >> dtype;
    meta >> dataFormat;
    opDesc.dataFormat = (char *)dataFormat.c_str();
    std::vector<int64_t> shape_x;
    while (meta >> length) {
        if (length == "*") break;
        shape_x.push_back(std::stoi(length));
    }
    std::vector<int64_t> shape_gamma;
    while (meta >> length) {
        if (length == "*") break;
        shape_gamma.push_back(std::stoi(length));
    }
    std::vector<int64_t> shape_beta;
    while (meta >> length) {
        if (length == "*") break;
        shape_beta.push_back(std::stoi(length));
    }
    // mean/variance have shape {N, C}; pick the channel dim by data format
    std::vector<int64_t> shapeMean;
    if (dataFormat == "NDHWC") {
        shapeMean = {shape_x[0], shape_x[4]};
    }
    else if (dataFormat == "NCDHW") {
        shapeMean = {shape_x[0], shape_x[1]};
    }
    else if (dataFormat == "NHWC") {
        shapeMean = {shape_x[0], shape_x[3]};
    }
    else if (dataFormat == "NCHW") {
        shapeMean = {shape_x[0], shape_x[1]};
    }
    else {
        shapeMean = {shape_x[0], shape_x[1]};
    }
    meta >> opDesc.epsilon;

    aclDataType dataType = (dtype == "float32" ? ACL_FLOAT : ACL_FLOAT16);
    aclFormat format = ACL_FORMAT_ND;

  • Test cases 1 through 5

(figures: result screenshots of the five test cases)

  • Finally, there are a few small things I still don't understand. I'm laying them out here; if a kind soul passes by, please don't hold back your advice.

  • Question 1
  • Why is packNumber trimmed this way? What is the point of keeping the highest set bit among the low 20 bits and then forcing it to divide evenly?
    uint64_t packNumber = rest / (maxtotalSize / maxbatchSize * 4);
    for (int i = 0; i < 20; ++i) { // scan the low 20 bits, keeping only the highest set bit
        if ((packNumber >> i) > 1) {
            packNumber &= ~(1 << i);
        }
    }
    while (maxbatchSize % packNumber) {
        packNumber >>= 1;
    }
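  • The snippet can at least be reproduced in isolation to observe what it computes (a standalone sketch of mine, not the competition code): the loop rounds packNumber down to a power of two, and the while keeps halving until it divides maxbatchSize; why that is the right split is exactly what I'm unsure about.

#include <cassert>
#include <cstdint>

// Isolated reproduction of the packNumber trimming for experimentation.
uint64_t TrimPackNumber(uint64_t packNumber, uint64_t maxbatchSize) {
    for (int i = 0; i < 20; ++i) {        // clear all set bits below the highest one
        if ((packNumber >> i) > 1) {
            packNumber &= ~(1ULL << i);
        }
    }
    while (maxbatchSize % packNumber) {   // halve until it divides maxbatchSize
        packNumber >>= 1;
    }
    return packNumber;
}

int main() {
    assert(TrimPackNumber(13, 12) == 4);  // 13 -> 8 (power of two), 8 -> 4 (divides 12)
    assert(TrimPackNumber(13, 16) == 8);  // 13 -> 8, and 8 already divides 16
    return 0;
}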

  • Question 2
  • In GroupReduce, factor is group_size shifted right to strip its trailing zeros, and number is filled with ones from its leading 1 down to the end. What do these operations accomplish? I couldn't make sense of this code.

    const int32_t factor = group_size / (group_size & -group_size); // group_size & -group_size isolates the lowest set bit
    int32_t number = (256 / SIZE) / factor;
    number |= (number >> 1); // smear the leading 1 over all lower bits
    number |= (number >> 2);
    number |= (number >> 4);
    int32_t reduceCount = (number ^ (number >> 1)) * factor; // keep only the leading 1
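  • Again, a standalone sketch to probe the arithmetic (SIZE is assumed here to be the element size in bytes, e.g. 4 for float; that reading is mine):

#include <cassert>
#include <cstdint>

// Isolated reproduction of the GroupReduce count. Mechanically: factor is the
// odd part of group_size; the shifts smear number's leading 1 over all lower
// bits, and number ^ (number >> 1) then keeps only that leading 1, i.e. the
// largest power of two <= number. The result is factor times that power of two.
int32_t ReduceCount(int32_t group_size, int32_t SIZE) {
    const int32_t factor = group_size / (group_size & -group_size);
    int32_t number = (256 / SIZE) / factor;
    number |= (number >> 1);
    number |= (number >> 2);
    number |= (number >> 4);
    return (number ^ (number >> 1)) * factor;
}

int main() {
    assert(ReduceCount(12, 4) == 48);  // factor = 3, (256/4)/3 = 21 -> 16, 16 * 3 = 48
    assert(ReduceCount(8, 4) == 64);   // factor = 1, 64 is already a power of two
    return 0;
}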

  • I hope someone can make sense of these and will be generous with their guidance. Thanks!