- Before the main text, my thanks to the Ascend staff; without your hard work there would be no progress on our side.
- This article walks through the process of building the InstanceNorm operator for the contest.
- The difficulty of this problem lies in implementing the algorithm's formula, and in the broadcast logic inside it.
- This implementation is based on the code of 0xcccccccc, from which I benefited greatly; my thanks and respect to 0xcccccccc.
- First, what is InstanceNorm?
- For the concepts, see [各种归一化层(BatchNorm、LayerNorm、InstanceNorm、GroupNorm、Weight Standardization)及其Pytorch实现-CSDN博客](https://blog.csdn.net/qq_23981335/article/details/106572171)
- BatchNorm: normalizes along the batch dimension, computing the mean over N×H×W
- LayerNorm: normalizes along the channel dimension, computing the mean over C×H×W
- InstanceNorm: normalizes within a single channel, computing the mean over H×W
- GroupNorm: splits the channels into groups and normalizes within each group, computing the mean over (C//G)×H×W
- The formula is as follows; we need to compute a mean and a variance:

  $$y = \gamma \cdot \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} + \beta$$

- Refined per instance n and channel c, the two statistics are:

  $$\mu_{nc} = \frac{1}{HW}\sum_{h=1}^{H}\sum_{w=1}^{W} x_{nchw}, \qquad \sigma_{nc}^{2} = \frac{1}{HW}\sum_{h=1}^{H}\sum_{w=1}^{W}\left(x_{nchw} - \mu_{nc}\right)^{2}$$
- How should the tiling be split for NDHWC, NCDHW, NHWC, NCHW and ND?
- In short, the loop indices i, j and k correspond to the three variables maxbatchSize, maxstepSize and maxsquareSize respectively (a sketch of the mapping follows the table):
| Format | maxbatchSize | maxstepSize | maxsquareSize |
| ------ | ------------ | ----------- | ------------- |
| NDHWC  | N            | C           | DHW           |
| NCDHW  | NC           | 1           | DHW           |
| NHWC   | N            | C           | HW            |
| NCHW   | NC           | 1           | HW            |
| ND     | dim[0] * dim[1] | 1        | total/batchSize |
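As a concrete illustration of the table, here is a minimal standalone sketch (ComputeTiling and its signature are mine, not the contest tiling function) that derives the three loop bounds from a shape and a format string:

```cpp
#include <cstdint>
#include <functional>
#include <numeric>
#include <string>
#include <vector>

// Hypothetical helper mirroring the table above.
void ComputeTiling(const std::string &format, const std::vector<int64_t> &dims,
                   uint64_t &maxbatchSize, uint64_t &maxstepSize, uint64_t &maxsquareSize) {
    uint64_t total = std::accumulate(dims.begin(), dims.end(), uint64_t{1},
                                     std::multiplies<uint64_t>());
    if (format == "NDHWC" || format == "NHWC") {
        maxbatchSize = dims.front();                           // N
        maxstepSize = dims.back();                             // C sits innermost
        maxsquareSize = total / (maxbatchSize * maxstepSize);  // DHW or HW
    } else { // NCDHW, NCHW, ND: each (N, C) pair owns one contiguous block
        maxbatchSize = static_cast<uint64_t>(dims[0]) * dims[1];
        maxstepSize = 1;
        maxsquareSize = total / maxbatchSize;                  // DHW, HW, or total/batchSize
    }
}
```

For NCHW with shape {2, 3, 4, 5} this yields maxbatchSize = 6, maxstepSize = 1 and maxsquareSize = 20.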
- The compute can therefore be expressed with the following code (the code originates from 0xcccccccc, with some modifications).
- Note: writing through GlobalTensor::SetValue (e.g. Gm_variance.SetValue) reported an error here, so the code assigns through operator() instead; the SetValue calls are left commented out.
```cpp
__aicore__ inline void Process() {
    // Pass 1: mean for each (i, j) pair over the maxsquareSize elements.
    for (uint64_t i = 0; i < maxbatchSize; ++i) {
        for (uint64_t j = 0; j < maxstepSize; ++j) {
            float sum = 0.0;
            for (uint64_t k = 0; k < maxsquareSize; ++k) {
                float val = Gm_x.GetValue(i * maxsquareSize * maxstepSize + k * maxstepSize + j);
                sum += val;
            }
            float avg = sum / maxsquareSize;
            Gm_mean(i * maxstepSize + j) = (T)avg;
            // Gm_mean.SetValue(i * maxstepSize + j, (T)avg);
        }
    }
    // Pass 2: variance for each (i, j) pair around the mean from pass 1.
    for (uint64_t i = 0; i < maxbatchSize; ++i) {
        for (uint64_t j = 0; j < maxstepSize; ++j) {
            float avg = Gm_mean.GetValue(i * maxstepSize + j);
            float sum = 0.0;
            for (uint64_t k = 0; k < maxsquareSize; ++k) {
                float val = Gm_x.GetValue(i * maxsquareSize * maxstepSize + k * maxstepSize + j);
                sum += (val - avg) * (val - avg);
            }
            float var = sum / maxsquareSize;
            Gm_variance(i * maxstepSize + j) = (T)var;
            // Gm_variance.SetValue(i * maxstepSize + j, (T)var);
        }
    }
    // Pass 3: normalize, then apply the gamma/beta affine transform;
    // the % indexing broadcasts gamma and beta (explained below).
    for (uint64_t i = 0; i < maxbatchSize; ++i) {
        for (uint64_t j = 0; j < maxstepSize; ++j) {
            float mean = Gm_mean.GetValue(i * maxstepSize + j);
            float variance = Gm_variance.GetValue(i * maxstepSize + j);
            for (uint64_t k = 0; k < maxsquareSize; ++k) {
                auto index = i * maxsquareSize * maxstepSize + k * maxstepSize + j;
                float x = Gm_x.GetValue(index);
                float gamma = Gm_gamma.GetValue(i % batchSize[1] * batchOffset[1] + k % squareSize[1] * stepSize[1] + j % stepSize[1]);
                float beta = Gm_beta.GetValue(i % batchSize[2] * batchOffset[2] + k % squareSize[2] * stepSize[2] + j % stepSize[2]);
                float result = gamma * ((x - mean) / sqrt(variance + epsilon)) + beta;
                // Gm_y.SetValue(index, (T)result);
                Gm_y(index) = (T)result;
            }
        }
    }
}
```
- Applying the % operation to i, j and k achieves the broadcast effect, as the sketch below illustrates:

```cpp
i % batchSize[2] * batchOffset[2] + k % squareSize[2] * stepSize[2] + j % stepSize[2]
```
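To make the modulo broadcast concrete, here is a minimal host-side sketch (the shapes and names are mine; batchOffset is assumed to be the per-batch stride, i.e. squareSize * stepSize). A per-channel beta of logical shape [1][1][C] is reused across N and HW because i % 1 and k % 1 both collapse to 0:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Output logically [N=2][HW=3][C=4]; beta is per channel, logically [1][1][4].
    const uint64_t N = 2, HW = 3, C = 4;
    const uint64_t batchSize = 1, squareSize = 1, stepSize = C; // beta's extents
    const uint64_t batchOffset = squareSize * stepSize;         // stride of one batch
    std::vector<float> beta = {0.1f, 0.2f, 0.3f, 0.4f};
    for (uint64_t i = 0; i < N; ++i) {
        for (uint64_t k = 0; k < HW; ++k) {
            for (uint64_t j = 0; j < C; ++j) {
                // i % 1 == 0 and k % 1 == 0, so the same C values repeat everywhere.
                uint64_t idx = i % batchSize * batchOffset + k % squareSize * stepSize + j % stepSize;
                std::printf("y[%llu][%llu][%llu] uses beta[%llu] = %.1f\n",
                            (unsigned long long)i, (unsigned long long)k,
                            (unsigned long long)j, (unsigned long long)idx, beta[idx]);
            }
        }
    }
    return 0;
}
```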
- Because the operator's OperatorDesc uses dataFormat and epsilon, the corresponding members have to be added in AclNNInvocation/inc/operator_desc.h:
```cpp
struct OperatorDesc {
    /**
     * Constructor
     */
    explicit OperatorDesc();

    /**
     * Destructor
     */
    virtual ~OperatorDesc();

    /**
     * Add an input tensor description
     * @param [in] dataType: data type
     * @param [in] numDims: number of dims
     * @param [in] dims: dims
     * @param [in] format: format
     * @return OperatorDesc
     */
    OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);

    /**
     * Add an output tensor description
     * @param [in] dataType: data type
     * @param [in] numDims: number of dims
     * @param [in] dims: dims
     * @param [in] format: format
     * @return OperatorDesc
     */
    OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format);

    std::string opType;
    char *dataFormat;  // added for InstanceNorm: the data_format attribute
    double epsilon;    // added for InstanceNorm: the epsilon attribute
    std::vector<aclTensorDesc *> inputDesc;
    std::vector<aclTensorDesc *> outputDesc;
};
```
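These two members still have to reach the actual single-op call. A minimal sketch of how they might be attached, assuming the runner builds an aclopAttr the way the CANN samples do, and assuming the operator's attribute names are data_format and epsilon (taken from this article's context, not verified against the op prototype):

```cpp
#include "acl/acl.h"
#include "acl/acl_op.h"

// Hypothetical helper: attach the two InstanceNorm attributes before launch.
// aclopSetAttrString / aclopSetAttrFloat are stock ACL attribute setters.
aclError SetInstanceNormAttrs(aclopAttr *attr, const OperatorDesc &opDesc) {
    aclError ret = aclopSetAttrString(attr, "data_format", opDesc.dataFormat);
    if (ret != ACL_SUCCESS) {
        return ret;
    }
    return aclopSetAttrFloat(attr, "epsilon", static_cast<float>(opDesc.epsilon));
}
```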
- When invoking the operator, std::chrono::steady_clock::time_point makes it easy to print the elapsed time:
```cpp
#include <chrono>
#include <iostream>

// RAII timer: starts on construction, prints the elapsed time on destruction.
struct Timer {
    std::chrono::steady_clock::time_point start;
    Timer() : start(std::chrono::steady_clock::now()) {}
    ~Timer() {
        auto finish = std::chrono::steady_clock::now();
        auto runtime = std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
        std::cerr << "\033[1;31mRuntime: " << runtime / 1e6 << "s\033[0m" << std::endl;
    }
};
```
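Usage is scope based: construct a Timer at the top of the block you want to measure, and the destructor prints the elapsed time when the block exits (RunOp below is a placeholder for the actual invocation):

```cpp
{
    Timer timer; // starts timing here
    RunOp();     // placeholder for the operator invocation being measured
}                // Timer destructor fires and prints "Runtime: ...s"
```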
- In gen_data.py, the format can be written out to a file, so that the operator invocation reads that file back and recognizes the operator's format automatically.
- That way there is no need to manually edit the input/output formats in main.cpp, which streamlines the workflow:
with open("./output/meta", "w") as fp:
if dtype == np.float32:
print("float32", file=fp)
else:
print("float16", file=fp)
print(data_format, file=fp)
for i in shape_x:
print(i, file=fp)
print("*", file=fp)
for i in shape_gamma:
print(i, file=fp)
print("*", file=fp)
for i in shape_beta:
print(i, file=fp)
print("*", file=fp)
print(epsilon, file=fp)
复制
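As an illustration (the values below are made up, assuming an NCHW case with x of shape (2, 3, 4, 5) and per-channel gamma/beta of shape (3,)), the resulting meta file would look like this:

```
float32
NCHW
2
3
4
5
*
3
*
3
*
1e-05
```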
- And in main.cpp:
```cpp
#include <fstream>
#include <string>
#include <vector>

std::string dtype;
std::string dataFormat;
std::string length;

std::fstream meta("../output/meta");
OperatorDesc opDesc;
meta >> dtype;
meta >> dataFormat;
opDesc.dataFormat = (char *)dataFormat.c_str();
// Read the three "*"-terminated shape lists written by gen_data.py.
std::vector<int64_t> shape_x;
while (meta >> length) {
    if (length == "*") break;
    shape_x.push_back(std::stoi(length));
}
std::vector<int64_t> shape_gamma;
while (meta >> length) {
    if (length == "*") break;
    shape_gamma.push_back(std::stoi(length));
}
std::vector<int64_t> shape_beta;
while (meta >> length) {
    if (length == "*") break;
    shape_beta.push_back(std::stoi(length));
}
// Mean/variance are produced per (N, C), so pick those two dims out of shape_x.
std::vector<int64_t> shapeMean;
if (dataFormat == "NDHWC") {
    shapeMean = {shape_x[0], shape_x[4]};
} else if (dataFormat == "NCDHW") {
    shapeMean = {shape_x[0], shape_x[1]};
} else if (dataFormat == "NHWC") {
    shapeMean = {shape_x[0], shape_x[3]};
} else if (dataFormat == "NCHW") {
    shapeMean = {shape_x[0], shape_x[1]};
} else { // ND
    shapeMean = {shape_x[0], shape_x[1]};
}
meta >> opDesc.epsilon;
aclDataType dataType = (dtype == "float32" ? ACL_FLOAT : ACL_FLOAT16);
aclFormat format = ACL_FORMAT_ND;
```
- Test case 1
- Test case 2
- Test case 3
- Test case 4
- Test case 5
- Finally, there are a few small things I still don't understand. I'm laying them out here; if a kind soul happens to pass by, please don't hesitate to enlighten me.
- Question 1
- Why is packNumber sliced this way? What is the point of keeping only the highest of its 20 bits and then making sure it divides evenly?
```cpp
uint64_t packNumber = rest / (maxtotalSize / maxbatchSize * 4);
for (int i = 0; i < 20; ++i) { // over 20 bits, keep only the highest set bit
    if ((packNumber >> i) > 1) {
        packNumber &= ~(1 << i);
    }
}
while (maxbatchSize % packNumber) { // halve until it divides maxbatchSize
    packNumber >>= 1;
}
```
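For what it's worth, here is a standalone sketch (the starting values are made up) that prints what the snippet computes: the loop rounds packNumber down to its highest power of two, and the while then halves it until it divides maxbatchSize evenly. Why that particular granularity was chosen is the open question.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    uint64_t packNumber = 13;   // made-up starting value: 0b1101
    uint64_t maxbatchSize = 12; // made-up batch count
    for (int i = 0; i < 20; ++i) {
        if ((packNumber >> i) > 1) {
            packNumber &= ~(1ULL << i); // clear bit i whenever a higher bit is set
        }
    }
    std::printf("highest power of two: %llu\n", (unsigned long long)packNumber); // 8
    while (maxbatchSize % packNumber) {
        packNumber >>= 1;
    }
    std::printf("divides maxbatchSize: %llu\n", (unsigned long long)packNumber); // 4
    return 0;
}
```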
- Question 2
- In GroupReduce, factor has its trailing zero bits shifted away, and number gets every bit from its leading 1 down to bit 0 set. What are these operations for? I couldn't follow this code:
```cpp
const int32_t factor = group_size / (group_size & -group_size); // odd part of group_size
int32_t number = (256 / SIZE) / factor;
number |= (number >> 1); // smear the leading 1 bit downward...
number |= (number >> 2);
number |= (number >> 4); // ...so every bit below it is set
int32_t reduceCount = (number ^ (number >> 1)) * factor; // highest power of two times factor
```
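As with question 1, a small standalone sketch (the SIZE and group_size values are made up) at least shows what is being computed: reduceCount is the largest value of the form factor * 2^k that does not exceed 256 / SIZE, where factor is the odd part of group_size. Why GroupReduce wants that shape of count is the part I can't answer.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int32_t SIZE = 4;        // made-up: e.g. sizeof(float)
    const int32_t group_size = 24; // made-up: 24 = 2^3 * 3, odd part 3
    const int32_t factor = group_size / (group_size & -group_size); // 3
    int32_t number = (256 / SIZE) / factor;                         // 64 / 3 = 21
    number |= (number >> 1);
    number |= (number >> 2);
    number |= (number >> 4);                                        // 21 -> 31
    int32_t reduceCount = (number ^ (number >> 1)) * factor;        // 16 * 3 = 48
    std::printf("factor=%d reduceCount=%d\n", factor, reduceCount);
    return 0;
}
```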
- I hope someone can make sense of these; any advice would be much appreciated. Thanks!