昇腾AI原生创新算子挑战赛（S1赛季）复盘- FastGeluGrad算子

最新推荐文章于 2024-07-05 15:25:39 发布

william_myq

最新推荐文章于 2024-07-05 15:25:39 发布

阅读量274

点赞数 6

文章标签： AI-native

本文链接：https://blog.csdn.net/xiujiti6871/article/details/139365969

版权

正文前感谢昇腾各位工作人员，没有你们的辛勤就没有我们的进步
本文旨在交流大赛中 FastGeluGrad 算子的编写过程
这道题是在FastGelu基础上的升级题目，嗯，不难，公式很吓人

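算子要求支持 fp16 和 fp32 两种数据类型，这两种类型都满足所用计算 API 的类型限制，也就是说不需要类型转换。
与 FastGelu 相比多了一个输入 dy，即共两个输入（dy、x）、一个输出，并不需要大幅度的数据搬运。
唯一需要解决的就是 Compute 的算法设计，下面的代码计算的是：
$$z = dy \cdot \frac{e^{-1.702|x|} + 1.702\,x\,e^{-1.702|x|} + e^{1.702(x-|x|)}}{\left(e^{-1.702|x|} + 1\right)^{2}}$$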

  /**
   * Computes one tile of FastGeluGrad and enqueues the result.
   *
   * With a = 1.702 and e = exp(-a * |x|), the vector calls below evaluate,
   * element-wise over tileLength elements:
   *
   *   out = dy * (e + exp(a * (x - |x|)) + a * x * e) / (1 + e)^2
   *
   * The tensor dequeued from inQueueIN is indexed as dy at offset 0 and x at
   * offset tileLength. calcBuf and calcBuf1 provide two scratch tensors
   * (presumably sized for at least tileLength elements -- confirm against Init).
   *
   * @param progress  Not read by this function (presumably the tile index
   *                  supplied by the caller).
   */
  __aicore__ inline void Compute(int32_t progress) {
    LocalTensor<DTYPE_X> inLocal = inQueueIN.DeQue<DTYPE_X>();

    // Views into the dequeued tensor: dy first, x starting at tileLength.
    LocalTensor<DTYPE_DY> dyLocal = inLocal[0];
    LocalTensor<DTYPE_X> xLocal = inLocal[this->tileLength];
    LocalTensor<DTYPE_Z> outLocal = outQueueOUT.AllocTensor<DTYPE_Z>();
    LocalTensor<DTYPE_Z> expNeg = calcBuf.Get<DTYPE_Z>();    // holds e = exp(-a|x|)
    LocalTensor<DTYPE_Z> numer = calcBuf1.Get<DTYPE_Z>();    // holds |x|, then the numerator

    const DTYPE_Z kAlpha = static_cast<DTYPE_Z>(1.702);
    const DTYPE_Z kNegAlpha = static_cast<DTYPE_Z>(-1.702);
    const DTYPE_Z kOne = static_cast<DTYPE_Z>(1);

    // |x| is needed by both exponent arguments; compute it once and reuse it.
    Abs(numer, xLocal, this->tileLength);                    // |x|

    Muls(expNeg, numer, kNegAlpha, this->tileLength);        // -a|x|
    Exp(expNeg, expNeg, this->tileLength);                   // e = exp(-a|x|)

    // Denominator: (1 + e)^2.
    Adds(outLocal, expNeg, kOne, this->tileLength);
    Mul(outLocal, outLocal, outLocal, this->tileLength);

    // Numerator: e + exp(a(x - |x|)) + a*x*e.
    Sub(numer, xLocal, numer, this->tileLength);             // x - |x|
    Muls(numer, numer, kAlpha, this->tileLength);            // a(x - |x|)
    Exp(numer, numer, this->tileLength);                     // exp(a(x - |x|))
    Add(numer, numer, expNeg, this->tileLength);             // e + exp(a(x - |x|))
    Mul(expNeg, expNeg, xLocal, this->tileLength);           // x*e (expNeg is repurposed here)
    Muls(expNeg, expNeg, kAlpha, this->tileLength);          // a*x*e
    Add(numer, numer, expNeg, this->tileLength);             // full numerator

    // Scale by the incoming gradient and divide by the denominator.
    Mul(numer, dyLocal, numer, this->tileLength);
    Div(outLocal, numer, outLocal, this->tileLength);

    outQueueOUT.EnQue<DTYPE_Z>(outLocal);

    inQueueIN.FreeTensor(inLocal);
  }