train_gpt2_fp32.cu - layernorm_forward_kernel3

109702008

于 2024-05-12 11:30:52 发布

阅读量645

点赞数 18

分类专栏：人工智能 # C语言；文章标签：人工智能 c语言

本文链接：https://blog.csdn.net/eidolon_foot/article/details/138662871

版权

C语言同时被 2 个专栏收录

208 篇文章 0 订阅 ¥19.90 ¥99.00

订阅专栏

超级会员免费看

人工智能

249 篇文章 3 订阅

订阅专栏

源码

__global__ void layernorm_forward_kernel3(float* __restrict__ out, float* __restrict__ mean, float* __restrict__ rstd,
                                    const float*  __restrict__ inp, const float*  __restrict__ weight,
                                    const float* __restrict__ bias, int N, int C) {
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
    int idx = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank();
    if(idx >= N) {
        return;
    }

    // the row of input that this group of threads is responsible for
    const float* x = inp + idx * C;

    // mean
    float sum = 0.0f;
    for (int i =