矩阵乘法(测试)

https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm2.htm

// GLSL ES 3.10 compute shader source: tiled single-precision matrix multiply.
//
// Indexing used by the shader (column-major storage):
//   A[k*M + row]    -> element (row, k)   of an M x K matrix
//   B[col*K + k]    -> element (k, col)   of a  K x N matrix
//   C[col*M + row]  -> element (row, col) of an M x N matrix
// The sizes are passed in the `MNK` uniform as (M, N, K).
//
// Each work group is TS x TS invocations and produces one TS x TS tile of C,
// staging matching tiles of A and B through shared memory.
// NOTE(review): assumes the host dispatches ceil(M/TS) x ceil(N/TS) work
// groups -- the dispatch call is not part of this snippet, confirm there.
//
// M, N and K do not have to be multiples of TS: out-of-range tile elements are
// loaded as 0.0 and out-of-range invocations skip the final store.
//
// `#version` must stay on the very first line of the string, so nothing may be
// inserted between the opening backtick and the directive.
const CSs = `#version 310 es
#define TS 32u
layout (local_size_x = TS, local_size_y = TS, local_size_z = 1) in;
layout (std430, binding = 0) readonly buffer ssbA {
  float A[];
};
layout (std430, binding = 1) readonly buffer ssbB {
  float B[];
};
layout (std430, binding = 2) writeonly buffer ssbC {
  float C[];
};
  uniform uvec3 MNK;          // (M, N, K)
  shared float Asub[TS][TS];  // Shared-memory tile of A (TS*TS elements)
  shared float Bsub[TS][TS];  // Shared-memory tile of B (TS*TS elements)
void main() {
    uint M = MNK.x, N = MNK.y, K = MNK.z;

    // Invocation identifiers
    uint row = gl_LocalInvocationID.x;          // Row inside the tile (0..TS-1)
    uint col = gl_LocalInvocationID.y;          // Column inside the tile (0..TS-1)
    uint globalRow = TS*gl_WorkGroupID.x + row; // Row of C handled by this invocation
    uint globalCol = TS*gl_WorkGroupID.y + col; // Column of C handled by this invocation

    // Invocations of edge work groups may fall outside C. They must still run
    // the whole loop (barrier() has to be reached by every invocation of the
    // group), so they are only masked at the loads and at the final store.
    bool rowInside = globalRow < M;
    bool colInside = globalCol < N;

    // Running dot product for C(globalRow, globalCol)
    float acc = 0.0;

    // Number of tiles along K, rounded up so a partial last tile is included
    uint numTiles = (K + TS - 1u) / TS;
    for (uint t = 0u; t < numTiles; t++) {

        // Load one element of the A tile and one of the B tile; anything
        // outside the matrices is replaced by 0.0 so it adds nothing to acc.
        // Only the selected branch of ?: is evaluated, so no out-of-range read.
        uint tiledRow = TS*t + row;
        uint tiledCol = TS*t + col;
        Asub[col][row] = (rowInside && tiledCol < K) ? A[tiledCol*M + globalRow] : 0.0;
        Bsub[col][row] = (colInside && tiledRow < K) ? B[globalCol*K + tiledRow] : 0.0;

        // Make the shared-memory writes visible and wait for the whole group
        memoryBarrierShared();
        barrier();

        // Accumulate the contribution of this tile
        for (uint k = 0u; k < TS; k++) {
            acc += Asub[k][row] * Bsub[col][k];
        }

        // Do not overwrite the tiles while other invocations still read them
        barrier();
    }

    // Store the result; invocations outside C write nothing
    if (rowInside && colInside) {
        C[globalCol*M + globalRow] = acc;
    }
}`;
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值