矩阵乘法(测试)

最新推荐文章于 2020-11-29 07:29:03 发布

makefish

最新推荐文章于 2020-11-29 07:29:03 发布

阅读量330

点赞数 1

本文链接：https://blog.csdn.net/makefish/article/details/88906526

版权

参考来源：https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm2.htm

const CSs = `#version 310 es
// Tile edge length. It is used both as the work-group size in x and y and as
// the edge length of the two shared tiles declared below.
#define TS 32u
// One work group is TS x TS x 1 invocations.
// NOTE(review): TS*TS = 1024 invocations per group - confirm the target
// device's GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS allows this.
layout (local_size_x = TS, local_size_y = TS, local_size_z = 1) in;
// Input matrix A, read-only, bound at binding point 0.
// main() indexes it as A[k*M + row].
layout (std430, binding = 0) readonly buffer ssbA {
  float A[];
};
// Input matrix B, read-only, bound at binding point 1.
// main() indexes it as B[col*K + k].
layout (std430, binding = 1) readonly buffer ssbB {
  float B[];
};
// Output matrix C, write-only, bound at binding point 2.
// main() writes it as C[col*M + row].
layout (std430, binding = 2) writeonly buffer ssbC {
  float C[];
};
// Problem dimensions; main() unpacks them as M = MNK.x, N = MNK.y, K = MNK.z.
  uniform uvec3 MNK;
// Per-work-group staging tiles, refilled by main() once per step along K.
  shared float Asub[TS][TS];  // TS*TS tile of A, shared by the work group
  shared float Bsub[TS][TS];  // TS*TS tile of B, shared by the work group
// Tiled single-precision matrix multiply: each invocation accumulates one
// element of C as the sum over k of A[k*M + row] * B[col*K + k] and stores
// it at C[col*M + row].
//
// The K dimension is walked in steps of TS. At every step the work group
// cooperatively copies one TS x TS tile of A and one of B into shared memory,
// synchronises, and then each invocation consumes TS products from the tiles.
//
// M, N and K do not have to be multiples of TS: the tile count is rounded up,
// tile entries that fall outside the matrices are loaded as 0.0 (so they add
// nothing to the sum), and invocations outside C skip the final store. For
// such sizes the host must dispatch ceil(M/TS) x ceil(N/TS) work groups.
// No invocation returns early, because every invocation of the work group
// must reach each barrier() call.
void main() {
    uint M = MNK.x, N = MNK.y, K = MNK.z;

    // Thread identifiers
    uint row = gl_LocalInvocationID.x; // Local row ID (max: TS)
    uint col = gl_LocalInvocationID.y; // Local col ID (max: TS)
    uint globalRow = TS*gl_WorkGroupID.x + row; // Row ID of C (0..M)
    uint globalCol = TS*gl_WorkGroupID.y + col; // Col ID of C (0..N)

    // Edge work groups may extend past the matrix when M or N is not a
    // multiple of TS; remember whether this invocation maps to a real element.
    bool rowInRange = globalRow < M;
    bool colInRange = globalCol < N;

    // Initialise the accumulation register
    float acc = 0.0;

    // Number of tiles along K, rounded up so a trailing partial tile is not
    // dropped. Written without K + TS - 1u to avoid unsigned wrap-around.
    uint numTiles = K/TS + ((K % TS) != 0u ? 1u : 0u);
    for (uint t=0u; t < numTiles; t++) {

        // Load one tile of A and B into local memory, substituting 0.0 for
        // any element that lies outside the source matrix.
        uint tiledRow = TS*t + row;
        uint tiledCol = TS*t + col;
        Asub[col][row] = (rowInRange && tiledCol < K)
            ? A[tiledCol*M + globalRow] : 0.0;
        Bsub[col][row] = (colInRange && tiledRow < K)
            ? B[globalCol*K + tiledRow] : 0.0;

        // Synchronise to make sure the tile is loaded
        memoryBarrierShared();
        barrier();

        // Perform the computation for a single tile
        for (uint k=0u; k < TS; k++) {
            acc += Asub[k][row] * Bsub[col][k];
        }

        // Synchronise before loading the next tile, so no invocation
        // overwrites tile entries that another one is still reading
        barrier();
    }

    // Store the final result in C (only for invocations inside the matrix)
    if (rowInRange && colInRange) {
        C[globalCol*M + globalRow] = acc;
    }
}