// Source: https://www.ibiblio.org/e-notes/webgl/gpu/mul/sgemm2.htm
const CSs = `#version 310 es
// Tiled matrix-multiply compute shader: interface declarations.
//
// TS is the tile edge length. It is also the work-group size along x and y,
// so one work group has TS*TS invocations.
// NOTE(review): 32*32 = 1024 invocations per group - confirm that the target
// device's MAX_COMPUTE_WORK_GROUP_INVOCATIONS limit allows this.
#define TS 32u
layout (local_size_x = TS, local_size_y = TS, local_size_z = 1) in;
// Input matrix A (binding 0); main() reads it as A[k*M + row].
layout (std430, binding = 0) readonly buffer ssbA {
float A[];
};
// Input matrix B (binding 1); main() reads it as B[col*K + k].
layout (std430, binding = 1) readonly buffer ssbB {
float B[];
};
// Output matrix C (binding 2); main() writes it as C[col*M + row].
layout (std430, binding = 2) writeonly buffer ssbC {
float C[];
};
// Matrix dimensions packed as (M, N, K); main() unpacks x, y, z in that order.
uniform uvec3 MNK;
// Per-work-group staging tiles, TS*TS floats each, refilled on every
// iteration of the tile loop in main().
shared float Asub[TS][TS]; // Shared-memory tile of A
shared float Bsub[TS][TS]; // Shared-memory tile of B
// Computes C = A * B, where A is M x K, B is K x N and C is M x N, all
// stored column by column (element (r, c) of an R-row matrix is at c*R + r).
//
// Each work group produces one TS x TS tile of C and each invocation
// produces one element of that tile. The K dimension is walked in TS-wide
// steps; on every step the work group stages one tile of A and one tile of B
// in shared memory so each loaded value is reused by TS invocations.
//
// M, N and K do not have to be multiples of TS: loads that fall outside a
// matrix are replaced by 0.0, which contributes nothing to the sum, and
// invocations that fall outside C skip the final store. To cover all of C
// the host has to dispatch at least ceil(M/TS) x ceil(N/TS) work groups.
void main() {
    uint M = MNK.x, N = MNK.y, K = MNK.z;

    // Thread identifiers
    uint row = gl_LocalInvocationID.x;           // Local row ID (0..TS-1)
    uint col = gl_LocalInvocationID.y;           // Local col ID (0..TS-1)
    uint globalRow = TS*gl_WorkGroupID.x + row;  // Row ID of C
    uint globalCol = TS*gl_WorkGroupID.y + col;  // Col ID of C

    // Invocations in a partial edge tile may lie outside C. They must still
    // run the whole loop because barrier() has to be reached by every
    // invocation of the work group, so they are masked instead of returning.
    bool rowInRange = globalRow < M;
    bool colInRange = globalCol < N;

    // Initialise the accumulation register
    float acc = 0.0;

    // Loop over all tiles; round up so that a trailing partial tile
    // (K not a multiple of TS) is processed instead of being dropped.
    uint numTiles = (K + TS - 1u)/TS;
    for (uint t = 0u; t < numTiles; t++) {
        // Position of this invocation's element along K for this tile
        uint tiledRow = TS*t + row;
        uint tiledCol = TS*t + col;

        // Load one tile of A and B into shared memory, zero-padding
        // anything that lies outside the matrices.
        float aVal = 0.0;
        float bVal = 0.0;
        if (rowInRange && tiledCol < K) {
            aVal = A[tiledCol*M + globalRow];
        }
        if (colInRange && tiledRow < K) {
            bVal = B[globalCol*K + tiledRow];
        }
        Asub[col][row] = aVal;
        Bsub[col][row] = bVal;

        // Synchronise to make sure the whole tile is loaded and visible
        memoryBarrierShared();
        barrier();

        // Perform the computation for a single tile
        for (uint k = 0u; k < TS; k++) {
            acc += Asub[k][row] * Bsub[col][k];
        }

        // Synchronise before the next iteration overwrites the tiles
        barrier();
    }

    // Store the final result in C
    if (rowInRange && colInRange) {
        C[globalCol*M + globalRow] = acc;
    }
}