// 作者:MaskRay
// 链接:https://www.zhihu.com/question/36628908/answer/68364663
// 来源:知乎
// 著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
// Edge length of the square matrices handled by every routine in this file.
const int N = 10000;

// Block-size threshold read by the recursive and tiled transposes;
// main assigns it before each timed run.
int M;

// FOR(var, lo, hi): loop header that declares an int `var` and steps it
// through lo, lo+1, ..., hi-1.
#define FOR(var, lo, hi) for (int var = (lo); var < (hi); var++)

// REP(var, count): shorthand for FOR(var, 0, count).
#define REP(var, count) FOR(var, 0, count)
// Straightforward out-of-place transpose: for every row and column index in
// [0, N) the element a[row][col] is stored at b[col][row].
// Rows of `a` are read left to right, so `b` is written one column at a time.
void transpose(int a[][N], int b[][N])
{
    for (int row = 0; row < N; ++row) {
        const int *src = a[row];
        for (int col = 0; col < N; ++col) {
            b[col][row] = src[col];
        }
    }
}
// Divide-and-conquer out-of-place transpose of the sub-block
// rows [i0, i1) x columns [j0, j1) of `a` into `b`: every visited a[i][j]
// is stored at b[j][i].
//
// The block is halved along its rows while it has more than M rows, then
// along its columns while it has more than M columns. Once both extents are
// at most M the block is copied with a plain double loop. An empty range in
// either dimension copies nothing.
//
// The global M is read on every call. Values below 1 are treated as 1:
// with a threshold of 0 or less a one-row (or one-column) block would be
// "split" into an empty half and an identical copy of itself, and the
// recursion would never terminate.
void recursive_transpose(int a[][N], int b[][N], int i0, int j0, int i1, int j1)
{
    const int rows = i1 - i0;
    const int cols = j1 - j0;
    const int limit = M > 0 ? M : 1;

    if (rows > limit) {
        // rows >= 2 here, so both halves are strictly smaller than the block.
        const int i_mid = i0 + rows / 2;
        recursive_transpose(a, b, i0, j0, i_mid, j1);
        recursive_transpose(a, b, i_mid, j0, i1, j1);
    } else if (cols > limit) {
        // cols >= 2 here, so both halves are strictly smaller than the block.
        const int j_mid = j0 + cols / 2;
        recursive_transpose(a, b, i0, j0, i1, j_mid);
        recursive_transpose(a, b, i0, j_mid, i1, j1);
    } else {
        // Base case: the block is at most limit x limit.
        for (int i = i0; i < i1; i++) {
            for (int j = j0; j < j1; j++) {
                b[j][i] = a[i][j];
            }
        }
    }
}
// Tiled out-of-place transpose: the N x N index space is walked in square
// tiles whose edge is the global M, and inside each tile every a[ii][jj] is
// stored at b[jj][ii]. Tiles on the right and bottom edges are clipped to N.
//
// Values of M below 1 are treated as 1; a step of 0 or less would keep the
// outer loops from ever advancing.
void loop_tiling_transpose(int a[][N], int b[][N])
{
    const int tile = M > 0 ? M : 1;

    for (int i = 0; i < N; i += tile) {
        // Clip the tile to the matrix; comparing N - i avoids forming
        // i + tile when that sum would exceed N.
        const int i_end = (N - i < tile) ? N : i + tile;
        for (int j = 0; j < N; j += tile) {
            const int j_end = (N - j < tile) ? N : j + tile;
            for (int ii = i; ii < i_end; ii++) {
                for (int jj = j; jj < j_end; jj++) {
                    // Index with the running ii/jj, not the tile origin
                    // i/j, so that every element of the tile is moved.
                    b[jj][ii] = a[ii][jj];
                }
            }
        }
    }
}
// Source matrix; main fills it with rand() values.
int A[N][N];
// Destination matrix that the transpose routines write into.
int B[N][N];
// Processor time consumed since `since`, in seconds.
static double elapsed_seconds(clock_t since)
{
    return (double)(clock() - since) / CLOCKS_PER_SEC;
}

// Benchmark driver: fills A with rand() values, times the simple transpose,
// then for each block size 200, 300, ..., 900 times the recursive and the
// tiled transpose and prints one line per measurement.
int main()
{
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            A[row][col] = rand();
        }
    }

    // Untimed warm-up pass before the measured run.
    transpose(A, B);

    clock_t start = clock();
    transpose(A, B);
    double secs = elapsed_seconds(start);
    printf("simple, time: %lf\n", secs);

    for (int size = 200; size <= 900; size += 100) {
        M = size;

        start = clock();
        recursive_transpose(A, B, 0, 0, N, N);
        secs = elapsed_seconds(start);
        printf("recursive, M = %d, time: %lf\n", M, secs);

        start = clock();
        loop_tiling_transpose(A, B);
        secs = elapsed_seconds(start);
        printf("loop tiling, M = %d, time: %lf\n", M, secs);
    }
}