#include <stdio.h>
#include "cachelab.h"

int is_transpose(int M, int N, int A[N][M], int B[M][N]);
void transpose_32x32(int M, int N, int A[N][M], int B[M][N]);
void transpose_64x64(int M, int N, int A[N][M], int B[M][N]);
void transpose_61x67(int M, int N, int A[N][M], int B[M][N]);

/*
* transpose_submit - This is the solution transpose function that you
* will be graded on for Part B of the assignment. Do not change
* the description string "Transpose submission", as the driver
* searches for that string to identify the transpose function to
* be graded.
*/
char transpose_submit_desc[] = "Transpose submission";

/*
 * Dispatch on the matrix dimensions.
 *
 * M is the number of columns of A (rows of B) and N the number of rows of
 * A (columns of B). The 32x32 and 64x64 shapes have dedicated kernels;
 * every other shape, 61x67 included, is handled by transpose_61x67, whose
 * loops are bounded by M and N rather than by fixed constants. Without
 * that fallback an unlisted shape would return with B never written.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    if (M == 32 && N == 32) {
        transpose_32x32(M, N, A, B);
    } else if (M == 64 && N == 64) {
        transpose_64x64(M, N, A, B);
    } else {
        transpose_61x67(M, N, A, B);
    }
}

/*
* You can define additional transpose functions below. We've defined
* a simple one below to help you get started.
*/
/*
 * transpose_32x32 - Transpose a 32x32 matrix A into B, one 8x8 tile at a
 * time. M and N are accepted for signature compatibility but the loop
 * bounds are the literal 32.
 *
 * For each row of a tile, all eight elements are loaded into locals before
 * any element of B is stored.
 * NOTE(review): presumably this keeps stores to B from displacing the A
 * row while it is still being read - confirm against the cache geometry.
 */
void transpose_32x32(int M, int N, int A[N][M], int B[M][N])
{
    for (int i = 0; i < 32; i += 8) {
        for (int j = 0; j < 32; j += 8) {
            for (int k = i; k < i + 8; k++) {
                /* Load row k of the tile: A[k][j .. j+7]. */
                int a_0 = A[k][j];
                int a_1 = A[k][j + 1];
                int a_2 = A[k][j + 2];
                int a_3 = A[k][j + 3];
                int a_4 = A[k][j + 4];
                int a_5 = A[k][j + 5];
                int a_6 = A[k][j + 6];
                int a_7 = A[k][j + 7];

                /* Store it as column k of the destination tile. */
                B[j][k] = a_0;
                B[j + 1][k] = a_1;
                B[j + 2][k] = a_2;
                B[j + 3][k] = a_3;
                B[j + 4][k] = a_4;
                B[j + 5][k] = a_5;
                B[j + 6][k] = a_6;
                B[j + 7][k] = a_7;
            }
        }
    }
}

/*
 * transpose_64x64 - Transpose a 64x64 matrix A into B in 8x8 tiles, each
 * tile processed as four 4x4 quadrants in three passes. M and N are
 * accepted for signature compatibility but the loop bounds are the
 * literal 64.
 *
 * For the tile whose source rows are i..i+7 and columns j..j+7:
 *   Pass 1: read the top four source rows in full. Their left halves go
 *           straight to their final place (B rows j..j+3, cols i..i+3);
 *           their right halves are parked, already transposed, in B rows
 *           j..j+3, cols i+4..i+7.
 *   Pass 2: for each of B rows j..j+3, pick up the parked values, overwrite
 *           them with the bottom-left source quadrant (their real
 *           contents), and move the parked values down to B rows j+4..j+7,
 *           cols i..i+3.
 *   Pass 3: transpose the bottom-right source quadrant directly into B
 *           rows j+4..j+7, cols i+4..i+7.
 *
 * The order of loads and stores inside each pass is deliberate; do not
 * reorder them without re-measuring.
 */
void transpose_64x64(int M, int N, int A[N][M], int B[M][N])
{
    int a_0, a_1, a_2, a_3, a_4, a_5, a_6, a_7;

    for (int i = 0; i < 64; i += 8) {
        for (int j = 0; j < 64; j += 8) {
            /* Pass 1: top four source rows of the tile. */
            for (int k = i; k < i + 4; k++) {
                a_0 = A[k][j + 0];
                a_1 = A[k][j + 1];
                a_2 = A[k][j + 2];
                a_3 = A[k][j + 3];
                a_4 = A[k][j + 4];
                a_5 = A[k][j + 5];
                a_6 = A[k][j + 6];
                a_7 = A[k][j + 7];

                /* Left half: final position. */
                B[j + 0][k] = a_0;
                B[j + 1][k] = a_1;
                B[j + 2][k] = a_2;
                B[j + 3][k] = a_3;

                /* Right half: parked four columns to the right. */
                B[j + 0][k + 4] = a_4;
                B[j + 1][k + 4] = a_5;
                B[j + 2][k + 4] = a_6;
                B[j + 3][k + 4] = a_7;
            }

            /* Pass 2: swap the parked quadrant with the bottom-left one. */
            for (int k = j; k < j + 4; k++) {
                /* Values parked in B row k during pass 1. */
                a_0 = B[k][i + 4];
                a_1 = B[k][i + 5];
                a_2 = B[k][i + 6];
                a_3 = B[k][i + 7];

                /* Column k of the bottom four source rows. */
                a_4 = A[i + 4][k];
                a_5 = A[i + 5][k];
                a_6 = A[i + 6][k];
                a_7 = A[i + 7][k];

                B[k][i + 4] = a_4;
                B[k][i + 5] = a_5;
                B[k][i + 6] = a_6;
                B[k][i + 7] = a_7;

                /* Parked values move to their final row, k + 4. */
                B[k + 4][i + 0] = a_0;
                B[k + 4][i + 1] = a_1;
                B[k + 4][i + 2] = a_2;
                B[k + 4][i + 3] = a_3;
            }

            /* Pass 3: bottom-right quadrant, transposed directly. */
            for (int k = i + 4; k < i + 8; k++) {
                a_4 = A[k][j + 4];
                a_5 = A[k][j + 5];
                a_6 = A[k][j + 6];
                a_7 = A[k][j + 7];

                B[j + 4][k] = a_4;
                B[j + 5][k] = a_5;
                B[j + 6][k] = a_6;
                B[j + 7][k] = a_7;
            }
        }
    }
}

/*
 * transpose_61x67 - Transpose the N-row, M-column matrix A into B using
 * 16x16 tiles. Tile edges are clamped to N and M, so the dimensions need
 * not be multiples of 16; nothing in the body is specific to 61x67.
 */
void transpose_61x67(int M, int N, int A[N][M], int B[M][N])
{
    for (int i = 0; i < N; i += 16) {
        for (int j = 0; j < M; j += 16) {
            for (int k = i; k < i + 16 && k < N; k++) {
                for (int s = j; s < j + 16 && s < M; s++) {
                    B[s][k] = A[k][s];
                }
            }
        }
    }
}

/*
* trans - A simple baseline transpose function, not optimized for the cache.
*/
char trans_desc[] = "Simple row-wise scan transpose";

/*
 * Walk A row by row (N rows of M columns) and write each element to the
 * mirrored position in B. No tiling is applied.
 */
void trans(int M, int N, int A[N][M], int B[M][N])
{
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            int tmp = A[i][j];
            B[j][i] = tmp;
        }
    }
}

/*
* registerFunctions - This function registers your transpose
* functions with the driver. At runtime, the driver will
* evaluate each of the registered functions and summarize their
* performance. This is a handy way to experiment with different
* transpose strategies.
*/
void registerFunctions(void)
{
    /* Register your solution function */
    registerTransFunction(transpose_submit, transpose_submit_desc);

    /* Register any additional transpose functions */
    registerTransFunction(trans, trans_desc);
}

/*
* is_transpose - This helper function checks if B is the transpose of
* A. You can check the correctness of your transpose by calling
* it before returning from the transpose function.
*/
/*
 * Returns 1 when B[j][i] == A[i][j] for every i in [0, N) and j in [0, M),
 * and 0 as soon as one mismatch is found.
 */
int is_transpose(int M, int N, int A[N][M], int B[M][N])
{
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; ++j) {
            if (A[i][j] != B[j][i]) {
                return 0;
            }
        }
    }
    return 1;
}
在这次实验中,我学到了很多有关缓存的知识,包括缓存映射、缓存行大小、缓存替换策略、预取和缓存对齐等方面。同时,我也意识到程序中一些微小的改变可能会对性能产生重大影响,例如重新排列数组访问的顺序、使用局部变量暂存数据等。cache 优化的效果在这个 lab 里可以说非常明显,我也明白了 cache 实际的样子:hash+hash,以及 LRU 的使用。当然还有一个很重要的思想:就是不断尝试,在一个方法的结果出来之前不能着急否定它;除非有确切的证明能说明它不会更好,否则每个新方法都应该琢磨一下,并且要模拟和分析它带来的影响,最好写出代码跑出结果。这个 lab 我最大的收获是获得了优化矩阵操作的能力哈哈,以前在做算法题的时候我最怕的就是矩阵题,因为我的空间想象能力不是很好,很多算法虽然知道,但就是很难想到矩阵和程序中访问下标对应的样子,这回在这个 lab 中我算是明白了:分析矩阵问题时先模拟+画图是一个很好的方法。这个实验的 Part A 让我对缓存的设计有了更深入的理解,其中替换策略也值得以后继续研究;Part B 为我展示了计算机之美,一个简简单单的转置函数,无论怎么写,时间复杂度都是 O(n²),然而因为缓存的问题,不同代码的性能竟然有着天壤之别。编写函数过程中,对 miss 的估量与计算很烧脑,但也很有趣。