实验梗概
这个实验分为Part A和Part B。在Part A里,你要编写一个cache模拟器。在Part B里,你要编写矩阵转置函数。资料:
CMU2015年的cache lab讲义
大佬的lab总结((;´д`)ゞ我太菜了,都是借鉴这位大佬的总结)
# Part A
编写cache模拟器:思路:
要使用getopt函数解析命令行: 上面的讲义有写:跳过
上面贴的cache lab讲义里有一些hint,根据这个,就可以大致写出来
定义cache_line的结构,其中valid_bits是有效位,tag是标记位,stamp是时间戳
/* One line of a cache set. */
typedef struct {
    int valid_bits;    /* 0 while the line is empty, non-zero once it holds a block */
    unsigned int tag;  /* tag bits of the address stored in this line */
    int stamp;         /* time stamp of the line */
} cache_line;
定义一个cache[S][E]大小的二维数组(using malloc). 这样cache就模拟好了
下面是cache运行模拟:读入要访问的地址,再在cache里查找是否存在。
handout里有讲I是不用管的。所以现在focus on其他操作
operator | what | do what |
---|---|---|
I | instruction load | |
L | data load | 读access cache |
S | data store | 写access cache |
M | data modify | 又读又写 access cache两次 |
所以M直接fall through
/*
 * Replay the trace one record at a time. Each record is
 * "<op> <hex address>,<size>"; the loop stops at end of file or at the first
 * record that does not parse completely (all three fields must convert,
 * otherwise `address` would still hold the previous record's value).
 */
while(fscanf(file," %c %x,%d",&operation,&address,&size)==3){
    switch(operation){
    case 'L':               /* data load: one cache access */
        update(address);
        break;
    case 'M':               /* data modify: a load followed by a store */
        update(address);
        /* fallthrough */
    case 'S':               /* data store: one cache access */
        update(address);
        break;
    default:                /* e.g. 'I' (instruction load): not simulated */
        break;
    }
    time();                 /* advance the stamps once per trace record */
}
在handout里,block offset是不用管的,所以当得到一个地址时,按照上图的方式得到tag和set就行(分别对应了t_address和s_address)
然后就是正常的cache访问了:如果命中(hit),hit次数加一;如果cache里没有,就要从下一层存储中取出,这就是miss;如果此时该组中的每一行都已被占用,就还要驱逐(eviction)其中一行。
void update(unsigned address){
unsigned s_address =(address>>b) & ((0xffffffff)>>(32-s));//set`s index
unsigned t_address = address>>(s+b); //tag`s index
for(int i=0;i<E;i++){
if((*(cache+s_address)+i)->tag ==t_address){
cache[s_address][i].stamp = 0; //now ,this is used
hit++;
return;
}
}
for(int i=0;i<E;i++){
if(cache[s_address][i].valid_bits == 0){
cache[s_address][i].tag = t_address;
cache[s_address][i].valid_bits = 1;
cache[s_address][i].stamp = 0; //now ,this is load
miss++;
return;
}
}
int max_stamp=0;
int max_i;
for(int i=0;i<E;i++){
if(cache[s_address][i].stamp >max_stamp){
max_stamp = cache[s_address][i].stamp;
max_i = i;
}
}
eviction++;
miss++;
cache[s_address][max_i].tag = t_address;
cache[s_address][max_i].stamp = 0;
}
最后说说时间戳:
每次访问一次cache,都会执行time函数,也就是把所有有效行的stamp加一。如果当前访问命中(或者新装入一行),则该行的stamp置为0,代表刚被使用。因此stamp最大的行就是最久没有被使用的行,需要驱逐时就替换它(LRU)。
完整代码:
#include "cachelab.h"
#include <getopt.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <stddef.h>
/* One line of a cache set. */
typedef struct {
    int valid_bits;    /* 0 while the line is empty, 1 once it holds a block */
    unsigned int tag;  /* tag bits of the address stored in this line */
    int stamp;         /* age of the line: reset to 0 on use, incremented by time() */
} cache_line;
/* Path of the trace file (value of the -t option). */
char *filepath = NULL;

/* Cache geometry. */
int s;  /* number of set-index bits */
int E;  /* number of lines per set */
int b;  /* number of block-offset bits (dropped when indexing the cache) */
int S;  /* number of sets, computed as 1 << s */

/* Counters reported by printSummary(). */
int hit = 0;
int miss = 0;
int eviction = 0;

/* The simulated cache: S rows of E lines each, allocated by init(). */
cache_line **cache = NULL;
/*
 * Allocate the simulated cache as S rows of E lines and mark every line
 * empty (valid_bits 0, tag 0xffffffff, stamp 0).
 *
 * Reads the globals S and E and sets the global `cache`. If an allocation
 * fails, a message is printed to stderr and the process exits, because the
 * simulator cannot run without its cache.
 */
void init(){
    cache = malloc(sizeof *cache * (size_t)S);
    if(cache == NULL){
        fprintf(stderr,"init: out of memory\n");
        exit(-1);
    }
    for(int i=0;i<S;i++){
        cache[i] = malloc(sizeof **cache * (size_t)E);
        if(cache[i] == NULL){
            fprintf(stderr,"init: out of memory\n");
            exit(-1);
        }
        for(int j=0;j<E;j++){
            cache[i][j].valid_bits = 0;     /* line is empty */
            cache[i][j].tag = 0xffffffff;   /* placeholder tag for an empty line */
            cache[i][j].stamp = 0;          /* age starts at 0 */
        }
    }
}
void update(unsigned address){
unsigned s_address =(address>>b) & ((0xffffffff)>>(32-s)); //set`s index
unsigned t_address = address>>(s+b); //tag`s index
for(int i=0;i<E;i++){
if((*(cache+s_address)+i)->tag ==t_address){
cache[s_address][i].stamp = 0; //now ,this is used
hit++;
return;
}
}
for(int i=0;i<E;i++){
if(cache[s_address][i].valid_bits == 0){
cache[s_address][i].tag = t_address;
cache[s_address][i].valid_bits = 1;
cache[s_address][i].stamp = 0; //now ,this is load
miss++;
return;
}
}
int max_stamp=0;
int max_i;
for(int i=0;i<E;i++){
if(cache[s_address][i].stamp >max_stamp){
max_stamp = cache[s_address][i].stamp;
max_i = i;
}
}
eviction++;
miss++;
cache[s_address][max_i].tag = t_address;
cache[s_address][max_i].stamp = 0;
}
void time(){
for(int i=0;i<S;i++){
for(int j=0;j<E;j++){
if(cache[i][j].valid_bits == 1)
cache[i][j].stamp++;
}
}
}
/*
 * Entry point of the cache simulator.
 *
 * Parses -s (set-index bits), -E (lines per set), -b (block-offset bits) and
 * -t (trace file), builds the cache, replays the trace and reports the
 * hit/miss/eviction counters through printSummary().
 *
 * Exits with status -1 when the options are missing or out of range, or when
 * the trace file cannot be opened.
 */
int main(int argc,char *argv[])
{
    int opt;
    while((opt = getopt(argc,argv,"s:E:b:t:")) != -1){ /* parse command line arguments */
        switch(opt){
        case 's':
            s = atoi(optarg);
            break;
        case 'E':
            E = atoi(optarg);
            break;
        case 'b':
            b = atoi(optarg);
            break;
        case 't':
            filepath = optarg;
            break;
        default:    /* unrecognised option: ignored */
            break;
        }
    }

    /* Reject configurations the simulator cannot handle: no trace file
       (fopen(NULL) is undefined), an empty set, or a bit split that does not
       fit the 32-bit address taken by update() (it would shift by >= 32). */
    if(filepath == NULL || s < 0 || s > 30 || b < 0 || b > 31 - s || E <= 0){
        fprintf(stderr,"Usage: %s -s <s> -E <E> -b <b> -t <tracefile>\n",argv[0]);
        exit(-1);
    }
    S = 1<<s;

    FILE *file = fopen(filepath,"r");
    if(file == NULL){ /* trace file could not be opened */
        printf("Open file wrong");
        exit(-1);
    }
    init();

    char operation;
    /* Trace addresses can be wider than 32 bits, so the whole value is read
       and then narrowed explicitly to the unsigned taken by update();
       scanning it with %x into an unsigned would overflow the conversion. */
    unsigned long long address;
    int size;
    /* All three fields must convert; a partial match would replay a stale
       address. */
    while(fscanf(file," %c %llx,%d",&operation,&address,&size) == 3){
        switch(operation){
        case 'L':               /* data load: one access */
            update((unsigned)address);
            break;
        case 'M':               /* data modify: a load followed by a store */
            update((unsigned)address);
            /* fallthrough */
        case 'S':               /* data store: one access */
            update((unsigned)address);
            break;
        default:                /* e.g. 'I' (instruction load): not simulated */
            break;
        }
        time();                 /* advance the stamps once per trace record */
    }

    for(int i=0;i<S;i++)        /* free cache[S][E] */
        free(cache[i]);
    free(cache);
    fclose(file);
    printSummary(hit,miss,eviction);
    return 0;
}
# Part B
要求
编写矩阵转置函数,要求cache miss最小,分别会用这三种矩阵来测试:
$32 \times 32$,cache miss $< 600$
$64 \times 64$,cache miss $< 2000$
$61 \times 67$,cache miss $< 3000$
测试的cache是 $2^{5}$ 组,每组一行,block size为 $2^{5}$ bytes
强烈要看上面讲义:讲义里讲的就是这个Part的做法.
我的做法
简单来说:就是分块。
我没有分析,想看分析的可以看我上面贴的大佬的分析.(因为我也不会>﹏<)
32*32
分为8*8块。
char transpose1[] = "Transpose 1";
/*
 * Transpose the 32x32 matrix A into B, working on 8x8 tiles.
 *
 * M and N only give the shapes of the array parameters; the loop bounds are
 * fixed at 32. For every row of a tile, the eight source values are read
 * into locals before any of them is stored into B.
 */
void transpose11(int M, int N, int A[N][M], int B[M][N])
{
    int rb, cb;     /* top-left corner of the current tile */
    int r;          /* row of A inside the tile */
    int t0, t1, t2, t3, t4, t5, t6, t7;

    for (rb = 0; rb < 32; rb += 8) {
        for (cb = 0; cb < 32; cb += 8) {
            r = rb;
            while (r < rb + 8) {
                /* load one 8-element row segment of A */
                t0 = A[r][cb];
                t1 = A[r][cb + 1];
                t2 = A[r][cb + 2];
                t3 = A[r][cb + 3];
                t4 = A[r][cb + 4];
                t5 = A[r][cb + 5];
                t6 = A[r][cb + 6];
                t7 = A[r][cb + 7];

                /* store it as a column segment of B */
                B[cb][r] = t0;
                B[cb + 1][r] = t1;
                B[cb + 2][r] = t2;
                B[cb + 3][r] = t3;
                B[cb + 4][r] = t4;
                B[cb + 5][r] = t5;
                B[cb + 6][r] = t6;
                B[cb + 7][r] = t7;

                r++;
            }
        }
    }
}
64*64
分为4*4块
char transpose2[]="Transpose 2";
/*
 * Transpose the 64x64 matrix A into B, working on 4x4 tiles.
 *
 * M and N only give the shapes of the array parameters; the loop bounds are
 * fixed at 64. Inside a tile, two rows of A (4 + 4 values) are read into
 * locals and then written out as two column segments of B.
 */
void transpose22(int M,int N,int A[N][M],int B[M][N])
{
    int rb, cb;             /* top-left corner of the current tile */
    int r;                  /* first row of the current row pair */
    int u0, u1, u2, u3;     /* values from row r */
    int v0, v1, v2, v3;     /* values from row r + 1 */

    for (rb = 0; rb < 64; rb += 4) {
        for (cb = 0; cb < 64; cb += 4) {
            for (r = rb; r < rb + 4; r += 2) {
                /* read two 4-element row segments of A */
                u0 = A[r][cb];
                u1 = A[r][cb + 1];
                u2 = A[r][cb + 2];
                u3 = A[r][cb + 3];
                v0 = A[r + 1][cb];
                v1 = A[r + 1][cb + 1];
                v2 = A[r + 1][cb + 2];
                v3 = A[r + 1][cb + 3];

                /* write them as columns r and r + 1 of B */
                B[cb][r] = u0;
                B[cb + 1][r] = u1;
                B[cb + 2][r] = u2;
                B[cb + 3][r] = u3;
                B[cb][r + 1] = v0;
                B[cb + 1][r + 1] = v1;
                B[cb + 2][r + 1] = v2;
                B[cb + 3][r + 1] = v3;
            }
        }
    }
}
61*67
分成16*16块
char transpose3[]="Transpose 3";
/*
 * Transpose the N x M matrix A into the M x N matrix B.
 *
 * The part of A covered by whole 16x16 tiles (tiles are taken while
 * start + 16 is strictly below the dimension) is copied tile by tile: each
 * row of a tile is moved in two 8-element halves, every half being read
 * into locals before it is stored. The rows and columns left over are then
 * copied one element at a time.
 */
void transpose33(int M,int N,int A[N][M],int B[M][N])
{
    int rb = 0;     /* first row of the current tile; after the loop, first leftover row */
    int cb = 0;     /* first column of the current tile; after the loop, first leftover column */
    int r, c;
    int t0, t1, t2, t3, t4, t5, t6, t7;

    while (rb + 16 < N) {
        for (cb = 0; cb + 16 < M; cb += 16) {
            for (r = rb; r < rb + 16; r++) {
                /* c walks the two 8-column halves of this tile row */
                for (c = cb; c < cb + 16; c += 8) {
                    t0 = A[r][c];
                    t1 = A[r][c + 1];
                    t2 = A[r][c + 2];
                    t3 = A[r][c + 3];
                    t4 = A[r][c + 4];
                    t5 = A[r][c + 5];
                    t6 = A[r][c + 6];
                    t7 = A[r][c + 7];
                    B[c][r] = t0;
                    B[c + 1][r] = t1;
                    B[c + 2][r] = t2;
                    B[c + 3][r] = t3;
                    B[c + 4][r] = t4;
                    B[c + 5][r] = t5;
                    B[c + 6][r] = t6;
                    B[c + 7][r] = t7;
                }
            }
        }
        rb += 16;
    }

    /* leftover rows: every column */
    for (r = rb; r < N; r++) {
        for (c = 0; c < M; c++) {
            B[c][r] = A[r][c];
        }
    }

    /* leftover columns of the rows already handled tile-wise */
    for (r = 0; r < rb; r++) {
        for (c = cb; c < M; c++) {
            B[c][r] = A[r][c];
        }
    }
}
最后提交的函数:
把上面三个函数总结在一起即可。
char transpose_submit_desc[] = "Transpose submission";
/*
 * Transpose the N x M matrix A into the M x N matrix B, choosing the routine
 * by matrix size:
 *   32 x 32            -> transpose11
 *   64 x 64            -> transpose22
 *   M == 61, N == 67   -> transpose33
 * Any other size is transposed element by element, so B is always filled.
 *
 * Both dimensions are checked because transpose11 and transpose22 use fixed
 * loop bounds and are only valid for their exact square size.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int i, j;

    if(M==32 && N==32){
        transpose11(M,N,A,B);
    }else if(M==64 && N==64){
        transpose22(M,N,A,B);
    }else if(M==61 && N==67){
        transpose33(M,N,A,B);
    }else{
        for(i=0;i<N;i++){
            for(j=0;j<M;j++){
                B[j][i] = A[i][j];
            }
        }
    }
}