1)块大小
思路:
假定块大小为8*4B(其中&为未命中,@为命中)
步长 | 命中情况 | |
1 | & @ @ @ | @ @ @ @ |
2 | & @ @ @ | & @ @ @ |
4 | & @ & @ | & @ & @ |
8 | & & & & | & & & & |
16 | & & & & | & & & & |
…… | …… | …… |
因为不命中时间远大于命中时间,平均每次访问的时间近似正比于不命中率,所以步长为2时的平均访问时间应约为步长为1时的2倍,步长为4时又约为步长为2时的2倍,以此类推;到步长为8时(注:假定每块为8个字),每次访问都不命中,平均访问时间达到最大;步长再增大时,平均访问时间就不会再增加,或者增幅很小。因此,平均访问时间停止翻倍时的步长即对应块大小,代码如下:
#include <stdio.h>
#include <time.h>

/* Number of ints in the probe array (1 GiB when int is 4 bytes). */
#define N (256 * 1024 * 1024)

int arr[N];

/*
 * Cache block-size probe.
 *
 * Writes one element every `step` ints across the whole array for
 * step = 1, 2, 4, ..., 512 and prints the average number of clock() ticks
 * per access. The average is expected to roughly double with the step
 * until the step reaches the block size, and to level off afterwards.
 */
int main(void)
{
    /*
     * Untimed warm-up pass: touch every element once so that the first
     * timed pass (step 1) is not inflated by first-touch costs on systems
     * that back zero-initialised storage lazily.
     */
    for (int i = 0; i < N; ++i) {
        arr[i] = 0;
    }

    int count = N; /* number of accesses performed at the current step */

    /* step is the stride in ints; the block size is assumed <= 512 ints */
    for (int step = 1; step <= 512; step <<= 1) {
        clock_t start = clock();
        for (int i = 0; i < N; i += step) { /* sweep all N ints for accuracy */
            arr[i] = 1;
        }
        clock_t finish = clock();

        printf("when the step is %3d, it takes %f clock numbers\n",
               step, (double)(finish - start) / count);

        count >>= 1; /* doubling the step halves the number of accesses */
    }
    return 0;
}
2)容量及层级数
思路:
不妨假定cache容量是1KB的整数倍,可能值为1KB,2KB,4KB……8MB(即代码中的length),由上一步可知块大小为64B,每块有16个字。每次测试保证只在1KB或者2KB或者4KB……大小的范围内访问。假设L1大小为16KB,当访问范围刚超出16KB时,访问时间会出现一个急剧的上升,因为这时L1 cache的容量已不能满足需要,必须借助L2;同理,访问时间每出现一次急剧上升就对应一级cache的容量,急剧上升的次数即反映cache的层级数。据此写如下代码:
#include <stdio.h>
#include <time.h>

#define PATH "/home/deropty/cache/cachesize.txt"
/* Maximum probe range in ints: 2*1024*1024*4B = 8MB when int is 4 bytes. */
#define N (2 * 1024 * 1024)

int arr[N];
int loop = (1 << 24); /* number of accesses timed for every range */

/*
 * Cache capacity / level probe.
 *
 * For every range `length` (in ints) from 256 up to N in steps of 256
 * (1KB steps with 4-byte int), performs `loop` increments that cycle
 * through the first `length` ints with a stride of 16 ints, and reports
 * the elapsed CPU time in milliseconds to stdout and to PATH.
 *
 * Returns 0 on success, 1 if the output file cannot be opened or closed.
 */
int main(void)
{
    FILE *fp = fopen(PATH, "w");
    if (fp == NULL) {
        perror(PATH);
        return 1;
    }

    int count = 1; /* current range expressed in units of 256 ints */
    for (int length = 256; length <= N; length += 256) {
        /*
         * Walk the range with a wrapping index. Masking with (length - 1)
         * only covers the whole range when length is a power of two, which
         * most of the 256-int steps are not.
         */
        int index = 0;

        clock_t start = clock();
        for (int i = 0; i < loop; ++i) {
            ++arr[index];
            index += 16; /* 16-int stride, one access per assumed 64B block */
            if (index >= length) {
                index = 0;
            }
        }
        clock_t finish = clock();

        /* Convert ticks to milliseconds without assuming the tick rate. */
        double ms = (double)(finish - start) * 1000.0 / CLOCKS_PER_SEC;

        printf("%4dk: when the array length is %7d, the cost time is %7.3fms\n",
               count, length, ms);
        fprintf(fp, "%4dk\t\t%7.3fms\n", count, ms);
        ++count;
    }

    if (fclose(fp) != 0) {
        perror(PATH);
        return 1;
    }
    return 0;
}
3)命中时间和缺失代价
由1)已知块大小为16字,所以让相邻两次访问之间间隔16个字,使每次访问都落在不同的块上而发生缺失,对大量缺失访问的时间取均值即可求出缺失代价。对于命中时间,可一直访问同一数据,对大量命中访问的时间取均值即可得出命中时间,代码如下:
#include <stdio.h>
#include <stdio.h>
#include <time.h>

/* Number of elements in the probe array; must be a power of two for the mask below. */
#define N (1024 * 1024)
/* Number of repetitions averaged for each measurement. */
#define ROUNDS 16

int loop = (1 << 25); /* accesses per timed round */

/*
 * Probe array. Kept at file scope so it is zero-initialised and does not
 * occupy the stack; unsigned so that the repeated "*= 3" wraps instead of
 * overflowing a signed int; volatile so every access is really performed.
 */
static volatile unsigned int arr[N];

/* Receives the index in the calibration loops so they are not removed. */
static volatile int sink;

/* Converts a tick difference (already as double) to microseconds. */
static double ticks_to_us(double ticks)
{
    return ticks * 1e6 / CLOCKS_PER_SEC;
}

/*
 * Hit-time / miss-penalty probe.
 *
 * Miss part: touches the array with a stride of 16 elements, times the
 * same loop without the array access, and prints the difference per
 * access. Hit part: does the same while always touching arr[0].
 * Each part is repeated ROUNDS times and the average is printed.
 */
int main(void)
{
    double sumtime = 0;

    for (int k = 0; k < ROUNDS; k++) {
        clock_t start = clock();
        for (int i = 0; i < loop; ++i) {
            int index = (i * 16) & (N - 1); /* intended to miss on every access */
            arr[index] *= 3;                /* the value itself is meaningless */
        }
        clock_t finish = clock();
        double duration = (double)(finish - start);

        /* Time the loop and index computation alone, to subtract it. */
        start = clock();
        for (int i = 0; i < loop; ++i) {
            sink = (i * 16) & (N - 1);
        }
        finish = clock();

        double tmp = ticks_to_us(duration - (double)(finish - start)) / loop;
        sumtime += tmp;
        printf("the miss time is %fus\n", tmp);
    }
    printf("\nThe average miss time is %fus\n", sumtime / ROUNDS);

    sumtime = 0;
    for (int k = 0; k < ROUNDS; k++) {
        arr[0] = 1;

        clock_t start = clock();
        for (int i = 0; i < loop; ++i) {
            arr[0] *= 3; /* always the same element */
        }
        clock_t finish = clock();
        double duration = (double)(finish - start);

        /* Time the bare loop, to subtract it. */
        start = clock();
        for (int i = 0; i < loop; ++i) {
            sink = i;
        }
        finish = clock();

        double tmp = ticks_to_us(duration - (double)(finish - start)) / loop;
        sumtime += tmp;
        printf("the hit time is %fus\n", tmp);
    }
    printf("\nThe average hit time is %fus\n", sumtime / ROUNDS);

    return 0;
}
4)求缓存的关联度
思路:
假设缓存共有16个块,则直接映射情况如下(“映射情况”各行为映射到对应缓存块的内存块编号):
块编号 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
块 | @ | @ | @ | @ | @ | @ | @ | @ | @ | @ | @ | @ | @ | @ | @ | @ |
映射情况 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
| 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
| 48 | 49 | 50 | 51 | 52 | … |
2路组相联映射情况:
块编号 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
块 | @ | @ | @ | @ | @ | @ | @ | @ |
| @ | @ | @ | @ | @ | @ | @ | @ |
映射情况 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
| 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
| 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
| 40 | 41 | 42 | … |
4路组相联映射情况:
块编号 | 0 | 1 | 2 | 3 |
块 | @ | @ | @ | @ |
| @ | @ | @ | @ |
| @ | @ | @ | @ |
| @ | @ | @ | @ |
映射情况 | 0 | 1 | 2 | 3 |
| 4 | 5 | 6 | 7 |
| 8 | 9 | 10 | 11 |
| 12 | 13 | 14 | 15 |
| 16 | 17 | 18 | 19 |
| 20 | 21 | 22 | 23 |
| 24 | 25 | 26 | 27 |
| 28 | 29 | 30 | 31 |
| 32 | 33 | 34 | 35 |
| 36 | 37 | … |
如上图所示,如果不停地交替访问编号为0和16的块,在直接映射中会产生冲突,但在2路、4路、8路……组相联中则不会冲突;如果不停地交替访问编号为0、16、32、48的块,在1路、2路中会产生冲突,但在4路、8路、16路……中则不会冲突,以此类推。因此我们让程序每次都访问可能导致冲突缺失的一组块:多次访问编号为0、16的块,计算平均访问时间;多次访问编号为0、16、32、48的块,计算平均访问时间;多次访问编号为0、16、32、48、64、80的块,计算平均访问时间……假定CPU的cache为4路组相联,则不停访问0、16、32、48不会出现缺失,而再增加为0、16、32、48、64时,就会不停地出现缺失,平均访问时间随之明显上升。据此思路,写出代码如下:
#include <stdio.h>
#include <time.h>

/* Size of the probe array in ints; only indices up to 15<<9<<4 are touched below. */
#define N (1<<8<<10<<10)

int arr[N];
int loop = (1<<16); /* repetitions per measurement, for a more accurate result */

/* Receives the index in the calibration loop so the loop is not removed. */
static volatile int sink;

/*
 * Cache associativity probe.
 *
 * For asso = 2, 4, ..., 16, repeatedly writes `asso` elements that are
 * 1<<9<<4 ints apart (512 blocks of 16 ints, the L1 geometry assumed by
 * the original author), subtracts the time of the same loop without the
 * write, and prints the average time per access in microseconds.
 */
int main(void)
{
    /* Volatile view of arr so every repeated store is actually issued. */
    volatile int *probe = arr;

    /* suppose that the associativity is in (2-16) */
    for (int asso = 2; asso <= 16; asso += 2) {
        clock_t start = clock();
        for (int k = 0; k < loop; ++k) {
            for (int i = 0; i < asso; ++i) {
                probe[i<<9<<4] = 0;
            }
        }
        clock_t finish = clock();
        double duration = (double)(finish - start);

        /* Time the same loops with the same index computation but no array access. */
        start = clock();
        for (int k = 0; k < loop; ++k) {
            for (int i = 0; i < asso; ++i) {
                sink = i<<9<<4;
            }
        }
        finish = clock();

        /* Convert ticks to microseconds without assuming the tick rate. */
        double us = (duration - (double)(finish - start)) * 1e6 / CLOCKS_PER_SEC;

        printf("%2dways associative's average time is %fus\n",
               asso, us / (loop*asso));
    }
    return 0;
}
上述代码还有很多问题,但现在作为一个阶段的总结,先贴出来,以后有时间再回来修改(估计没时间了,哈哈)