Part A:编写缓存模拟器
- b (block bits) 不需要模拟
- 使用LRU策略
- 使用verbose输出详细信息(hit, miss, eviction)便于调试
- `I` 指令:忽略
- `L`、`S` 指令:执行单步 `HitMissEviction`(缓存加载)
- `M` 指令:执行两步 `HitMissEviction`(缓存加载)
`getopt` 的使用方法:
`fscanf` 的用法
使用的库函数
#include "cachelab.h"
#include <errno.h> // errno
#include <getopt.h>
#include <inttypes.h> // SCNx64, PRIx64
#include <stdint.h> // uintN_t
#include <stdio.h> // fopen
#include <stdlib.h> // atol
#include <unistd.h> // getopt
#define false 0
#define true 1
数据结构
/*
 * Bookkeeping for a single cache line. The data block itself is not stored:
 * the simulator only has to count hits, misses and evictions.
 */
typedef struct line {
    _Bool valid;            /* nonzero once the line holds a block */
    uint64_t tag;           /* tag bits that identify the cached block */
    uint64_t time_counter;  /* LRU stamp: the smallest value is evicted first; starts at zero */
} line;

/* A set is addressed as an array of lines, a cache as an array of sets. */
typedef line *entry_of_lines;
typedef entry_of_lines *entry_of_sets;

/* Running totals reported by the simulator. */
typedef struct result {
    int hit;
    int miss;
    int eviction;
} result;
实现方法
/* Allocate a cache of S sets, each an array of E zero-initialized lines. */
entry_of_sets InitializeCache(uint64_t S, uint64_t E);
/* Look up `tag` among the E lines of one set (search_line) and return Result
 * with the hit/miss/eviction counters updated; prints the outcome when verbose. */
result HitMissEviction(entry_of_lines search_line, result Result, uint64_t E, uint64_t tag, _Bool verbose);
/* Replay every access in tracefile against the cache (s set-index bits,
 * b block-offset bits) and return the accumulated counters. */
result ReadAndTest(FILE *tracefile, entry_of_sets cache, uint64_t S, uint64_t E, uint64_t s, uint64_t b, _Bool verbose);
/* Free each of the S sets and then the set array itself. */
void RealseMemory(entry_of_sets cache, uint64_t S, uint64_t E);
主函数读取指令
/*
 * Body of main(): parse the command line into the simulator settings.
 *   -h            print usage and exit successfully
 *   -v            verbose per-access output
 *   -s <s>        number of set-index bits (S = 2^s sets)
 *   -E <E>        lines per set
 *   -b <b>        number of block-offset bits
 *   -t <file>     trace file to replay
 * Any invalid or missing option prints the usage text and exits with failure.
 */
result Result = {0, 0, 0};
const char *help_message =
    "Usage: \"Your compiled program\" [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n"
    "<s> <E> <b> should all above zero and below 64.\n"
    "Compiled with std=c99\n";
const char *command_options = "hvs:E:b:t:";
FILE *tracefile = NULL;
entry_of_sets cache = NULL;
_Bool verbose = false;
uint64_t s = 0;
uint64_t b = 0;
uint64_t S = 0;
uint64_t E = 0;
/* getopt() returns int; keeping it in a char makes the -1 comparison fail
 * forever on platforms where plain char is unsigned. */
int ch;
while((ch = getopt(argc, argv, command_options)) != -1){
    switch(ch){
    case 'h':
        printf("%s", help_message);
        exit(EXIT_SUCCESS);
    case 'v':
        verbose = true;
        break;
    case 's':
    case 'E':
    case 'b': {
        /* strtol reports garbage and overflow, which atol silently hides. */
        char *end = NULL;
        errno = 0;
        long value = strtol(optarg, &end, 10);
        /* s and b are used as shift counts on 64-bit values, so they must
         * stay below 64; E keeps the original "positive" check only. */
        if(end == optarg || *end != '\0' || errno == ERANGE || value <= 0
           || (ch != 'E' && value >= 64)){
            printf("%s", help_message);
            exit(EXIT_FAILURE);
        }
        if(ch == 's'){
            s = (uint64_t)value;
            S = (uint64_t)1 << s; /* 64-bit shift: `1 << s` overflows int for s >= 31 */
        }else if(ch == 'E'){
            E = (uint64_t)value;
        }else{
            b = (uint64_t)value;
        }
        break;
    }
    case 't':
        if(tracefile != NULL){
            fclose(tracefile); /* a repeated -t must not leak the earlier stream */
        }
        if((tracefile = fopen(optarg, "r")) == NULL){
            perror("Failed to open tracefile");
            exit(EXIT_FAILURE);
        }
        break;
    default:
        printf("%s", help_message);
        exit(EXIT_FAILURE);
    }
}
/* All of -s, -E, -b and -t are mandatory. */
if(s == 0 || b == 0 || E == 0 || tracefile == NULL){
    printf("%s", help_message);
    exit(EXIT_FAILURE);
}
调用函数
/* Build the cache, replay the trace, release everything, then report. */
cache = InitializeCache(S, E);
Result = ReadAndTest(tracefile, cache, S, E, s, b, verbose);
fclose(tracefile); // remember to close the file
RealseMemory(cache, S, E);
/* The field is spelled `eviction` in the result struct. */
printSummary(Result.hit, Result.miss, Result.eviction);
cache的内存分配与释放
entry_of_sets InitializeCache(uint64_t S, uint64_t E){
entry_of_sets cache;
if((cache = calloc(S, sizeof(entry_of_lines))) == NULL){
perror("Failed to calloc entry_of_sets");
exit(EXIT_FAILURE);
}
for(int i = 0; i < S; i++){
if((cache[i] = calloc(E, sizeof(line))) == NULL){
perror("Failed to calloc line in sets");
}
}
return cache;
}
/*
 * Release a cache: free each of the S per-set line arrays, then the array of
 * sets. E is accepted for symmetry with the other cache functions and is not
 * needed to free anything.
 */
void RealseMemory(entry_of_sets cache, uint64_t S, uint64_t E){
    (void)E;
    uint64_t remaining = S;
    while(remaining > 0){
        remaining--;
        free(cache[remaining]);
    }
    free(cache);
}
核心代码-ReadAndTest
/*
 * Replay a memory trace against the cache and count hits/misses/evictions.
 *
 * Each trace record is "<op> <hex address>..."; the rest of the line is
 * skipped. Reading stops at end of file or at the first record that does not
 * parse.
 *   I        ignored
 *   L, S     one cache access
 *   M        two accesses to the same address (load, then store)
 * Any other operation character is ignored as well.
 *
 * tracefile  open trace stream (not closed here)
 * cache      cache with S sets of E lines
 * s, b       number of set-index and block-offset bits; both must be < 64
 * verbose    when set, prints "<op> <address> " before each simulated record
 *
 * Returns the accumulated counters.
 */
result ReadAndTest(FILE *tracefile, entry_of_sets cache, uint64_t S, uint64_t E, uint64_t s, uint64_t b, _Bool verbose){
    result Result = {0, 0, 0};
    /* Shift a 64-bit one: `1 << s` is an int shift and overflows for s >= 31. */
    const uint64_t set_index_mask = ((uint64_t)1 << s) - 1;
    char op;
    uint64_t address;

    /* SCNx64/PRIx64 match uint64_t on every platform; "%lx" only does so
     * where unsigned long happens to be 64 bits wide. */
    while(fscanf(tracefile, " %c %" SCNx64 "%*[^\n]", &op, &address) == 2){
        if(op != 'L' && op != 'S' && op != 'M'){
            continue;
        }

        uint64_t set_index = (address >> b) & set_index_mask;
        uint64_t tag = (address >> b) >> s;
        if(set_index >= S){
            continue; /* only possible if S and s disagree; never index past the cache */
        }
        entry_of_lines search_line = cache[set_index];

        if(verbose) printf("%c %" PRIx64 " ", op, address);
        Result = HitMissEviction(search_line, Result, E, tag, verbose);
        if(op == 'M'){
            /* the store half of a modify touches the line just loaded */
            Result = HitMissEviction(search_line, Result, E, tag, verbose);
        }
    }
    return Result;
}
核心代码-HitMissEviction
/*
 * Simulate one access to a single cache set with LRU replacement.
 *
 * search_line  the E lines of the addressed set
 * Result       counters so far; an updated copy is returned
 * tag          tag of the accessed block
 * verbose      when set, prints "hit", "miss" or "miss and eviction" plus newline
 *
 * time_counter is a recency stamp: the touched line always receives
 * (largest stamp in the set) + 1, and the line with the smallest stamp is the
 * replacement victim. Lines that were never filled keep stamp 0, so they are
 * chosen before any filled line.
 */
result HitMissEviction(entry_of_lines search_line, result Result, uint64_t E, uint64_t tag, _Bool verbose){
    uint64_t newest_time = 0;
    uint64_t oldest_time = UINT64_MAX;
    uint64_t victim = UINT64_MAX;    /* UINT64_MAX means "no candidate line" */
    uint64_t hit_index = UINT64_MAX; /* UINT64_MAX means "tag not present" */

    /* One pass gathers the matching line, the newest stamp and the LRU line. */
    for(uint64_t i = 0; i < E; i++){
        if(hit_index == UINT64_MAX && search_line[i].valid && search_line[i].tag == tag){
            hit_index = i;
        }
        if(search_line[i].time_counter > newest_time){
            newest_time = search_line[i].time_counter;
        }
        if(search_line[i].time_counter < oldest_time){
            oldest_time = search_line[i].time_counter;
            victim = i;
        }
    }

    if(hit_index != UINT64_MAX){
        if(verbose) printf("hit\n");
        ++Result.hit;
        /* Make the hit line the most recent one. Merely incrementing its own
         * stamp can leave it tied with (or older than) other lines, which
         * then evicts a recently used block. */
        search_line[hit_index].time_counter = newest_time + 1;
        return Result;
    }

    if(verbose) printf("miss");
    ++Result.miss;

    if(victim == UINT64_MAX){
        /* E == 0: there is no line to fill, so nothing can be stored. */
        if(verbose) printf("\n");
        return Result;
    }

    search_line[victim].time_counter = newest_time + 1;
    search_line[victim].tag = tag;
    if(search_line[victim].valid){
        if(verbose) printf(" and eviction\n");
        ++Result.eviction;
    }else{
        if(verbose) printf("\n");
        search_line[victim].valid = 1;
    }
    return Result;
}
测试结果:
Part B:优化矩阵转置
- 1kb cache
- direct-mapped (E = 1)
- block size is 32 (b = 5)
- sets 32 (s = 5)
- 使用blocking优化技术:waside-blocking
M = N = 32
由于block大小为32 bytes,故一个block可以放8个int;cache一共有32个block,而矩阵一行(32个int)占4个block,所以把cache全部填满可以放8行。也就是说行号相差8的两行会映射到相同的set而发生冲突,因此使用blocking把每个块的大小设置为8×8。为了提高 `A`、`B` 数组的击中率,使用8个局部变量(没有超过题目要求的最多12个)缓存 `A` 数组中的值,这样每次内循环只有两次未击中。
未击中计算:
$$
\frac{32}{8}\times\frac{32}{8}\times 8\times 2 = 4\times 4\times 8\times 2 = 256
$$
/*
 * Transpose the N x M matrix A into the M x N matrix B, one 8 x 8 tile at a
 * time (version for M = N = 32).
 *
 * For each row of a tile, the eight source values are first copied into
 * locals and only then stored into B. Tiles are processed in full with no
 * bounds checks, so M and N must both be multiples of 8.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
    for(int tile_row = 0; tile_row < N; tile_row += 8){
        for(int tile_col = 0; tile_col < M; tile_col += 8){
            for(int row = tile_row; row < tile_row + 8; row++){
                /* load one 8-element row segment of A */
                int a0 = A[row][tile_col];
                int a1 = A[row][tile_col + 1];
                int a2 = A[row][tile_col + 2];
                int a3 = A[row][tile_col + 3];
                int a4 = A[row][tile_col + 4];
                int a5 = A[row][tile_col + 5];
                int a6 = A[row][tile_col + 6];
                int a7 = A[row][tile_col + 7];

                /* store it as one column segment of B */
                B[tile_col][row] = a0;
                B[tile_col + 1][row] = a1;
                B[tile_col + 2][row] = a2;
                B[tile_col + 3][row] = a3;
                B[tile_col + 4][row] = a4;
                B[tile_col + 5][row] = a5;
                B[tile_col + 6][row] = a6;
                B[tile_col + 7][row] = a7;
            }
        }
    }
}
测试结果:
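实际结果比理论值多了31个miss,原因应当是对角线上的冲突:对角线上的4个8×8块中,`A` 与 `B` 的同一行映射到同一个set,读 `A` 的第k行会逐出 `B` 的第k行,随后写 `B` 的第k行又会产生一次miss。除每块的第一行外,每行多出1次miss,共 4×7=28 次;剩下的3个miss推测来自转置循环之外的固定开销(函数调用等)。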
M = N = 64
此时,数组一行有64个int,即8个block,所以每四行就会填满cache,即两个元素相差4行就会发生冲突。如果使用和 M = N = 32 时一样的策略,使用4*4的blocking,每次都会有1/2的损失,则miss理论上为:
$$
\frac{64}{4}\times\frac{64}{4}\times 4\times 2 = 16\times 16\times 8 = 2048
$$
优化不够。如果使用刚刚8*8的blocking,则在写入时就会发生冲突:
$$
\frac{64}{8}\times\frac{64}{8}\times 8\times (2+1) = 8\times 8\times 8\times 3 = 1536
$$
依然优化不够。
此时可以使用“分治思想”,将8*8的块分为4个4*4的块:
使用如下方法,在转换3、4部分的同时,将2放入正确位置:
/*
 * Transpose the N x M matrix A into the M x N matrix B (version for
 * M = N = 64). Each 8 x 8 tile is handled as four 4 x 4 quadrants.
 *
 * Phase 1 reads the top four rows of the A tile. The left half of each row is
 * transposed straight into the top-left quadrant of the B tile; the right half
 * is parked, in reversed row order, in the top-right quadrant of the B tile.
 *
 * Phase 2 walks the parked rows from the last one to the first. For each it
 * reads one column of A's bottom-left and one of A's bottom-right quadrant,
 * moves the parked row down to its final place in B's bottom-left quadrant,
 * and then writes the two columns just read into B's top-right and
 * bottom-right quadrants.
 *
 * Tiles are processed in full with no bounds checks, so M and N must both be
 * multiples of 8.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
    for(int tile_row = 0; tile_row < N; tile_row += 8){
        for(int tile_col = 0; tile_col < M; tile_col += 8){
            /* Phase 1: top half of the A tile */
            for(int row = tile_row; row < tile_row + 4; row++){
                int a0 = A[row][tile_col];
                int a1 = A[row][tile_col + 1];
                int a2 = A[row][tile_col + 2];
                int a3 = A[row][tile_col + 3];
                int a4 = A[row][tile_col + 4];
                int a5 = A[row][tile_col + 5];
                int a6 = A[row][tile_col + 6];
                int a7 = A[row][tile_col + 7];

                B[tile_col][row] = a0;
                B[tile_col + 1][row] = a1;
                B[tile_col + 2][row] = a2;
                B[tile_col + 3][row] = a3;
                /* parked in reverse order: B row r holds A column 7 - r */
                B[tile_col][row + 4] = a7;
                B[tile_col + 1][row + 4] = a6;
                B[tile_col + 2][row + 4] = a5;
                B[tile_col + 3][row + 4] = a4;
            }

            /* Phase 2: B row (tile_col + step) holds the parked data that
             * belongs in B row (tile_col + 7 - step). */
            for(int step = 3; step >= 0; step--){
                /* read A's bottom half column-wise */
                int left0 = A[tile_row + 4][tile_col + step];
                int left1 = A[tile_row + 5][tile_col + step];
                int left2 = A[tile_row + 6][tile_col + step];
                int left3 = A[tile_row + 7][tile_col + step];
                int right0 = A[tile_row + 4][tile_col + 7 - step];
                int right1 = A[tile_row + 5][tile_col + 7 - step];
                int right2 = A[tile_row + 6][tile_col + 7 - step];
                int right3 = A[tile_row + 7][tile_col + 7 - step];

                /* move the parked row to its final position */
                B[tile_col + 7 - step][tile_row] = B[tile_col + step][tile_row + 4];
                B[tile_col + 7 - step][tile_row + 1] = B[tile_col + step][tile_row + 5];
                B[tile_col + 7 - step][tile_row + 2] = B[tile_col + step][tile_row + 6];
                B[tile_col + 7 - step][tile_row + 3] = B[tile_col + step][tile_row + 7];

                /* drop the freshly read columns into place */
                B[tile_col + step][tile_row + 4] = left0;
                B[tile_col + step][tile_row + 5] = left1;
                B[tile_col + step][tile_row + 6] = left2;
                B[tile_col + step][tile_row + 7] = left3;
                B[tile_col + 7 - step][tile_row + 4] = right0;
                B[tile_col + 7 - step][tile_row + 5] = right1;
                B[tile_col + 7 - step][tile_row + 6] = right2;
                B[tile_col + 7 - step][tile_row + 7] = right3;
            }
        }
    }
}
测试结果:
M = 61 N = 67
此题比较宽松,使用16*16的blocking,仅处理了对角线上的冲突。
/*
 * Transpose the N x M matrix A into the M x N matrix B using 16 x 16 tiles
 * (version for M = 61, N = 67). Tiles at the right and bottom edges are
 * clipped to the matrix bounds, so any dimensions are accepted.
 *
 * A diagonal element (row == col) is not stored immediately: its value is
 * held in a local and written to B only after the rest of that row segment
 * has been copied.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
    for(int tile_row = 0; tile_row < N; tile_row += 16){
        for(int tile_col = 0; tile_col < M; tile_col += 16){
            for(int row = tile_row; row < N && row < tile_row + 16; row++){
                int diag_index = -1; /* -1: no diagonal element in this segment */
                int diag_value = 0;

                for(int col = tile_col; col < M && col < tile_col + 16; col++){
                    if(row != col){
                        B[col][row] = A[row][col];
                        continue;
                    }
                    /* defer the diagonal store until the segment is done */
                    diag_index = row;
                    diag_value = A[row][row];
                }

                if(diag_index >= 0){
                    B[diag_index][diag_index] = diag_value;
                }
            }
        }
    }
}
测试结果:
最终满分结果: