CSAPP3.0-CacheLab

本文档描述了一个缓存模拟器的实现,采用LRU策略,并提供了详细的命中、未命中和驱逐计数。此外,还讨论了针对不同矩阵大小的转置优化策略,包括使用blocking技术减少缓存未命中。测试结果展示了优化效果。
摘要由CSDN通过智能技术生成

Part A:编写缓存模拟器

  • b (block bits) 不需要模拟
  • 使用LRU策略
  • 使用verbose输出详细信息(hit, miss, eviction)便于调试
  • I指令忽略
  • LS指令执行单步HitMissEviction(缓存加载)
  • M指令执行两步HitMissEviction(缓存加载)

getopt的使用方法:

请添加图片描述
请添加图片描述
请添加图片描述
fscanf的用法

请添加图片描述
请添加图片描述

使用的库函数

#include "cachelab.h"
#include <stdio.h>			// fopen
#include <stdint.h>			// uintN_t
#include <inttypes.h>		// PRIx64, SCNx64
#include <unistd.h>			// getopt
#include <getopt.h>
#include <stdlib.h>			// atol
#include <errno.h>			// errno

#define false 0
#define true 1

数据结构

/* Bookkeeping for a single cache line. Only metadata is kept; the cached
 * bytes themselves are never stored. */
typedef struct{
	_Bool valid;						// set once the line holds a block; lines start out invalid
	uint64_t tag;						// tag bits of the address, used to identify the block within its set
	uint64_t time_counter;	// LRU stamp: the line with the smallest value is the eviction victim; starts at zero
	// The block contents are not simulated because only hit/miss/eviction counts are required
} line;
typedef line *entry_of_lines;			// one set: pointer to an array of lines
typedef entry_of_lines *entry_of_sets;	// the cache: pointer to an array of sets

/* Running totals produced by the simulation. */
typedef struct{
	int hit;		// number of accesses that found their block in the cache
	int miss;		// number of accesses that did not find their block
	int eviction;	// number of misses that replaced a valid line
}result;

实现方法

// Allocate a cache made of S sets with E lines each.
entry_of_sets InitializeCache(uint64_t S, uint64_t E);

// Simulate one access to the set `search_line` for `tag` and return the updated counters.
result HitMissEviction(entry_of_lines search_line, result Result, uint64_t E, uint64_t tag, _Bool verbose);

// Replay every record of `tracefile` against `cache` and return the accumulated counters.
result ReadAndTest(FILE *tracefile, entry_of_sets cache, uint64_t S, uint64_t E, uint64_t s, uint64_t b, _Bool verbose);

// Free all memory owned by `cache`.
void RealseMemory(entry_of_sets cache, uint64_t S, uint64_t E);

主函数读取指令

// Simulation state and command-line configuration.
result Result = {0, 0, 0};
const char *help_message = "Usage: \"Your complied program\" [-hv] -s [S] -E <E> -b <b> -t <tracefile>\n" \
											"<s> <E> <b> should all above zero and below 64.\n" \
											"Complied with std=c99\n";
const char *command_options = "hvs:E:b:t:";
FILE* tracefile = NULL;
entry_of_sets cache = NULL;
_Bool verbose = false;
uint64_t s = 0;		// number of set-index bits
uint64_t b = 0;		// number of block-offset bits
uint64_t S = 0;		// number of sets, 2^s
uint64_t E = 0;		// lines per set

// getopt() returns an int; storing it in a plain char breaks the comparison
// with -1 on platforms where char is unsigned, so the loop would never end.
int ch;
while((ch = getopt(argc, argv, command_options)) != -1){
	switch(ch){
		case 'h':
			printf("%s", help_message);
			exit(EXIT_SUCCESS);
		case 'v':
			verbose = true;
			break;
		case 's':
		case 'E':
		case 'b': {
			long value = atol(optarg);
			// s and b are later used as shift counts on 64-bit values, so they
			// must stay below 64; E only has to be positive.
			if(value <= 0 || (ch != 'E' && value >= 64)){
				printf("%s", help_message);
				exit(EXIT_FAILURE);
			}
			if(ch == 's'){
				s = (uint64_t)value;
				S = (uint64_t)1 << s;	// shift in 64 bits: `1 << s` overflows int for s >= 31
			}else if(ch == 'E'){
				E = (uint64_t)value;
			}else{
				b = (uint64_t)value;
			}
			break;
		}
		case 't':
			if((tracefile = fopen(optarg, "r")) == NULL){
				perror("Failed to open tracefile");
				exit(EXIT_FAILURE);
			}
			break;
		default:
			printf("%s", help_message);
			exit(EXIT_FAILURE);
	}
}
// Every one of -s, -E, -b and -t is mandatory.
if(s == 0 || b == 0 || E == 0 || tracefile == NULL){
	printf("%s", help_message);
	exit(EXIT_FAILURE);
}

调用函数

// Build the cache, replay the trace, release every resource, then report.
cache = InitializeCache(S, E);
Result = ReadAndTest(tracefile, cache, S, E, s, b, verbose);
fclose(tracefile); // remember to close the file
RealseMemory(cache, S, E);
// The counter field of `result` is spelled `eviction`.
printSummary(Result.hit, Result.miss, Result.eviction);

cache的内存分配与释放

entry_of_sets InitializeCache(uint64_t S, uint64_t E){
	entry_of_sets cache;
	if((cache = calloc(S, sizeof(entry_of_lines))) == NULL){
		perror("Failed to calloc entry_of_sets");
		exit(EXIT_FAILURE);
	}
	for(int i = 0; i < S; i++){
		if((cache[i] = calloc(E, sizeof(line))) == NULL){
			perror("Failed to calloc line in sets");
		}
	}
	return cache;
}
/*
 * Release a cache: first the line array of each of the S sets, then the
 * array of set pointers itself. E is accepted but not needed for freeing.
 */
void RealseMemory(entry_of_sets cache, uint64_t S, uint64_t E){
	entry_of_sets cursor = cache;
	uint64_t remaining = S;
	while(remaining-- > 0){
		free(*cursor++);
	}
	free(cache);
}

核心代码-ReadAndTest

/*
 * Replay a trace file against the cache and count hits, misses and evictions.
 *
 * Each record is read as an operation character followed by a hexadecimal
 * address; the rest of the line is skipped. 'L' and 'S' perform one cache
 * access, 'M' performs two (load then store), and every other operation
 * (including instruction fetches, 'I') is ignored.
 *
 * tracefile: open stream positioned at the first record.
 * cache:     cache with 2^s sets of E lines each.
 * s, b:      number of set-index and block-offset bits; both must be below 64
 *            because they are used as shift counts on 64-bit values.
 * verbose:   when set, echo each simulated record before its outcome.
 * Returns the accumulated counters.
 */
result ReadAndTest(FILE *tracefile, entry_of_sets cache, uint64_t S, uint64_t E, uint64_t s, uint64_t b, _Bool verbose){
	result Result = {0, 0, 0};
	// Build the mask once, in 64 bits: `1 << s` is an int shift and overflows for s >= 31.
	const uint64_t set_index_mask = ((uint64_t)1 << s) - 1;
	char ch;
	uint64_t address;
	// SCNx64/PRIx64 match uint64_t on every platform, unlike a bare "%lx".
	while((fscanf(tracefile, " %c %" SCNx64 "%*[^\n]", &ch, &address)) == 2){
		if(ch != 'L' && ch != 'S' && ch != 'M')
			continue;

		uint64_t set_index = (address >> b) & set_index_mask;
		uint64_t tag = (address >> b) >> s;
		entry_of_lines search_line = cache[set_index];

		if(verbose) printf("%c %" PRIx64 " ", ch, address);
		Result = HitMissEviction(search_line, Result, E, tag, verbose);		// load or store
		if(ch == 'M')
			Result = HitMissEviction(search_line, Result, E, tag, verbose);	// store half of a modify, must hit
	}
	return Result;
}

核心代码-HitMissEviction

/*
 * Simulate one access to a single cache set under LRU replacement.
 *
 * search_line: the E lines of the selected set (E must be at least 1).
 * Result:      counters so far; an updated copy is returned.
 * tag:         tag bits of the accessed address.
 * verbose:     when set, print "hit", "miss" or "miss and eviction".
 *
 * time_counter is a per-set logical clock: the line touched most recently
 * always receives (largest counter in the set) + 1, both on a hit and on a
 * fill. Merely incrementing the counter on a hit is not enough, because a
 * re-used line could then tie with, or stay older than, a line that was
 * filled later and be evicted although it is the most recently used one.
 */
result HitMissEviction(entry_of_lines search_line, result Result, uint64_t E, uint64_t tag, _Bool verbose){
	uint64_t oldest_time = UINT64_MAX;
	uint64_t youngest_time = 0;
	uint64_t oldest_block = 0;
	uint64_t hit_block = E;				// E means "no matching line found"

	// One pass gathers the matching line, the victim candidate and the newest stamp.
	for(uint64_t i = 0; i < E; i++){
		if(hit_block == E && search_line[i].valid && search_line[i].tag == tag){
			hit_block = i;
		}
		if(search_line[i].time_counter < oldest_time){
			oldest_time = search_line[i].time_counter;
			oldest_block = i;
		}
		if(search_line[i].time_counter > youngest_time){
			youngest_time = search_line[i].time_counter;
		}
	}

	if(hit_block != E){					// hit: mark the line as most recently used
		if(verbose) printf("hit\n");
		++Result.hit;
		search_line[hit_block].time_counter = youngest_time + 1;
		return Result;
	}

	// Miss: invalid lines still carry counter 0, so they are chosen before any valid line.
	if(verbose) printf("miss");
	++Result.miss;
	search_line[oldest_block].time_counter = youngest_time + 1;
	search_line[oldest_block].tag = tag;
	if(search_line[oldest_block].valid){
		if(verbose) printf(" and eviction\n");
		++Result.eviction;
	}else{
		if(verbose) printf("\n");
		search_line[oldest_block].valid = 1;
	}
	return Result;
}

测试结果:

请添加图片描述

Part B:优化矩阵转置

  • 1kb cache
  • direct-mapped (E = 1)
  • block size is 32 (b = 5)
  • sets 32 (s = 5)
  • 使用blocking优化技术:waside-blocking
M = N = 32

由于block大小为32bytes,故一个block可以放8个int,而block的个数又有32个,一行可以放4个block,所以把cache全部填满可以放8行。每8行会有冲突,因此使用blocking把每个块的大小设置为8。为了提高AB数组的击中率,使用8个局部(没有超过题目要求的最多12个)变量缓存A数组中的值,这样每次内循环只有两次未击中。

未击中计算:
$$\frac{32}{8}\times\frac{32}{8}\times 8\times 2 = 4\times 4\times 8\times 2 = 256$$

/*
 * Transpose the N x M matrix A into the M x N matrix B in 8 x 8 tiles.
 *
 * For every row of a tile, the eight source values are first copied into
 * locals and only afterwards written out as one column segment of B, so the
 * reads from A and the writes to B are never interleaved.
 * The loops do no clipping: M and N are expected to be multiples of 8.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
    int tile_row, tile_col, r;
    int a0, a1, a2, a3, a4, a5, a6, a7;

    for(tile_row = 0; tile_row < N; tile_row += 8){
        for(tile_col = 0; tile_col < M; tile_col += 8){
            for(r = 0; r < 8; r++){
                /* read one full row segment of A */
                a0 = A[tile_row + r][tile_col];
                a1 = A[tile_row + r][tile_col + 1];
                a2 = A[tile_row + r][tile_col + 2];
                a3 = A[tile_row + r][tile_col + 3];
                a4 = A[tile_row + r][tile_col + 4];
                a5 = A[tile_row + r][tile_col + 5];
                a6 = A[tile_row + r][tile_col + 6];
                a7 = A[tile_row + r][tile_col + 7];

                /* store it as a column segment of B */
                B[tile_col][tile_row + r] = a0;
                B[tile_col + 1][tile_row + r] = a1;
                B[tile_col + 2][tile_row + r] = a2;
                B[tile_col + 3][tile_row + r] = a3;
                B[tile_col + 4][tile_row + r] = a4;
                B[tile_col + 5][tile_row + r] = a5;
                B[tile_col + 6][tile_row + r] = a6;
                B[tile_col + 7][tile_row + r] = a7;
            }
        }
    }
}

测试结果:

请添加图片描述
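实测比理论值多了31个miss,多出的部分来自逐出:在对角线上的块中,A的第k行与B的第k行映射到同一个set,读取A的第k行会把已装入的B的第k行逐出,随后写B的第k行时又要重新装入,因此每个对角块多出7次miss,4个对角块共28次;剩下的3次应是函数调用本身的固定开销(推测)。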

M = N = 64

此时,数组一行有64个int,即8个block,所以每四行就会填满cache,即两个元素相差4行就会发生冲突。如果使用和M=N=32时一样的策略,使用4*4的blocking,每次都会有1/2的损失,则miss理论上为:
$$\frac{64}{4}\times\frac{64}{4}\times 4\times 2 = 16\times 16\times 8 = 2048$$

优化不够。如果使用刚刚8*8的blocking,则在写入时就会发生冲突:

请添加图片描述
$$\frac{64}{8}\times\frac{64}{8}\times 8\times(2+1) = 8\times 8\times 8\times 3 = 1536$$
依然优化不够。

此时可以使用“分治思想”,将8*8的块分为4个4*4的块:

请添加图片描述

使用如下方法,在转换3、4部分的同时,将2放入正确位置:

请添加图片描述

/*
 * Transpose the N x M matrix A into the M x N matrix B in 8 x 8 tiles, each
 * handled as four 4 x 4 quadrants.
 *
 * Phase 1 (upper four rows of the A tile): the left half is transposed
 * straight into the upper-left quadrant of the B tile; the right half is
 * parked, transposed and in reversed row order, in the upper-right quadrant.
 * Phase 2 (lower four rows of the A tile): two columns of A are read per
 * step, the matching parked row is moved down into the lower-left quadrant
 * of the B tile, and the freshly read values complete the upper-right and
 * lower-right quadrants.
 * The loops do no clipping: M and N are expected to be multiples of 8.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
    int bi, bj, r, t;
    int v0, v1, v2, v3, v4, v5, v6, v7;

    for(bi = 0; bi < N; bi += 8){
        for(bj = 0; bj < M; bj += 8){
            /* Phase 1: rows bi .. bi+3 of A */
            for(r = 0; r < 4; r++){
                v0 = A[bi + r][bj];
                v1 = A[bi + r][bj + 1];
                v2 = A[bi + r][bj + 2];
                v3 = A[bi + r][bj + 3];
                v4 = A[bi + r][bj + 4];
                v5 = A[bi + r][bj + 5];
                v6 = A[bi + r][bj + 6];
                v7 = A[bi + r][bj + 7];

                B[bj][bi + r] = v0;
                B[bj + 1][bi + r] = v1;
                B[bj + 2][bi + r] = v2;
                B[bj + 3][bi + r] = v3;
                /* parked in reverse order */
                B[bj][bi + r + 4] = v7;
                B[bj + 1][bi + r + 4] = v6;
                B[bj + 2][bi + r + 4] = v5;
                B[bj + 3][bi + r + 4] = v4;
            }
            /* Phase 2: rows bi+4 .. bi+7 of A */
            for(t = 0; t < 4; t++){
                /* read two columns of the lower half of the A tile */
                v0 = A[bi + 4][bj + 3 - t];
                v1 = A[bi + 5][bj + 3 - t];
                v2 = A[bi + 6][bj + 3 - t];
                v3 = A[bi + 7][bj + 3 - t];
                v4 = A[bi + 4][bj + 4 + t];
                v5 = A[bi + 5][bj + 4 + t];
                v6 = A[bi + 6][bj + 4 + t];
                v7 = A[bi + 7][bj + 4 + t];

                /* move the parked row, bottom-up, into the lower-left quadrant */
                B[bj + 4 + t][bi] = B[bj + 3 - t][bi + 4];
                B[bj + 4 + t][bi + 1] = B[bj + 3 - t][bi + 5];
                B[bj + 4 + t][bi + 2] = B[bj + 3 - t][bi + 6];
                B[bj + 4 + t][bi + 3] = B[bj + 3 - t][bi + 7];
                /* fill the upper-right and lower-right quadrants */
                B[bj + 3 - t][bi + 4] = v0;
                B[bj + 3 - t][bi + 5] = v1;
                B[bj + 3 - t][bi + 6] = v2;
                B[bj + 3 - t][bi + 7] = v3;
                B[bj + 4 + t][bi + 4] = v4;
                B[bj + 4 + t][bi + 5] = v5;
                B[bj + 4 + t][bi + 6] = v6;
                B[bj + 4 + t][bi + 7] = v7;
            }
        }
    }
}

测试结果:

请添加图片描述

M = 61 N = 67

此题比较宽松,使用16*16的blocking,仅处理了对角线上的冲突。

/*
 * Transpose the N x M matrix A into the M x N matrix B in 16 x 16 tiles,
 * clipping the last tile in each direction so arbitrary sizes work.
 *
 * While a row of A is being copied, an element on the main diagonal is held
 * back in a local and written to B only after the rest of the row segment.
 */
void transpose_submit(int M, int N, int A[N][M], int B[M][N]){
    for(int tile_row = 0; tile_row < N; tile_row += 16){
        for(int tile_col = 0; tile_col < M; tile_col += 16){
            for(int r = tile_row; r < tile_row + 16 && r < N; r++){
                int diag_index = -1;    /* -1: this row segment has no diagonal element */
                int diag_value = 0;
                for(int c = tile_col; c < tile_col + 16 && c < M; c++){
                    if(c != r){
                        B[c][r] = A[r][c];
                        continue;
                    }
                    /* diagonal element: defer the store */
                    diag_index = r;
                    diag_value = A[r][r];
                }
                if(diag_index == -1){
                    continue;
                }
                B[diag_index][diag_index] = diag_value;
            }
        }
    }
}

测试结果:
请添加图片描述

最终满分结果:

请添加图片描述

参考:CS:APP3e 深入理解计算机系统_3e CacheLab实验

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

刷子c

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值