// Observe how data-cache misses change the execution speed of a fixed workload.
//
// Parameter:
//   buf - int buffer that is only read. It must cover at least 2*64*256 bytes:
//         the widest sweep touches 2*64 addresses spaced 256 bytes apart.
//
// The cache geometry constants below were taken by the original author from
// the ARM920T cache structure.
//
// For every inner-loop length xx in [1, 2*64] that divides 4096 evenly, the
// function issues the same number of loads (4096 repetitions of 4096 loads),
// but spreads them over xx distinct addresses that are one "set stride" apart.
// The elapsed time of each sweep is printed on its own line.
//
// SetTimeMark / GetElapseTime are timing helpers defined elsewhere in the
// project; the output labels their result as microseconds.
void cache_miss_control(int *buf)
{
    enum {
        CACHE_SIZE = 16 * 1024,      // cache size = 16 KB
        CACHE_LINE_SIZE = 32,        // cache line size = 32 bytes
        CACHE_SHARE_LINE_CNT = 64,   // lines sharing one cache region (64-way)
        // Distance in bytes between two addresses that compete for the same
        // 64-way region: line size * (number of such regions) = 256 bytes.
        CACHE_INDEPENDENT_ZONE =
            CACHE_LINE_SIZE * (CACHE_SIZE / (CACHE_LINE_SIZE * CACHE_SHARE_LINE_CNT)),
        CALC_CAP = 4096,             // loads per repetition (kept constant)
        REPEAT_CNT = 1024 * 4        // repetitions, to make the time measurable
    };
    // Pointer step in ints, so that every access lands in the same 64-way region.
    const int stride = (int)(CACHE_INDEPENDENT_ZONE / sizeof(int));

    long tm = 0;    // start-time mark
    long total;     // elapsed time of one sweep
    int sink = 0;   // destination of the measured loads
    int loop, xx, yy, i, j;
    // Read through a volatile-qualified pointer: otherwise the compiler is free
    // to drop loads whose result is never used, and the benchmark would time
    // empty loops.
    const volatile int *p;

    // Vary the inner loop length to control how many cache lines get touched.
    for (xx = 2 * CACHE_SHARE_LINE_CNT; xx >= 1; --xx) {
        // Only use lengths that keep the total number of loads unchanged.
        if (CALC_CAP % xx != 0) {
            continue;
        }
        yy = CALC_CAP / xx;

        SetTimeMark(tm); // start timing
        for (loop = 0; loop < REPEAT_CNT; ++loop) {
            for (i = 0; i < yy; ++i) {       // outer loop
                p = buf;
                for (j = 0; j < xx; ++j) {   // inner loop
                    sink = *p;
                    p += stride;
                }
            }
        }
        total = GetElapseTime(tm);

        // Report the timing result, one line per inner loop length.
        printf("takes %ld(uS)!(inner loop = %d)\n", total, xx);
    }

    (void)sink; // the loaded value itself is irrelevant
}
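/*
Results (the original author could not explain them: the expectation was that
execution would gradually slow down as the inner loop count approached 64.
An analysis of this code would be much appreciated.)
takes 378484(uS)!(inner loop = 1)
takes 273772(uS)!(inner loop = 2)
takes 220790(uS)!(inner loop = 4)
takes 194486(uS)!(inner loop = 8)
takes 181388(uS)!(inner loop = 16)
takes 174774(uS)!(inner loop = 32)
takes 171456(uS)!(inner loop = 64)

Note (review, to be confirmed on hardware): the number of loads per measurement
is constant, but the outer loop runs 4096/(inner loop) times, so small inner
loop counts carry far more loop overhead. That overhead alone is a likely cause
of the trend above. Up to 64 addresses mapping to one 64-way region can all
stay resident, so conflict misses would only be expected beyond 64 (the
inner loop = 128 case is not listed here).
*/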