参考 陶辉极客时间 https://time.geekbang.org/column/article/230194
代码示例
#include "stdio.h"
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
#include <unistd.h>
#include <pthread.h>
#include <sched.h>
using namespace std;
// Size in bytes of each worker's array (16 KiB); the same value is also used
// as the bound of the outer pass loop in loopcalc. The body is parenthesized
// so the macro expands as a single operand inside larger expressions
// (e.g. `x % TESTN`).
#define TESTN (16*1024L)
// Whether each worker thread pins itself to one CPU; toggled by the -f / -s
// command-line options.
bool setaffinity = false;
// Thread entry point: optionally pins the calling thread to one CPU, then
// times a loop that repeatedly increments every byte of a small array.
//
// args points to a long owned by the caller. On entry it holds the thread's
// index, which is used as the CPU number when setaffinity is true. On exit it
// is overwritten with the elapsed wall-clock time in milliseconds.
// Always returns NULL.
void* loopcalc(void* args) {
    long* slot = static_cast<long*>(args);
    if (setaffinity) {
        // The caller stored the index as a long, so read it back as a long.
        // Reading it through an int* would only yield the right value on
        // little-endian hosts.
        const long cpu_index = *slot;
        cpu_set_t mask; // set of CPUs this thread is allowed to run on
        // Bind the current thread to the single CPU matching its index.
        CPU_ZERO(&mask);
        CPU_SET(cpu_index, &mask);
        // A pid of 0 applies the mask to the calling thread.
        if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
        {
            cout<<"warning: could not set CPU affinity, continuing...\n";
        }
    }
    timeval tStart,tEnd;
    // clock() is deliberately not used here: it reports CPU time consumed by
    // the whole process, summed over all CPUs, which is not what this
    // per-thread measurement needs.
    gettimeofday(&tStart, 0);
    // The loop revisits the same small array over and over, so the CPU cache
    // hit rate is very high.
    unsigned char* arr = new unsigned char[TESTN];
    for (long i = 0; i < TESTN; i++) arr[i] = rand() % 256;
    for (int j = 1; j < TESTN; j++) {
        for (long i = 0; i < TESTN; i++) arr[i] += 1;
    }
    gettimeofday(&tEnd, 0);
    // Release the buffer after the end timestamp so the measured interval is
    // unchanged.
    delete[] arr;
    // Hand the elapsed time (microseconds converted to milliseconds) back to
    // the caller through the same slot that carried the thread index.
    *slot = (1000000LL * (tEnd.tv_sec-tStart.tv_sec) + (tEnd.tv_usec-tStart.tv_usec))/1000;
    // A function returning void* must return a value; falling off the end is
    // undefined behaviour.
    return NULL;
}
int main(int argc, char** argv) {
int threadnum = 2;
int ch;
while((ch = getopt(argc, argv, "t:fs")) != -1) {
switch(ch)
{
//设置测试的并发线程数,注意不要超过机器上的CPU核数
case 't':
threadnum = atoi(optarg);
break;
//将线程绑定至特定CPU上
case 'f':
setaffinity = true;
break;
//不绑定CPU
case 's':
setaffinity = false;
break;
}
}
pthread_t* id = new pthread_t[threadnum];
//统计每个线程计算所需要的时间
long* timecost = new long[threadnum];
for(int i = 0; i < threadnum; i++) {
//最初timecost用于传递线程号,用于绑定CPU
timecost[i] = i;
int ret=pthread_create(&id[i],NULL,loopcalc,&timecost[i]);
if(ret!=0){
cout<<"Create pthread error!\n";
exit (1);
}
}
long costsum = 0;
//等待所有线程结束
for(int i = 0; i < threadnum; i++) {
pthread_join(id[i],NULL);
costsum += timecost[i];
}
//比较平均每线程所用时间
cout<<"costsum: "<<costsum<<", avg: "<<costsum/threadnum<<endl;
}
1. 验证环境
操作系统: CentOS7.0
CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
GCC-C++: 4.8.5
JAVA: 1.8.0
Python: 2.7.5
2. C++程序cpu_migrate.cpp
a. 编译程序
安装编译依赖的软件
如Linux中需要安装C++编译器:CentOS中可用yum install gcc-c++安装,Ubuntu中可用apt-get install g++安装
编译程序
g++ cpu_migrate.cpp -o cpu_migrate -lpthread
注意,多线程依赖pthread库,编译时需要链接
b. 运行验证
使用14个(共28个CPU核心)并发线程测试,不绑定CPU
./cpu_migrate -t 14 -s
平均每线程消耗时间(毫秒):1083
使用14个(共28个CPU核心)并发线程测试,绑定CPU
./cpu_migrate -t 14 -f
平均每线程消耗时间(毫秒):926
c. 使用perf验证缓存命中率
使用14个(共28个CPU核心)并发线程测试,不绑定CPU
perf stat -e cpu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -s
输出结果:
Performance counter stats for './cpu_migrate -t 14 -s':
10 cpu-migrations
8,193,825 cache-references (44.40%)
175,792 cache-misses # 2.145 % of all cache refs (44.34%)
45,480,238,906 instructions # 1.30 insn per cycle (55.47%)
35,111,144,560 cycles (55.47%)
11,997,428 L1-dcache-load-misses # 0.05% of all L1-dcache hits (55.57%)
26,407,960,253 L1-dcache-loads (55.60%)
2,459,766 L1-icache-load-misses (55.66%)
2,136,304 branch-load-misses (44.53%)
3,825,848,726 branch-loads (44.43%)
1.251076337 seconds time elapsed
14.630618000 seconds user
0.459616000 seconds sys
使用14个(共28个CPU核心)并发线程测试,绑定CPU
perf stat -e cpu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -f
输出结果:
Performance counter stats for './cpu_migrate -t 14 -f':
14 cpu-migrations
4,983,541 cache-references (44.42%)
1,611,627 cache-misses # 32.339 % of all cache refs (44.34%)
45,523,818,723 instructions # 1.52 insn per cycle (55.43%)
29,972,627,158 cycles (55.46%)
5,812,831 L1-dcache-load-misses # 0.02% of all L1-dcache hits (55.53%)
26,388,005,477 L1-dcache-loads (55.58%)
1,262,533 L1-icache-load-misses (55.66%)
1,363,376 branch-load-misses (44.54%)
3,828,570,015 branch-loads (44.47%)
0.948650967 seconds time elapsed
12.489932000 seconds user
0.456253000 seconds sys