cpu cache valgrind cachegrind测试

最新推荐文章于 2024-06-02 16:01:00 发布

guoguangwu

最新推荐文章于 2024-06-02 16:01:00 发布

阅读量465

点赞数

本文链接：https://blog.csdn.net/guoguangwu/article/details/109446213

版权

#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#include <stdlib.h>

int  MAX_SIZE = 10240;

/*
 * Print both timestamps and the elapsed time between them in microseconds.
 *
 * tv_sec / tv_usec are time_t / suseconds_t, not int, so they are cast to
 * types with a matching printf conversion. The elapsed value is computed in
 * double because a float cannot represent large microsecond counts exactly.
 */
static void report_interval(const struct timeval *start, const struct timeval *end)
{
	double time_use;

	printf("start.tv_sec:%lld\n", (long long)start->tv_sec);
	printf("start.tv_usec:%ld\n", (long)start->tv_usec);
	printf("end.tv_sec:%lld\n", (long long)end->tv_sec);
	printf("end.tv_usec:%ld\n", (long)end->tv_usec);
	time_use = (double)(end->tv_sec - start->tv_sec) * 1000000.0
		+ (double)(end->tv_usec - start->tv_usec); /* microseconds */
	printf("time_use is %f\n", time_use);
}

/*
 * Cache-locality demo: fill a MAX_SIZE x MAX_SIZE table twice and time each
 * pass with gettimeofday().
 *
 *   pass 1 writes array[i][j] - the inner loop walks along one row buffer;
 *   pass 2 writes array[j][i] - the inner loop jumps to a different,
 *          separately malloc'd row buffer on every store.
 *
 * The row count is read from stdin into the global MAX_SIZE; if no number
 * can be parsed the compiled-in default is kept.
 *
 * NOTE(review): pass 1 is the first code to touch the freshly malloc'd rows,
 * so its time may also include first-touch page-fault cost - confirm before
 * comparing the two numbers too literally.
 *
 * Returns 0 on success, 1 on invalid input or allocation failure.
 */
int main(void)
{
	struct timeval start;
	struct timeval end;
	int **array = NULL;
	int rows = 0;		/* number of row buffers successfully allocated */
	int i, j;
	unsigned int num = 0;	/* unsigned: wrap-around is defined for huge tables */
	int rc = 1;

	printf("请输入行数\n");
	if (scanf("%d", &MAX_SIZE) != 1) {
		fprintf(stderr, "could not read a row count; using %d\n", MAX_SIZE);
	}
	if (MAX_SIZE <= 0) {
		fprintf(stderr, "row count must be positive\n");
		return 1;
	}
	/* Guard the size computations below against size_t overflow. */
	if ((size_t)MAX_SIZE > (size_t)-1 / sizeof *array
	    || (size_t)MAX_SIZE > (size_t)-1 / sizeof **array) {
		fprintf(stderr, "row count too large\n");
		return 1;
	}

	array = malloc((size_t)MAX_SIZE * sizeof *array);
	if (array == NULL) {
		perror("malloc");
		return 1;
	}
	for (rows = 0; rows < MAX_SIZE; rows++) {
		array[rows] = malloc((size_t)MAX_SIZE * sizeof *array[rows]);
		if (array[rows] == NULL) {
			perror("malloc");
			goto out;
		}
	}

	/* Pass 1: row-major. The timestamps are printed only after the pass so
	 * that printf's I/O is not part of the measured interval. */
	gettimeofday(&start, NULL); /* passing a timezone struct instead of NULL gives the same result */
	for (i = 0; i < MAX_SIZE; i++) {
		for (j = 0; j < MAX_SIZE; j++) {
			/* identical to the old int counter while the value fits in int */
			array[i][j] = (int)num++;
		}
	}
	gettimeofday(&end, NULL);
	report_interval(&start, &end);

	/* Pass 2: column-major, the cache-miss-heavy access pattern. */
	num = 0;
	gettimeofday(&start, NULL);
	for (i = 0; i < MAX_SIZE; i++) {
		for (j = 0; j < MAX_SIZE; j++) {
			array[j][i] = (int)num++;
		}
	}
	gettimeofday(&end, NULL);
	report_interval(&start, &end);

	rc = 0;
out:
	for (i = 0; i < rows; i++) {
		free(array[i]);
	}
	free(array);
	return rc;
}

测试结果:

两个循环的耗时相差约一倍。原因是第二个循环按列访问（array[j][i]），相邻两次写入落在不同行的内存块中，不是访问连续内存，已加载的缓存行无法复用，导致大量 cache miss，需要反复从下一级缓存或内存重新获取数据，因此性能比较差。

valgrind 统计性能数据

cache miss 程序

#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#include <stdlib.h>

int  MAX_SIZE = 10240;

/*
 * Cache-miss variant of the demo: fill a MAX_SIZE x MAX_SIZE table in
 * column-major order (array[j][i]) and time the pass with gettimeofday().
 * Every store in the inner loop goes to a different, separately malloc'd
 * row buffer.
 *
 * The row count is read from stdin into the global MAX_SIZE; if no number
 * can be parsed the compiled-in default is kept.
 *
 * Returns 0 on success, 1 on invalid input or allocation failure.
 */
int main(void)
{
	double time_use;	/* double: a float cannot hold large microsecond counts exactly */
	struct timeval start;
	struct timeval end;
	int **array = NULL;
	int rows = 0;		/* number of row buffers successfully allocated */
	int i, j;
	unsigned int num = 0;	/* unsigned: wrap-around is defined for huge tables */
	int rc = 1;

	printf("请输入行数\n");
	if (scanf("%d", &MAX_SIZE) != 1) {
		fprintf(stderr, "could not read a row count; using %d\n", MAX_SIZE);
	}
	if (MAX_SIZE <= 0) {
		fprintf(stderr, "row count must be positive\n");
		return 1;
	}
	/* Guard the size computations below against size_t overflow. */
	if ((size_t)MAX_SIZE > (size_t)-1 / sizeof *array
	    || (size_t)MAX_SIZE > (size_t)-1 / sizeof **array) {
		fprintf(stderr, "row count too large\n");
		return 1;
	}

	array = malloc((size_t)MAX_SIZE * sizeof *array);
	if (array == NULL) {
		perror("malloc");
		return 1;
	}
	for (rows = 0; rows < MAX_SIZE; rows++) {
		array[rows] = malloc((size_t)MAX_SIZE * sizeof *array[rows]);
		if (array[rows] == NULL) {
			perror("malloc");
			goto out;
		}
	}

	gettimeofday(&start, NULL); /* passing a timezone struct instead of NULL gives the same result */
	for (i = 0; i < MAX_SIZE; i++) {
		for (j = 0; j < MAX_SIZE; j++) {
			/* identical to the old int counter while the value fits in int */
			array[j][i] = (int)num++;
		}
	}
	gettimeofday(&end, NULL);

	/* Printed only after both timestamps are taken so that printf's I/O is
	 * not part of the measured interval. tv_sec / tv_usec are time_t /
	 * suseconds_t, hence the casts to types with a matching conversion. */
	printf("start.tv_sec:%lld\n", (long long)start.tv_sec);
	printf("start.tv_usec:%ld\n", (long)start.tv_usec);
	printf("end.tv_sec:%lld\n", (long long)end.tv_sec);
	printf("end.tv_usec:%ld\n", (long)end.tv_usec);
	time_use = (double)(end.tv_sec - start.tv_sec) * 1000000.0
		+ (double)(end.tv_usec - start.tv_usec); /* microseconds */
	printf("time_use is %f\n", time_use);

	rc = 0;
out:
	for (i = 0; i < rows; i++) {
		free(array[i]);
	}
	free(array);
	return rc;
}

测试结果

x03430-a@x03430-a:~/test/dpdk$ valgrind --tool=cachegrind  ./cache_test2
==16617== Cachegrind, a cache and branch-prediction profiler
==16617== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==16617== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==16617== Command: ./cache_test2
==16617== 
--16617-- warning: L3 cache found, using its data for the LL simulation.
请输入行数
1024
start.tv_sec:1604299021
start.tv_usec:132371
end.tv_sec:1604299021
end.tv_usec:214520
time_use is 82149.000000
==16617== 
==16617== I   refs:      19,310,723
==16617== I1  misses:         1,291
==16617== LLi misses:         1,252
==16617== I1  miss rate:       0.01%
==16617== LLi miss rate:       0.01%
==16617== 
==16617== D   refs:      10,629,320  (8,483,206 rd   + 2,146,114 wr)
==16617== D1  misses:     1,184,197  (  133,778 rd   + 1,050,419 wr)
==16617== LLd misses:        68,554  (    2,073 rd   +    66,481 wr)
==16617== D1  miss rate:       11.1% (      1.6%     +      48.9%  )
==16617== LLd miss rate:        0.6% (      0.0%     +       3.1%  )
==16617== 
==16617== LL refs:        1,185,488  (  135,069 rd   + 1,050,419 wr)
==16617== LL misses:         69,806  (    3,325 rd   +    66,481 wr)
==16617== LL miss rate:         0.2% (      0.0%     +       3.1%  )

正常代码:

#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#include <stdlib.h>

int  MAX_SIZE = 10240;

/*
 * Baseline variant of the demo: fill a MAX_SIZE x MAX_SIZE table in
 * row-major order (array[i][j]) and time the pass with gettimeofday().
 * The inner loop walks along a single row buffer.
 *
 * The row count is read from stdin into the global MAX_SIZE; if no number
 * can be parsed the compiled-in default is kept.
 *
 * Returns 0 on success, 1 on invalid input or allocation failure.
 */
int main(void)
{
	double time_use;	/* double: a float cannot hold large microsecond counts exactly */
	struct timeval start;
	struct timeval end;
	int **array = NULL;
	int rows = 0;		/* number of row buffers successfully allocated */
	int i, j;
	unsigned int num = 0;	/* unsigned: wrap-around is defined for huge tables */
	int rc = 1;

	printf("请输入行数\n");
	if (scanf("%d", &MAX_SIZE) != 1) {
		fprintf(stderr, "could not read a row count; using %d\n", MAX_SIZE);
	}
	if (MAX_SIZE <= 0) {
		fprintf(stderr, "row count must be positive\n");
		return 1;
	}
	/* Guard the size computations below against size_t overflow. */
	if ((size_t)MAX_SIZE > (size_t)-1 / sizeof *array
	    || (size_t)MAX_SIZE > (size_t)-1 / sizeof **array) {
		fprintf(stderr, "row count too large\n");
		return 1;
	}

	array = malloc((size_t)MAX_SIZE * sizeof *array);
	if (array == NULL) {
		perror("malloc");
		return 1;
	}
	for (rows = 0; rows < MAX_SIZE; rows++) {
		array[rows] = malloc((size_t)MAX_SIZE * sizeof *array[rows]);
		if (array[rows] == NULL) {
			perror("malloc");
			goto out;
		}
	}

	gettimeofday(&start, NULL); /* passing a timezone struct instead of NULL gives the same result */
	for (i = 0; i < MAX_SIZE; i++) {
		for (j = 0; j < MAX_SIZE; j++) {
			/* identical to the old int counter while the value fits in int */
			array[i][j] = (int)num++;
		}
	}
	gettimeofday(&end, NULL);

	/* Printed only after both timestamps are taken so that printf's I/O is
	 * not part of the measured interval. tv_sec / tv_usec are time_t /
	 * suseconds_t, hence the casts to types with a matching conversion. */
	printf("start.tv_sec:%lld\n", (long long)start.tv_sec);
	printf("start.tv_usec:%ld\n", (long)start.tv_usec);
	printf("end.tv_sec:%lld\n", (long long)end.tv_sec);
	printf("end.tv_usec:%ld\n", (long)end.tv_usec);
	time_use = (double)(end.tv_sec - start.tv_sec) * 1000000.0
		+ (double)(end.tv_usec - start.tv_usec); /* microseconds */
	printf("time_use is %f\n", time_use);

	rc = 0;
out:
	for (i = 0; i < rows; i++) {
		free(array[i]);
	}
	free(array);
	return rc;
}

测试结果:

x03430-a@x03430-a:~/test/dpdk$ valgrind --tool=cachegrind  ./cache_test3
==16904== Cachegrind, a cache and branch-prediction profiler
==16904== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==16904== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==16904== Command: ./cache_test3
==16904== 
--16904-- warning: L3 cache found, using its data for the LL simulation.
请输入行数
1024
start.tv_sec:1604299026
start.tv_usec:556480
end.tv_sec:1604299026
end.tv_usec:624710
time_use is 68230.000000
==16904== 
==16904== I   refs:      19,310,708
==16904== I1  misses:         1,290
==16904== LLi misses:         1,251
==16904== I1  miss rate:       0.01%
==16904== LLi miss rate:       0.01%
==16904== 
==16904== D   refs:      10,629,314  (8,483,203 rd   + 2,146,111 wr)
==16904== D1  misses:        70,468  (    2,833 rd   +    67,635 wr)
==16904== LLd misses:        68,554  (    2,073 rd   +    66,481 wr)
==16904== D1  miss rate:        0.7% (      0.0%     +       3.2%  )
==16904== LLd miss rate:        0.6% (      0.0%     +       3.1%  )
==16904== 
==16904== LL refs:           71,758  (    4,123 rd   +    67,635 wr)
==16904== LL misses:         69,805  (    3,324 rd   +    66,481 wr)
==16904== LL miss rate:         0.2% (      0.0%     +       3.1%  )

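对比两次 cachegrind 的输出：按列访问的程序 D1 miss 为 1,184,197 次（miss rate 11.1%，其中写操作 miss rate 高达 48.9%），而按行访问的程序只有 70,468 次（miss rate 0.7%）；两者的 LLd miss 相同（均为 68,554 次），说明差异主要来自 L1 数据缓存未命中。可以对比出来，cache miss 对程序的性能影响还是很大的。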

guoguangwu

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
cpu cache valgrind cachegrind测试

#include <stdio.h>#include <unistd.h>#include <sys/time.h>#include <stdlib.h>int MAX_SIZE = 10240;int main(){ float time_use=0; struct timeval start; struct timeval end; int **array = NULL; int i,j, num = 0; .
复制链接

扫一扫