在ubuntu下生成大文件,并测试文件读写效率

最新推荐文章于 2024-06-07 09:47:54 发布

帐前卒

最新推荐文章于 2024-06-07 09:47:54 发布

阅读量2.9k

点赞数 1

分类专栏： c/c++ 数据库学习文章标签：测试 ubuntu random file timezone struct

本文链接：https://blog.csdn.net/cctt_1/article/details/5618652

版权

c/c++ 同时被 2 个专栏收录

76 篇文章 0 订阅

订阅专栏

数据库学习

27 篇文章 0 订阅

订阅专栏

在ubuntu或者其他的linux下生成大文件时,在32位系统上必须使用编译参数 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64,否则生成的文件最大只能到2G左右。原因在于fseek:这个函数的offset参数是long型,在32位系统上只有32位;fseeko的offset参数是off_t,但off_t在32位系统上默认也还是32位,只有加了 -D_FILE_OFFSET_BITS=64 的宏定义才会被当作64位对待。(在64位系统上,long和off_t本来就是64位。)

写了下面一段代码来生成大文件。传说有一种方法:直接跳转到nG位置之后,然后写入一个字符(这样得到的是稀疏文件,中间的空洞并不实际占用磁盘块)。这种方法可能更快,不过我为了保守地生成文件,还是使用下面的写法:

/*
 * Generates a large test file by writing fixed-size blocks back to back.
 *
 * Large-file support is requested here so that files beyond 2 GiB also work
 * on 32-bit systems; passing -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 on
 * the command line remains compatible with these guarded definitions.
 */
#ifndef _LARGEFILE_SOURCE
#define _LARGEFILE_SOURCE
#endif
#ifndef _FILE_OFFSET_BITS
#define _FILE_OFFSET_BITS 64
#endif

#include<stdio.h>
#include<stdlib.h>
#include<unistd.h>
#include<string.h>

/*
 * Create (or truncate) `filename` and fill it with `count` blocks of
 * `block_size` bytes each, every byte being the character '1'.
 *
 * The resulting file is exactly block_size * count bytes long and contains
 * no holes. Progress is printed after every 10th block. Any failure
 * (bad arguments, fopen, malloc, fwrite, fclose) is reported on stderr and
 * terminates the process with exit status 1.
 */
void gen_file(const char *filename, int block_size, int count){
    FILE *file;
    char *buf;
    long long size = 0;   /* bytes written so far */
    int i;

    if(block_size <= 0 || count < 0){
        fprintf(stderr,"fatal error in gen_file: invalid block_size or count\n");
        exit(1);
    }

    file = fopen(filename,"w+b");
    if(file == NULL){
        perror("fatal error in gen_file: fopen");
        exit(1);
    }

    buf = malloc((size_t)block_size);
    if(buf == NULL){
        perror("fatal error in gen_file: malloc");
        fclose(file);
        exit(1);
    }
    memset(buf,'1',(size_t)block_size);

    /*
     * Plain sequential writes: the stream position already advances by
     * block_size after each fwrite, so no explicit seek is needed. Seeking
     * past the current position here would leave an unwritten hole.
     */
    for(i = 1; i <= count; i++){
        if(fwrite(buf,(size_t)block_size,1,file) != 1){
            perror("fatal error in gen_file: fwrite");
            free(buf);
            fclose(file);
            exit(1);
        }
        size += block_size;
        if(i % 10 == 0){
            printf("count is %d, size is %lld\n",i,size);
        }
    }

    /* fclose flushes buffered data, so a failure here means lost writes. */
    if(fclose(file) != 0){
        perror("fatal error in gen_file: fclose");
        free(buf);
        exit(1);
    }
    free(buf);
}

int main(void){
    int block_size = 1024*1024*20; /* 20 MiB written per block */
    int count = 50;

    printf("size of char %zu\n",sizeof(char));
    gen_file("smallfile",block_size,count*2); /* 100 blocks, roughly 2G */
    return 0;
}

int block_size = 1024*1024*20; // 20 MiB: the amount of data passed to each fwrite call

这个是每次写入的数据大小,gen_file("smallfile",block_size,count*2);第三个参数可以调节生成文件的大小.

下面写了一个测试大文件和小文件的随机读写和顺序读写.

/*
 * Benchmark comparing random and sequential block access on a large file,
 * on a single smaller file, and on several smaller files.
 *
 * Large-file support is requested here so that offsets beyond 2 GiB work on
 * 32-bit systems; passing -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 on the
 * command line remains compatible with these guarded definitions.
 */
#ifndef _LARGEFILE_SOURCE
#define _LARGEFILE_SOURCE
#endif
#ifndef _FILE_OFFSET_BITS
#define _FILE_OFFSET_BITS 64
#endif

#include<stdio.h>
#include<time.h>
#include<sys/stat.h>
#include<sys/types.h>
#include<unistd.h>
#include<stdlib.h>
#include<sys/time.h>
#include<string.h>

/* Size in bytes of one access unit.
 * NOTE(review): 8092 looks like a typo for 8192 -- confirm before changing. */
const int block_size = 8092;

/* Wall-clock timestamps bracketing the section being measured. */
static struct timeval t_start;
static struct timeval t_end;

/* Record the start of a measured section. */
void start_time(void){
    /* The timezone argument of gettimeofday is obsolete; NULL is the
     * recommended value. */
    gettimeofday(&t_start,NULL);
}

/* Record the end of a measured section. */
void end_time(void){
    gettimeofday(&t_end,NULL);
}

/* Print the elapsed wall-clock time between start_time() and end_time(). */
void print_time(void){
    double secs = (double)(t_end.tv_sec - t_start.tv_sec)
                + (double)(t_end.tv_usec - t_start.tv_usec)/1000000.0;
    printf("%f sec\n",secs);
}

/*
 * Return the size of `filename` in bytes.
 * Uses off_t so sizes above 4 GiB are not truncated; exits with status 1 if
 * stat fails.
 */
off_t look_size(const char * filename){
    struct stat st;
    if(stat(filename,&st) != 0){
        perror(filename);
        exit(1);
    }
    return st.st_size;
}

/*
 * Return how many whole blocks of `block_size` bytes `filename` holds.
 * Exits with status 1 if the file is smaller than one block, because the
 * callers use the result as a modulus.
 */
static off_t block_count(const char * filename){
    off_t blocks = look_size(filename)/block_size;
    if(blocks <= 0){
        fprintf(stderr,"fatal error: %s is smaller than one block (%d bytes)\n",
                filename,block_size);
        exit(1);
    }
    return blocks;
}

/*
 * Seek to block number `offset` (in units of `block_size` bytes) and read
 * that block from `file`. The data itself is discarded; only the access is
 * of interest. Exits with status 1 on any failure.
 */
void read_blocks(FILE* file, off_t offset,int block_size){
    /* Multiply in off_t: an int product overflows past 2 GiB. */
    off_t where = offset * (off_t)block_size;
    char *buf;

    if(block_size <= 0){
        fprintf(stderr,"fatal error in read_blocks: invalid block_size\n");
        exit(1);
    }
    buf = malloc((size_t)block_size);
    if(buf == NULL){
        perror("fatal error in read_blocks: malloc");
        exit(1);
    }
    if(fseeko(file,where,SEEK_SET) != 0){
        perror("fatal error in read_blocks: fseeko");
        exit(1);
    }
    /* Read explicitly so the access does not depend on whether the C
     * library touches the file while seeking. */
    if(fread(buf,1,(size_t)block_size,file) != (size_t)block_size){
        fprintf(stderr,"fatal error in read_blocks: short read\n");
        exit(1);
    }
    free(buf);
}

/* Open `filename` for reading; exit with status 1 if that fails. */
FILE* fopen_safe(const char * filename){
    FILE* file = fopen(filename,"r");
    if(file == NULL){
        fprintf(stderr,"fatal error, open file: ");
        perror(filename);
        exit(1);
    }
    return file;
}

/* Open `filename`, read one randomly chosen block, and close it again. */
void random_read(const char * filename){
    FILE* file = fopen_safe(filename);
    off_t blocks = block_count(filename);
    off_t pick = (off_t)rand() % blocks;

    read_blocks(file,pick,block_size);
    fclose(file);
}

/* Open `filename`, read block number `order` (wrapped to the number of
 * blocks in the file), and close it again. */
void order_read(const char *filename,int order){
    FILE* file = fopen_safe(filename);
    off_t blocks = block_count(filename);

    read_blocks(file,(off_t)order % blocks,block_size);
    fclose(file);
}

/* Time `times` random single-block reads of `filename`. */
void random_test(const char * filename,int times){
    int count;

    start_time();
    for(count = 1; count <= times; count++){
        random_read(filename);
    }
    end_time();
    print_time();
}

/* Time `times` reads of consecutive blocks 1, 2, 3, ... of `filename`. */
void order_test(const char * filename, int times){
    int count;

    start_time();
    for(count = 1; count <= times; count++){
        order_read(filename,count);
    }
    end_time();
    print_time();
}

/*
 * Time random reads spread over `num` files named "<basename>_0" ...
 * "<basename>_<num-1>".
 *
 * The first 90% of the `times` iterations read only file 0, the next 6%
 * read two files per iteration, and the remaining 4% read three files per
 * iteration.
 */
void random_test_diff(const char * basename,int num,int times){
    char **names;
    int one_file_end = (int)((long long)times*90/100);
    int two_file_end = (int)((long long)times*96/100);
    int count;
    int i;

    if(num <= 0){
        fprintf(stderr,"fatal error in random_test_diff: num must be positive\n");
        exit(1);
    }

    /* One pointer per file name (not one byte per name). */
    names = calloc((size_t)num,sizeof *names);
    if(names == NULL){
        perror("fatal error in random_test_diff: calloc");
        exit(1);
    }
    for(i = 0; i < num; i++){
        /* Ask snprintf for the exact length instead of guessing it. */
        int need = snprintf(NULL,0,"%s_%d",basename,i);
        if(need < 0){
            fprintf(stderr,"fatal error in random_test_diff: snprintf\n");
            exit(1);
        }
        names[i] = malloc((size_t)need + 1);
        if(names[i] == NULL){
            perror("fatal error in random_test_diff: malloc");
            exit(1);
        }
        snprintf(names[i],(size_t)need + 1,"%s_%d",basename,i);
    }

    start_time();
    for(count = 1; count <= times; count++){
        if(count < one_file_end){
            random_read(names[0]);
        } else if(count < two_file_end){
            random_read(names[count%num]);
            random_read(names[(count+1)%num]);
        } else {
            random_read(names[count%num]);
            random_read(names[(count+1)%num]);
            random_read(names[(count+2)%num]);
        }
    }
    end_time();
    print_time();

    for(i = 0; i < num; i++){
        free(names[i]);
    }
    free(names);
}

int main(void){
    int times = 1000;
    const char * filename = "bigfile";

    srand((unsigned)time(NULL));

    random_test(filename,times);
    order_test(filename,times);

    filename = "smallfile";
    random_test(filename,times);
    order_test(filename,times);
    random_test_diff(filename,3,times);
    return 0;
}

当然,这里做测试的时候也可以把gettimeofday替换为clock()或者time()函数。其中clock()得到的是程序实际占用CPU的时间:你感觉程序可能执行了七八秒,其实它只占用CPU执行了1秒。而time()函数或者gettimeofday()函数得到的是真实流逝的时间(墙上时钟时间),也就是你感觉到的执行时间。两者各有千秋:前者可以排除运行环境中其他软件或者os的影响,单独检测程序本身的效率;后者反映的是程序在实际系统环境中的执行时间。

下面是我得到的结论,创建1个10G文件,创建3个2G小文件.

大文件随机访问7.199645 sec
大文件顺序访问0.013617 sec
单个小文件随机访问5.119245 sec
小文件顺序访问0.014190 sec
多个小文件的随机访问测试:90%的几率访问同一个小文件，6%访问两个小文件，4%访问3个小文件 9.791248 sec.并且90%几率发生在前,所以如果随机发生多个文件的随机访问,那么可能速度会更慢.这里慢的原因,是因为fopen.

这也就是数据库为什么只有一个数据文件的原因.有时进行文件切分,反而不利于效率的提高.但是对于100G以上的大文件,是否这个结论又要改变了呢?我没有做过这个实验.

帐前卒

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
在ubuntu下生成大文件,并测试文件读写效率

在ubuntu或者其他的linux下生成大文件,必须使用编译参数-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64.否则生成的文件一定是2G左右的,原因是fseek,这个函数中的offset参数是一个long型的或者是size_t型的,在fseeko中是off_t,但是这个off_t默认还是32位,只有加了-D_FILE_OFFSET_BITS=64的
复制链接

扫一扫

专栏目录