如何给10^7个数据量的磁盘文件排序

//purpose:  生成随机的不重复的测试数据  
//1000w数据量,要保证生成的数据不重复,一般的程序没有做到。但,本程序做到了。  
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <bitset>
#include <iostream>
#include <random>
// Word-oriented bit-vector parameters.
// NOTE(review): BITSPERWORD, SHIFT and MASK are not referenced anywhere in the
// visible part of this file — confirm before removing.
#define BITSPERWORD 32
#define SHIFT 5
#define MASK 0x1F
// Number of integers generated and sorted (10^7); also the length of the global arrays below.
#define N_SIZE	10000000 
// Divisor applied to clock() differences before they are printed as "ms".
// NOTE(review): this yields milliseconds only when CLOCKS_PER_SEC is 1,000,000 — confirm for the target platform.
const unsigned int CLK_TCK = 1000;
using namespace std; 
// Work buffer that create_data_file() fills, shuffles and writes out.
int random_num[N_SIZE] = {0};  
// Number of values covered by one bitmap pass (half of N_SIZE).
const int max_each_scan = N_SIZE/2;
// Byte-array bitmap used by the bitmap_* helpers; allocated by bitmap_init().
unsigned char *g_bitmap = NULL;    
// Size of g_bitmap in bytes.
static int g_size = 0;    
// Offset subtracted from values in bitmap_set() and added back in bitmap_data().
static int g_base = 0; 
// Destination array of my_qsort_num().
int qsort_num[N_SIZE] = {0};
// Destination array of my_sort_num() (std::bitset version).
int sort_num[N_SIZE] = {0};
// Destination array of my_sort_num222() (hand-written bitmap version).
int sort2_num[N_SIZE] = {0};

// Exchanges the two ints that a and b point to, using a temporary copy
// (the first and most commonly used way of swapping).
void swap(int* a, int* b)
{
	const int saved = *b;
	*b = *a;
	*a = saved;
}
// Writes the integers 1..N_SIZE to "data.txt" in random order, separated by
// single spaces.
//
// The values are first laid out in order in random_num and then permuted, so
// the output cannot contain duplicates. The permutation is a Fisher-Yates
// shuffle: position i is swapped with a position drawn uniformly from [0, i]
// (inclusive). The draw comes from std::mt19937 rather than rand(), because
// rand() is only guaranteed to reach 32767 and therefore cannot address every
// index of a 10^7-element array.
//
// On failure to open or write the file an error is reported on stderr and the
// function returns; a plain assert() would be compiled out under NDEBUG.
void create_data_file()
{
    FILE *fp = fopen("data.txt", "w");
    if (fp == NULL)
    {
        perror("data.txt");
        return;
    }

    // Start at 1 (not 0) so that the value 0 never appears in the test data.
    for (int n = 0; n < N_SIZE; n++)
        random_num[n] = n + 1;

    printf("random_num init ok\n");

    std::mt19937 engine((unsigned)time(NULL));
    for (int i = N_SIZE - 1; i >= 1; --i)
    {
        std::uniform_int_distribution<int> pick(0, i);
        swap(&random_num[i], &random_num[pick(engine)]);
    }

    for (int n = 0; n < N_SIZE; n++)
        fprintf(fp, "%d ", random_num[n]);

    // Buffered write errors may only surface here, so check both.
    const bool write_failed = ferror(fp) != 0;
    if (fclose(fp) != 0 || write_failed)
        fprintf(stderr, "data.txt: write failed\n");
}

  
// Allocates a zero-filled bitmap of size/8 + 1 bytes (room for at least `size`
// bits) and records `start` as the base offset used by bitmap_set() and
// bitmap_data().
//
// A bitmap left over from an earlier call is released first, so calling this
// function repeatedly does not leak.
//
// Returns 1 on success. Returns 0 if `size` is negative or the allocation
// fails; in that case the previous bitmap state is left untouched.
int bitmap_init(int size, int start)
{
    if (size < 0)
        return 0;

    const int bytes = size / 8 + 1;
    unsigned char *fresh = (unsigned char *)calloc((size_t)bytes, sizeof(unsigned char));
    if (fresh == NULL)
        return 0;

    free(g_bitmap);   // free(NULL) is a no-op, so this is safe on the first call
    g_bitmap = fresh;
    g_base = start;
    g_size = bytes;
    return 1;
}
// Resets every bit of the current bitmap to 0.
// Does nothing when no bitmap is allocated or its recorded size is 0.
void bitmap_clr()
{
	if (g_bitmap == NULL || g_size == 0)
		return;

	memset(g_bitmap, 0x0, g_size);
}
// Marks the value `index` as present by setting bit (index - g_base).
//
// Returns 1 when the bit was set. Returns 0, without touching memory, when no
// bitmap is allocated, when `index` is below the base offset, or when the bit
// lies beyond the last byte of the bitmap.
int bitmap_set(int index)
{
    if (g_bitmap == NULL || index < g_base)
        return 0;

    const int offset = index - g_base;
    const int quo = offset / 8;
    // Valid bytes are 0 .. g_size-1, so byte g_size itself is already out of range.
    if (quo >= g_size)
        return 0;

    g_bitmap[quo] |= (unsigned char)(0x1u << (offset % 8));
    return 1;
}
    
// Reads bit `i` of the bitmap. `i` is a raw bit position; unlike bitmap_set()
// no base offset is subtracted here (bitmap_data() converts a position back to
// a value).
//
// Returns 1 if the bit is set and 0 if it is clear. Returns -1 when no bitmap
// is allocated, when `i` is negative, or when the bit lies beyond the last
// byte of the bitmap.
int bitmap_get(int i)
{
    if (g_bitmap == NULL || i < 0)
        return -1;

    const int quo = i / 8;
    // Valid bytes are 0 .. g_size-1, so byte g_size itself is already out of range.
    if (quo >= g_size)
        return -1;

    const unsigned char mask = (unsigned char)(0x1u << (i % 8));
    return (g_bitmap[quo] & mask) != 0 ? 1 : 0;
}
    
// Converts a bit position back into the value it stands for by adding the
// base offset recorded in g_base.
int bitmap_data(int index)
{
    const int value = g_base + index;
    return value;
}
    
// Releases the bitmap and resets the bookkeeping globals (pointer, size and
// base offset) to their initial values. Safe to call when no bitmap is
// allocated.
//
// Always returns 1. The function is declared to return int, so it must return
// a value: flowing off the end of a non-void function is undefined behaviour
// in C++.
int bitmap_free()
{
    free(g_bitmap);
    g_bitmap = NULL;
    g_size = 0;
    g_base = 0;
    return 1;
}

void my_sort_num222()
{
	FILE *fp_unsort_file = NULL;
	int i;
	int num; 
	clock_t begin = clock(); 
	bitmap_init(max_each_scan,0);
	
	fp_unsort_file = fopen("data.txt", "r");  
    assert(fp_unsort_file);  
     
    // the first time scan to sort the data between 0 - 4999999  
    while (fscanf(fp_unsort_file, "%d ", &num) != EOF)  
    {  
        if (num < max_each_scan)  
            bitmap_set(num); 
    } 
      
    // write the sorted data into file  
    for (i = 0; i < max_each_scan; i++)  
    {  
        if (bitmap_get(i) == 1)   
			sort2_num[i] = i;
    }  
      
    // the second time scan to sort the data between 5000000 - 9999999  
    int result = fseek(fp_unsort_file, 0, SEEK_SET);  
    if (result)  
        printf("fseek failed!\n");  
    else  
    {  
        bitmap_clr();
        while (fscanf(fp_unsort_file, "%d ", &num) != EOF)  
        {  
            if (num >= max_each_scan && num < 10000000)  
            {  
                num -= max_each_scan;  
                bitmap_set(num); 
            }  
        }  
        for (i = 0; i < max_each_scan; i++)  
        {  
            if (bitmap_get(i) == 1)  
				sort2_num[i + max_each_scan] = i + max_each_scan;
        }  
    }
    fclose(fp_unsort_file);  
	clock_t end = clock(); 
	printf("sort2 consum %d ms\n",(end - begin) / CLK_TCK);;
}

void create_sort_file222()
{
	FILE *fp_sort_file = NULL;
	FILE *fp_unsort_file = NULL;
	int i;
	int num; 
	bitmap_init(max_each_scan,0);
	
	fp_unsort_file = fopen("data.txt", "r");  
    assert(fp_unsort_file);  
     
    // the first time scan to sort the data between 0 - 4999999  
    while (fscanf(fp_unsort_file, "%d ", &num) != EOF)  
    {  
        if (num < max_each_scan)  
            bitmap_set(num); 
    } 

	fp_sort_file = fopen("sort2.txt", "w");  
    assert(fp_sort_file);  
      
      
    // write the sorted data into file  
    for (i = 0; i < max_each_scan; i++)  
    {  
        if (bitmap_get(i) == 1)  
            fprintf(fp_sort_file, "%d ", i); 
    }  
      
    // the second time scan to sort the data between 5000000 - 9999999  
    int result = fseek(fp_unsort_file, 0, SEEK_SET);  
    if (result)  
        printf("fseek failed!\n");  
    else  
    {  
        bitmap_clr();
        while (fscanf(fp_unsort_file, "%d ", &num) != EOF)  
        {  
            if (num >= max_each_scan && num < 10000000)  
            {  
                num -= max_each_scan;  
                bitmap_set(num); 
            }  
        }  
        for (i = 0; i < max_each_scan; i++)  
        {  
            if (bitmap_get(i) == 1)  
                fprintf(fp_sort_file, "%d ", i + max_each_scan); 
        }  
    }
	fclose(fp_sort_file);  
    fclose(fp_unsort_file);  
}

void my_sort_num()
{
	FILE *fp_unsort_file = NULL;
	int i;
	int num; 
	clock_t begin = clock(); 
	bitset<max_each_scan> bit_map;  
    bit_map.reset(); 
	
	fp_unsort_file = fopen("data.txt", "r");  
    assert(fp_unsort_file);  
     

    // the first time scan to sort the data between 0 - 4999999  
    while (fscanf(fp_unsort_file, "%d ", &num) != EOF)  
    {  
        if (num < max_each_scan)  
			bit_map.set(num, 1);
    }    
      
    // write the sorted data into file  
    for (i = 0; i < max_each_scan; i++)  
	{
		if (bit_map[i] == 1)
			sort_num[i] = i;
    }  
      
    // the second time scan to sort the data between 5000000 - 9999999  
    int result = fseek(fp_unsort_file, 0, SEEK_SET);  
    if (result)  
        printf("fseek failed!\n");  
    else  
    {  
        bit_map.reset(); 
        while (fscanf(fp_unsort_file, "%d ", &num) != EOF)  
        {  
            if (num >= max_each_scan && num < 10000000)  
            {  
                num -= max_each_scan;  
				bit_map.set(num, 1);
            }  
        }  
        for (i = 0; i < max_each_scan; i++)  
        {  
            if (bit_map[i] == 1) 
                sort_num[i + max_each_scan] = i + max_each_scan;  
        }  
    }
    fclose(fp_unsort_file);  
	clock_t end = clock(); 
	printf("sort1 consum %d ms\n",(end - begin) / CLK_TCK);;
}

void create_sort_file()
{
	FILE *fp_sort_file = NULL;
	FILE *fp_unsort_file = NULL;
	int i;
	int num; 
	bitset<max_each_scan> bit_map;  
    bit_map.reset(); 
	
	fp_unsort_file = fopen("data.txt", "r");  
    assert(fp_unsort_file);  
     

    // the first time scan to sort the data between 0 - 4999999  
    while (fscanf(fp_unsort_file, "%d ", &num) != EOF)  
    {  
        if (num < max_each_scan)  
			bit_map.set(num, 1);
    } 

	fp_sort_file = fopen("sort.txt", "w");  
    assert(fp_sort_file);  
      
      
    // write the sorted data into file  
    for (i = 0; i < max_each_scan; i++)  
	{
		if (bit_map[i] == 1)  
            fprintf(fp_sort_file, "%d ", i); 
    }  
      
    // the second time scan to sort the data between 5000000 - 9999999  
    int result = fseek(fp_unsort_file, 0, SEEK_SET);  
    if (result)  
        printf("fseek failed!\n");  
    else  
    {  
        bit_map.reset(); 
        while (fscanf(fp_unsort_file, "%d ", &num) != EOF)  
        {  
            if (num >= max_each_scan && num < 10000000)  
            {  
                num -= max_each_scan;  
				bit_map.set(num, 1);
            }  
        }  
        for (i = 0; i < max_each_scan; i++)  
        {  
            if (bit_map[i] == 1) 
                fprintf(fp_sort_file, "%d ", i + max_each_scan);  
        }  
    }
	fclose(fp_sort_file);  
    fclose(fp_unsort_file);  
}

// Three-way comparison of the ints that x and y point to.
// Returns 1 if *x > *y, 0 if they are equal, and -1 if *x < *y.
//
// The result is built from two comparisons instead of the subtraction *x - *y,
// because the subtraction overflows (undefined behaviour, and in practice the
// wrong sign) when the operands are far apart, e.g. INT_MIN and INT_MAX.
int intcomp(int *x,int *y){
    return (*x > *y) - (*x < *y);
}

// qsort()-compatible comparator for int elements.
// x and y must point to ints. Returns 1 if the first is greater, 0 if they are
// equal, and -1 if the first is smaller.
//
// The result is built from two comparisons instead of a subtraction, because
// the subtraction overflows (undefined behaviour, and in practice the wrong
// sign) when the operands are far apart, e.g. INT_MIN and INT_MAX.
int myintcomp(const void *x, const void *y)
{
	const int lhs = *static_cast<const int *>(x);
	const int rhs = *static_cast<const int *>(y);
	return (lhs > rhs) - (lhs < rhs);
}
// Times the baseline approach: reads the integers of "data.txt" into
// qsort_num, sorts them with qsort(), and prints the elapsed CPU time in
// milliseconds.
//
// At most N_SIZE values are read (the capacity of qsort_num), and only the
// values actually read are sorted, so a short file does not mix left-over
// array contents into the result. Elements past the number read are left
// unchanged.
void my_qsort_num()
{
	const clock_t begin = clock();

	FILE *fp_unsort_file = fopen("data.txt", "r");
	if (fp_unsort_file == NULL)
	{
		perror("data.txt");
		return;
	}

	int count = 0;
	int num;
	// Comparing against 1 (not EOF) also ends the loop on non-numeric input,
	// where fscanf keeps returning 0 without consuming anything.
	while (count < N_SIZE && fscanf(fp_unsort_file, "%d ", &num) == 1)
	{
		qsort_num[count++] = num;
	}

	qsort(qsort_num, (size_t)count, sizeof(int), myintcomp);
	fclose(fp_unsort_file);

	const clock_t end = clock();
	// clock() counts in units of 1/CLOCKS_PER_SEC seconds; convert explicitly
	// and print with a format that matches the argument type.
	const long elapsed_ms = (long)((double)(end - begin) * 1000.0 / CLOCKS_PER_SEC);
	printf("qsort consum %ld ms\n", elapsed_ms);
}
// Entry point: prints the result of both comparators for a greater, an equal
// and a smaller right-hand operand, then runs the three timed sorts on
// "data.txt".
int main()
{
	// Data generation and the file-writing sorts are left disabled here.
	//create_data_file();
	//create_sort_file();
	//create_sort_file222();

	int x = 5, y = 4, z = 5, q = 6;
	int *const rhs[] = {&y, &z, &q};
	const int rhs_count = sizeof(rhs) / sizeof(rhs[0]);

	// qsort-style comparator first, then the typed one, same operands each time.
	for (int k = 0; k < rhs_count; ++k)
		printf("intcomp=%d\n", myintcomp((const void *)&x, (const void *)rhs[k]));
	for (int k = 0; k < rhs_count; ++k)
		printf("intcomp=%d\n", intcomp(&x, rhs[k]));

	my_sort_num222();
	my_qsort_num();
	my_sort_num();

	return 0;
}

第一节、如何给磁盘文件排序
问题描述:
输入:一个最多含有n个不重复的正整数(也就是说可能含有少于n个不重复正整数)的文件,其中每个数都小于等于n,且n=10^7。
输出:得到按从小到大升序排列的包含所有输入的整数的列表。
条件:最多有大约1MB的内存空间可用,但磁盘空间足够。且要求运行时间在5分钟以下,10秒为最佳结果。

分析:下面咱们来一步一步的解决这个问题,
    1、归并排序。你可能会想到把磁盘文件进行归并排序,但题目要求你只有1MB的内存空间可用,所以,归并排序这个方法不行。
    2、位图方案。熟悉位图的朋友可能会想到用位图来表示这个文件集合。例如,正如《编程珠玑》一书上所述,用一个20位长的字符串来表示一个所有元素都小于20的简单的非负整数集合,可以用如下字符串来表示集合{1,2,3,5,8,13}:

0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0

上述集合中各数对应的位置则置1,没有对应的数的位置则置0。

    参考编程珠玑一书上的位图方案,针对我们的10^7个数据量的磁盘文件排序问题,我们可以这么考虑,由于每个7位十进制整数表示一个小于1000万的整数。我们可以使用一个具有1000万个位的字符串来表示这个文件,其中,当且仅当整数i在文件中存在时,第i位为1。采取这个位图的方案是因为我们面对的这个问题的特殊性:1、输入数据限制在相对较小的范围内,2、数据没有重复,3、其中的每条记录都是单一的整数,没有任何其它与之关联的数据。
    所以,此问题用位图的方案分为以下三步进行解决:

  • 第一步,将所有的位都置为0,从而将集合初始化为空。
  • 第二步,通过读入文件中的每个整数来建立集合,将每个对应的位都置为1。
  • 第三步,检验每一位,如果该位为1,就输出对应的整数。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值