对10^7数据量的磁盘文件排序

<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);">针对10^7数据量的磁盘文件排序这个问题进行分析。</span>
<span style="font-family:Arial, Helvetica, sans-serif;"><span style="background-color: rgb(255, 255, 255);">//生成实验所需要的不重复的10^7数据</span></span>
<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);"></span><pre name="code" class="cpp">void dataFile()
{
	int n;
	FILE *fp = fopen("data.txt","w");
	assert(fp);

	for(n=1;n<=size;n++)
	{
		num[n] = n;
	}
	srand((unsigned)time(NULL));
	int i,j;
	for(n=0;n<size;n++)
	{
		i = (rand()*RAND_MAX+rand())%10000000;
		j = (rand()*RAND_MAX+rand())%10000000;
		swap(num[i],num[j]);
	}

	for(n=0;n<size;n++)
	{
		fprintf(fp,"%d ",num[n]);
	}
	fclose(fp);
}

 
<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);">
</span>


1:如果内存足够大(不现实),把所有数据一次性读入内存,然后使用快速排序进行内排序。时间复杂度O(nlogn);由于全部数据都要驻留内存,空间复杂度为O(n)(快速排序本身只需O(logn)的递归栈)。


2:基于位图法。如果数据量过大,无法一次性读入内存,就分几次扫描文件,每次只处理一个数值区间。本方法要求数据不重复(位图无法记录重复次数),时间复杂度O(N),空间复杂度O(N)(每个数占1位)。

/**
 * Sorts the distinct non-negative integers below 10,000,000 stored in
 * "data.txt" into "sort.txt" using a bitmap.
 *
 * The value range is processed in windows of max_each_scan values. For each
 * window the input file is scanned once, the bit of every value that falls
 * inside the window is set, and the set bits are then written out in
 * increasing order. Values outside [0, 10000000) are ignored, and duplicate
 * values collapse to a single output value.
 *
 * Prints the elapsed CPU time in whole seconds on success. On an I/O error
 * the problem is reported, both files are closed and the function returns.
 *
 * NOTE(review): bit_map is a local object of max_each_scan bits; if
 * max_each_scan is large this may be a lot of stack -- confirm it fits.
 */
void bitSort()
{
	const int total_numbers = 10000000;
	const int window = static_cast<int>(max_each_scan);
	const clock_t begin = clock();

	FILE *fp_unsort_file = fopen("data.txt","r");
	if(fp_unsort_file == NULL)
	{
		perror("data.txt");
		return;
	}

	FILE *fp_sort_file = fopen("sort.txt","w");
	if(fp_sort_file == NULL)
	{
		perror("sort.txt");
		fclose(fp_unsort_file);
		return;
	}

	bitset<max_each_scan> bit_map;

	for(int base=0;base<total_numbers;base+=window)
	{
		// Every window after the first re-reads the input from the start.
		if(base != 0 && fseek(fp_unsort_file,0,SEEK_SET) != 0)
		{
			cout << "fseek failed"<<endl;
			fclose(fp_unsort_file);
			fclose(fp_sort_file);
			return;
		}

		bit_map.reset();

		// "== 1" rather than "!= EOF": a non-numeric token makes fscanf return 0
		// without consuming input, which would otherwise loop forever.
		int value;
		while(fscanf(fp_unsort_file,"%d",&value) == 1)
		{
			// The lower bound also rejects negative values, which would be out
			// of range for bitset::set.
			if(value >= base && value < total_numbers && value-base < window)
				bit_map.set(value-base);
		}

		for(int i=0;i<window;i++)
		{
			if(bit_map[i])
				fprintf(fp_sort_file,"%d ",base+i);
		}
	}

	// CLOCKS_PER_SEC is the standard divisor for clock() results.
	const clock_t end = clock();
	cout << "用位图方法耗时:"<<endl;
	cout << (end-begin)/CLOCKS_PER_SEC<<"s"<<endl;
	fclose(fp_unsort_file);
	fclose(fp_sort_file);
}

3:k路归并排序。常见的二路归并排序是k路归并排序在k=2时的特例。

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

#ifndef EXTERN_SORT_H
#define EXTERN_SORT_H

#include <cassert>

// Legacy sentinel keys and array aliases. ExternSort tracks exhausted runs
// with explicit flags and does not read these; they stay defined so that
// other code including this header keeps compiling.
#define MIN -1
#define MAX 10000000
typedef int* LoserTree;
typedef int* External;

/**
 * External sort of a text file of whitespace-separated ints.
 *
 * Phase 1 (memory_sort) reads the input in chunks of `count` integers, sorts
 * each chunk in memory and writes it to a run file named "temp<i>.txt" in the
 * current directory. Phase 2 merges all run files into the output file,
 * either with a loser tree (sort_tree) or with a linear scan for the minimum
 * (sort). The run files are left on disk afterwards.
 *
 * I/O failures are reported with perror() and end the current phase; no
 * exception is thrown for them.
 */
class ExternSort
{
public :
	/** Sorts the input file into the output file using the loser-tree k-way
	 *  merge, then prints the elapsed CPU time in whole seconds. */
	void sort_tree()
	{
		const clock_t begin = clock();
		// Split the input into sorted runs; the number of runs is the merge order.
		k = memory_sort();
		k_merge();
		const clock_t end = clock();
		// CLOCKS_PER_SEC is the standard divisor for clock() results.
		std::cout << "total time:"<<(end-begin)/CLOCKS_PER_SEC<<"s"<<std::endl;
	}

	/** Sorts the input file into the output file using the simple
	 *  scan-for-minimum merge, then prints the elapsed CPU time in whole seconds. */
	void sort()
	{
		const clock_t begin = clock();
		const int file_count = memory_sort();
		merge_sort(file_count);
		const clock_t end = clock();
		std::cout << "total time:"<<(end-begin)/CLOCKS_PER_SEC<<"s"<<std::endl;
	}

	/**
	 * @param input_file path of the unsorted text file (must not be NULL)
	 * @param out_file   path of the sorted file to create (must not be NULL)
	 * @param count      how many integers are sorted in memory per run
	 */
	ExternSort(const char *input_file,const char *out_file,int count)
		: m_count(count), m_in_file(input_file), m_out_file(out_file), k(0)
	{
	}

	virtual ~ExternSort()
	{
	}

private:
	int m_count;                  // integers per in-memory run
	std::string m_in_file;        // owned copies of the paths, so copying the object is safe
	std::string m_out_file;
	int k;                        // merge order (number of runs) for the loser tree
	std::vector<int> ls;          // loser tree: ls[0] is the winner, ls[1..k-1] hold losers
	std::vector<int> b;           // current head value of each run
	std::vector<char> exhausted;  // exhausted[i] != 0 once run i has no more values

	// Opens run file `index` with the given mode; reports and returns NULL on failure.
	FILE *open_temp_file(int index,const char *mode)
	{
		char *name = temp_filename(index);
		FILE *f = fopen(name,mode);
		if(f == NULL)
			perror(name);
		delete[] name;  // temp_filename allocates with new[]
		return f;
	}

	// Closes every still-open run file in `runs`.
	static void close_runs(std::vector<FILE*> &runs)
	{
		for(std::size_t i=0;i<runs.size();++i)
		{
			if(runs[i] != NULL)
			{
				fclose(runs[i]);
				runs[i] = NULL;
			}
		}
	}

	// Opens run files 0..count-1 for reading; on any failure closes the ones
	// already opened and returns false.
	bool open_runs(int count,std::vector<FILE*> &runs)
	{
		runs.assign(static_cast<std::size_t>(count),static_cast<FILE*>(NULL));
		for(int i=0;i<count;++i)
		{
			runs[i] = open_temp_file(i,"rt");
			if(runs[i] == NULL)
			{
				close_runs(runs);
				return false;
			}
		}
		return true;
	}

	// True when leaf x loses to leaf y. Index k is the placeholder used while
	// building the tree and beats every real leaf; an exhausted run loses to
	// every run that still has data. Real values are never compared with a
	// sentinel, so any int may appear in the data.
	bool loses(int x,int y) const
	{
		if(x == k)
			return false;
		if(y == k)
			return true;
		if(exhausted[x])
			return !exhausted[y];
		if(exhausted[y])
			return false;
		return b[x] > b[y];
	}

protected:
	/**
	 * Reads up to n integers from f into a.
	 * Stops at end of file or at the first token that is not an integer.
	 * @return the number of integers stored
	 */
	int read_data(FILE *f,int a[],int n)
	{
		int i = 0;
		while(i<n && fscanf(f,"%d ",&a[i]) == 1)
			i++;
		std::cout << "read:" << i << "integer" << std::endl;
		return i;
	}

	/** Writes the first n integers of a to f, each followed by a space. */
	void write_data(FILE *f,int a[],int n)
	{
		for(int i=0;i<n;++i)
			fprintf(f,"%d ",a[i]);
	}

	/**
	 * Builds the name of run file `index` ("temp<index>.txt").
	 * @return a buffer allocated with new[]; the caller must release it with delete[]
	 */
	char* temp_filename(int index)
	{
		char *tempfile = new char[100];
		sprintf(tempfile,"temp%d.txt",index);
		return tempfile;
	}

	/** qsort comparator for ints; compares instead of subtracting so that it
	 *  cannot overflow. */
	static int cmp_int(const void *a,const void *b)
	{
		const int x = *static_cast<const int*>(a);
		const int y = *static_cast<const int*>(b);
		return (x > y) - (x < y);
	}

	/**
	 * Splits the input file into sorted run files of at most m_count integers.
	 * @return the number of run files written; 0 if m_count is not positive,
	 *         the input cannot be opened or a run file cannot be created
	 */
	int memory_sort()
	{
		if(m_count <= 0)
			return 0;

		FILE *fin = fopen(m_in_file.c_str(),"rt");
		if(fin == NULL)
		{
			perror(m_in_file.c_str());
			return 0;
		}

		std::vector<int> chunk(static_cast<std::size_t>(m_count));
		int file_count = 0;
		int n = 0;

		// Read m_count integers at a time, sort them, write them to the next run file.
		while((n = read_data(fin,&chunk[0],m_count)) > 0)
		{
			qsort(&chunk[0],static_cast<std::size_t>(n),sizeof(int),cmp_int);
			FILE *tempFile = open_temp_file(file_count,"w");
			if(tempFile == NULL)
			{
				// Report no runs so that no truncated output gets merged.
				file_count = 0;
				break;
			}
			write_data(tempFile,&chunk[0],n);
			fclose(tempFile);
			++file_count;
		}
		fclose(fin);
		return file_count;
	}

	/** Builds the loser tree over the k run heads currently held in b.
	 *  Requires ls, b and exhausted to hold k entries (k_merge sets this up). */
	void CreateLoserTree()
	{
		for(int i=0;i<k;i++)
			ls[i] = k;  // every node starts at the placeholder that always wins

		for(int i=k-1;i>=0;i--)
			Adjust(i);
	}

	/** Replays the path from leaf s up to the root, leaving each loser in its
	 *  node and storing the overall winner in ls[0]. */
	void Adjust(int s)
	{
		int t = (s+k)/2;  // ls[t] is the parent node of leaf s
		while(t>0)
		{
			if(loses(s,ls[t]))
			{
				// s stays here as the loser; the previous occupant moves up.
				const int winner = ls[t];
				ls[t] = s;
				s = winner;
			}
			t = t/2;
		}
		ls[0] = s;
	}

	/** Merges run files 0..k-1 into the output file using the loser tree.
	 *  Does nothing when k <= 0. */
	void k_merge()
	{
		if(k <= 0)
			return;

		std::vector<FILE*> runs;
		if(!open_runs(k,runs))
			return;

		FILE *fout = fopen(m_out_file.c_str(),"w");
		if(fout == NULL)
		{
			perror(m_out_file.c_str());
			close_runs(runs);
			return;
		}

		const std::size_t ways = static_cast<std::size_t>(k);
		ls.assign(ways,0);
		b.assign(ways,0);
		exhausted.assign(ways,0);

		// Load the first value of every run; a run without one is simply exhausted.
		for(int i=0;i<k;++i)
		{
			if(fscanf(runs[i],"%d ",&b[i]) != 1)
				exhausted[i] = 1;
		}

		CreateLoserTree();

		// The winner is exhausted only once every run is exhausted.
		while(!exhausted[ls[0]])
		{
			const int q = ls[0];  // run currently holding the smallest value
			fprintf(fout,"%d ",b[q]);
			if(fscanf(runs[q],"%d ",&b[q]) != 1)
				exhausted[q] = 1;
			Adjust(q);
		}

		close_runs(runs);
		fclose(fout);
	}

	/** Merges run files 0..file_count-1 into the output file by scanning all
	 *  run heads for the minimum at every step. Does nothing when
	 *  file_count <= 0. */
	void merge_sort(int file_count)
	{
		if(file_count <= 0)
			return;

		std::vector<FILE*> runs;
		if(!open_runs(file_count,runs))
			return;

		FILE *fout = fopen(m_out_file.c_str(),"wt");
		if(fout == NULL)
		{
			perror(m_out_file.c_str());
			close_runs(runs);
			return;
		}

		std::vector<int> data(static_cast<std::size_t>(file_count),0);      // head value of each run
		std::vector<char> hasNext(static_cast<std::size_t>(file_count),1);  // 0 once a run is used up

		for(int i=0;i<file_count;++i)
		{
			if(fscanf(runs[i],"%d",&data[i]) != 1)
				hasNext[i] = 0;
		}

		for(;;)
		{
			// Pick the smallest head among the runs that still have data; only
			// live heads are considered, so a stale value is never emitted.
			int j = -1;
			for(int i=0;i<file_count;++i)
			{
				if(hasNext[i] && (j < 0 || data[i] < data[j]))
					j = i;
			}
			if(j < 0)
				break;  // every run is used up

			fprintf(fout,"%d ",data[j]);
			if(fscanf(runs[j],"%d",&data[j]) != 1)
				hasNext[j] = 0;
		}

		close_runs(runs);
		fclose(fout);
	}
};

#endif


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值