<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);">针对10^7数据量的磁盘文件排序这个问题,下面讨论几种解决方法。</span>
<span style="font-family:Arial, Helvetica, sans-serif;"><span style="background-color: rgb(255, 255, 255);">//生成实验所需要的不重复的10^7数据</span></span>
<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);"></span><pre name="code" class="cpp">void dataFile()
{
    // Writes a random permutation of the distinct values 0 .. size-1 to
    // "data.txt", separated by single spaces.
    // NOTE(review): `num` and `size` are declared outside this snippet; this
    // assumes `num` holds at least `size` ints -- confirm against the declaration.
    FILE *fp = fopen("data.txt", "w");
    if (fp == NULL)
    {
        fprintf(stderr, "cannot open data.txt for writing\n");
        return;
    }

    // Fill exactly the range that is written out below (indices 0 .. size-1).
    for (int n = 0; n < size; n++)
    {
        num[n] = n;
    }

    srand((unsigned)time(NULL));

    // Forward Fisher-Yates shuffle: element n is swapped with a random element
    // in [0, n]. Two rand() calls are combined in 64-bit unsigned arithmetic
    // because a single rand() may only yield 15 bits, and doing the
    // multiplication in int would overflow on platforms where RAND_MAX is
    // INT_MAX.
    for (int n = 1; n < size; n++)
    {
        const unsigned long long r =
            (unsigned long long)rand() * ((unsigned long long)RAND_MAX + 1ULL)
            + (unsigned long long)rand();
        const int j = (int)(r % (unsigned long long)(n + 1));
        std::swap(num[n], num[j]);
    }

    for (int n = 0; n < size; n++)
    {
        fprintf(fp, "%d ", num[n]);
    }
    fclose(fp);
}
<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);">
</span>
1:如果内存足够大(不现实),把所有数据读入内存,然后使用快速排序进行内排序,时间复杂度O(nlogn);由于全部数据都要驻留内存,空间复杂度为O(n)(快速排序本身的递归栈为O(logn))
2:基于位图法。如果数据量过大,无法一次性读入内存,就分几次进行处理。本方法的时间复杂度O(N),空间复杂度O(N)
void bitSort()
{
clock_t begin = clock();
bitset<max_each_scan> bit_map;
bit_map.reset();
FILE *fp_unsort_file = fopen("data.txt","r");
assert(fp_unsort_file);
int num;
while(fscanf(fp_unsort_file,"%d",&num)!=EOF)
{
if(num<max_each_scan)
bit_map.set(num);
}
FILE *fp_sort_file = fopen("sort.txt","w");
assert(fp_sort_file);
int i;
for(i=0;i<max_each_scan;i++)
{
if(bit_map[i] == 1)
fprintf(fp_sort_file,"%d ",i);
}
int result = fseek(fp_unsort_file,0,SEEK_SET);
if(result)
cout << "fseek failed"<<endl;
else
{
bit_map.reset();
while(fscanf(fp_unsort_file,"%d ",&num)!=EOF)
{
if(num >= max_each_scan && num < 10000000)
{
num -= max_each_scan;
bit_map.set(num);
}
}
for(i=0;i<max_each_scan;i++)
{
if(bit_map[i] == 1)
{
fprintf(fp_sort_file,"%d ",i+max_each_scan);
}
}
clock_t end = clock();
cout << "用位图方法耗时:"<<endl;
cout << (end-begin)/CLK_TCK<<"s"<<endl;
fclose(fp_unsort_file);
fclose(fp_sort_file);
}
}
3:k路归并排序。普通的(二路)归并排序是k路归并排序在k=2时的一种特殊情况。
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
using namespace std;
#ifndef EXTERN_SORT_H
#define EXTERN_SORT_H
#include <cassert>
// Legacy sentinel macros. They are kept so that existing code referring to
// them still compiles; ExternSort itself uses INT_MIN / INT_MAX as sentinels
// so that values such as -1 or 10000000 can appear in the data.
#define MIN -1
#define MAX 10000000
// Pointer aliases used for the dynamically allocated loser-tree arrays.
typedef int* LoserTree;
typedef int* External;

/**
 * External sort of whitespace-separated decimal integers stored in a text file.
 *
 * The input is split into chunks of at most `count` integers. Each chunk is
 * sorted in memory and written to a temporary run file named "temp<i>.txt" in
 * the current directory (the run files are not removed afterwards). The runs
 * are then merged into the output file, either with a loser tree
 * (sort_tree) or with a linear scan for the minimum (sort).
 *
 * Input values must lie strictly between INT_MIN and INT_MAX: both limits are
 * used as sentinels by the loser-tree merge.
 *
 * The class owns raw buffers and is therefore non-copyable.
 */
class ExternSort
{
public:
    /**
     * @param input_file path of the unsorted input file (copied).
     * @param out_file   path of the sorted output file (copied).
     * @param count      maximum number of integers sorted in memory at once.
     */
    ExternSort(const char *input_file, const char *out_file, int count)
        : m_count(count), m_in_file(NULL), m_out_file(NULL), k(0), ls(NULL), b(NULL)
    {
        m_in_file = new char[std::strlen(input_file) + 1];
        std::strcpy(m_in_file, input_file);
        m_out_file = new char[std::strlen(out_file) + 1];
        std::strcpy(m_out_file, out_file);
    }

    virtual ~ExternSort()
    {
        delete[] m_in_file;
        delete[] m_out_file;
    }

    /** Sorts the input using a loser-tree k-way merge and prints the CPU time used. */
    void sort_tree()
    {
        const std::clock_t begin = std::clock();
        // Split the input into sorted run files; the number of runs is the merge width.
        k = memory_sort();
        ls = new int[k];
        b = new int[k + 1];   // one extra slot holds the "smaller than everything" sentinel
        k_merge();
        delete[] ls;
        delete[] b;
        ls = NULL;
        b = NULL;
        report_elapsed(begin, std::clock());
    }

    /** Sorts the input using a linear-scan k-way merge and prints the CPU time used. */
    void sort()
    {
        const std::clock_t begin = std::clock();
        // Split the input into sorted run files, then merge them into the output file.
        const int file_count = memory_sort();
        merge_sort(file_count);
        report_elapsed(begin, std::clock());
    }

private:
    int m_count;        // maximum number of integers per in-memory chunk
    char *m_in_file;    // owned copy of the input path
    char *m_out_file;   // owned copy of the output path
    int k;              // number of runs being merged by the loser tree
    LoserTree ls;       // loser tree: ls[0] is the winner, ls[1..k-1] hold losers
    External b;         // current head value of each run; b[k] is the low sentinel

    // Copying would double-delete the owned buffers; declared but not defined.
    ExternSort(const ExternSort&);
    ExternSort& operator=(const ExternSort&);

    // Prints the elapsed CPU time in seconds (fractional) in the original format.
    static void report_elapsed(std::clock_t begin, std::clock_t end)
    {
        std::cout << "total time:"
                  << static_cast<double>(end - begin) / CLOCKS_PER_SEC
                  << "s" << std::endl;
    }

    // Opens run file number `index` with the given mode; returns NULL on failure.
    FILE *open_temp(int index, const char *mode)
    {
        char *name = temp_filename(index);
        FILE *f = std::fopen(name, mode);
        delete[] name;   // allocated with new[] in temp_filename
        return f;
    }

    // Closes the first `count` files of `runs` and releases the array.
    static void close_runs(FILE **runs, int count)
    {
        for (int i = 0; i < count; ++i)
            std::fclose(runs[i]);
        delete[] runs;
    }

    // Opens run files 0..count-1 for reading; returns NULL (nothing left open) on failure.
    FILE **open_runs(int count)
    {
        FILE **runs = new FILE*[count];
        for (int i = 0; i < count; ++i)
        {
            runs[i] = open_temp(i, "r");
            if (runs[i] == NULL)
            {
                std::cerr << "cannot open temporary run file " << i << std::endl;
                close_runs(runs, i);
                return NULL;
            }
        }
        return runs;
    }

protected:
    /**
     * Reads up to n integers from f into a and logs how many were read.
     * Stops at end of file or at the first token that is not an integer.
     * @return number of integers stored in a.
     */
    int read_data(FILE *f, int a[], int n)
    {
        int i = 0;
        while (i < n && std::fscanf(f, "%d ", &a[i]) == 1)
            ++i;
        std::cout << "read:" << i << "integer" << std::endl;
        return i;
    }

    /** Writes the first n integers of a to f, each followed by a space. */
    void write_data(FILE *f, int a[], int n)
    {
        for (int i = 0; i < n; ++i)
            std::fprintf(f, "%d ", a[i]);
    }

    /**
     * Builds the name of run file `index` ("temp<index>.txt").
     * @return a buffer allocated with new[]; the caller must delete[] it.
     */
    char *temp_filename(int index)
    {
        char *tempfile = new char[100];
        std::sprintf(tempfile, "temp%d.txt", index);
        return tempfile;
    }

    /** qsort comparator for ints; compares instead of subtracting to avoid overflow. */
    static int cmp_int(const void *a, const void *b)
    {
        const int lhs = *static_cast<const int*>(a);
        const int rhs = *static_cast<const int*>(b);
        return (lhs > rhs) - (lhs < rhs);
    }

    /**
     * Splits the input file into sorted run files of at most m_count integers.
     * @return the number of run files written (0 if the input cannot be
     *         opened, is empty, or m_count is not positive).
     */
    int memory_sort()
    {
        if (m_count <= 0)
            return 0;
        FILE *fin = std::fopen(m_in_file, "r");
        if (fin == NULL)
        {
            std::cerr << "cannot open input file " << m_in_file << std::endl;
            return 0;
        }

        int *array = new int[m_count];
        int file_count = 0;
        int n = 0;
        // Read m_count integers at a time, sort them in memory and write them to a run file.
        while ((n = read_data(fin, array, m_count)) > 0)
        {
            FILE *tempFile = open_temp(file_count, "w");
            if (tempFile == NULL)
            {
                std::cerr << "cannot create temporary run file " << file_count << std::endl;
                break;
            }
            std::qsort(array, n, sizeof(int), cmp_int);
            write_data(tempFile, array, n);
            std::fclose(tempFile);
            ++file_count;
        }
        delete[] array;
        std::fclose(fin);
        return file_count;
    }

    /** Builds the loser tree over the current run heads b[0..k-1]. */
    void CreateLoserTree()
    {
        b[k] = INT_MIN;               // extra slot: a value smaller than any real key
        for (int i = 0; i < k; i++)
            ls[i] = k;                // every node initially points at the sentinel
        for (int i = k - 1; i >= 0; i--)
            Adjust(i);                // replay each leaf up to the root
    }

    /** Replays leaf b[s] along its path to the root ls[0]. */
    void Adjust(int s)
    {
        int t = (s + k) / 2;          // ls[t] is the parent node of leaf b[s]
        while (t > 0)
        {
            if (b[s] > b[ls[t]])      // s loses here: it stays, the winner moves up
            {
                const int tmp = s;
                s = ls[t];
                ls[t] = tmp;
            }
            t = t / 2;
        }
        ls[0] = s;
    }

    /**
     * Merges run files 0..k-1 into the output file using the loser tree.
     * Requires ls and b to be allocated for k runs (done by sort_tree).
     */
    void k_merge()
    {
        if (k <= 0)
            return;
        FILE *fout = std::fopen(m_out_file, "w");
        if (fout == NULL)
        {
            std::cerr << "cannot open output file " << m_out_file << std::endl;
            return;
        }
        FILE **farray = open_runs(k);
        if (farray == NULL)
        {
            std::fclose(fout);
            return;
        }

        // Prime the tree with the first value of every run.
        bool primed = true;
        for (int i = 0; i < k; ++i)
        {
            if (std::fscanf(farray[i], "%d ", &b[i]) != 1)
            {
                std::printf("there is no %d file to merge!", i);
                primed = false;
                break;
            }
        }

        if (primed)
        {
            CreateLoserTree();
            // An exhausted run is marked with INT_MAX; once the winner is
            // INT_MAX every run is exhausted. The sentinel is never written.
            while (b[ls[0]] != INT_MAX)
            {
                const int q = ls[0];  // index of the run holding the current minimum
                std::fprintf(fout, "%d ", b[q]);
                if (std::fscanf(farray[q], "%d ", &b[q]) != 1)
                    b[q] = INT_MAX;
                Adjust(q);
            }
        }

        close_runs(farray, k);
        std::fclose(fout);
    }

    /**
     * Merges run files 0..file_count-1 into the output file, choosing the
     * minimum head value by a linear scan on every step.
     */
    void merge_sort(int file_count)
    {
        if (file_count <= 0)
            return;
        FILE *fout = std::fopen(m_out_file, "w");
        if (fout == NULL)
        {
            std::cerr << "cannot open output file " << m_out_file << std::endl;
            return;
        }
        FILE **farray = open_runs(file_count);
        if (farray == NULL)
        {
            std::fclose(fout);
            return;
        }

        int *data = new int[file_count];       // current head value of each run
        bool *hasNext = new bool[file_count];  // false once a run is exhausted
        for (int i = 0; i < file_count; ++i)
        {
            data[i] = 0;
            hasNext[i] = (std::fscanf(farray[i], "%d", &data[i]) == 1);
        }

        while (true)
        {
            // Pick the smallest head among runs that still have data. The
            // candidate must come from a live run: seeding it from data[0]
            // would reuse a stale value after run 0 is exhausted.
            int j = -1;
            for (int i = 0; i < file_count; ++i)
            {
                if (hasNext[i] && (j < 0 || data[i] < data[j]))
                    j = i;
            }
            if (j < 0)   // every run is exhausted
                break;

            std::fprintf(fout, "%d ", data[j]);
            if (std::fscanf(farray[j], "%d", &data[j]) != 1)
                hasNext[j] = false;
        }

        delete[] hasNext;
        delete[] data;
        close_runs(farray, file_count);
        std::fclose(fout);
    }
};
#endif