当你只有20M的内存,却想对200M的数据进行排序的时候,就需要用到外排序。
参考自:http://www.cnblogs.com/benjamin-t/p/3325401.html
因为看不懂败者树,所以我用堆排序(最小堆)来做
1.不要使用 ifstream::eof() 来判断输入流结尾,这是个坑:eof() 要等到一次读取失败之后才会变为 true,应该直接用 in >> x 的结果作为循环条件。
2.建堆操作(make_heap)是O(n),而调整堆顶只需O(logn)。能只做调整的时候务必只做调整,对效率的影响非常大。
#include <iostream>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <functional>
#include <queue>
#include <utility>
#include <vector>
using namespace std;
#define MEMORY 20000 // capacity of the in-memory working buffer, counted in ints
#define MAXNUM 100000 // total count of random numbers in the generated data set
// Working buffer used while generating sorted runs; holds at most MEMORY values.
int buffer[MEMORY];
// Backing storage for the generated data set (MAXNUM values).
int a[MAXNUM];
// Sifts the element at index `hole` down a binary min-heap stored in `a`.
//   a    - array holding the heap; the children of index i are 2*i+1 and 2*i+2
//   hole - index of the element to push down
//   size - number of leading elements of `a` that belong to the heap
// The element is swapped with its smaller child until neither child inside
// [0, size) is smaller than it. Does nothing when hole >= size.
void adjust_heap(int a[],int hole,int size)
{
    for(;;)
    {
        if(hole >= size)
            return;

        const int left = 2 * hole + 1;
        const int right = left + 1;

        // Pick the smallest of the node and whichever children exist.
        int smallest = hole;
        if(left < size && a[left] < a[smallest])
            smallest = left;
        if(right < size && a[right] < a[smallest])
            smallest = right;

        // Node is already no larger than its children: heap order holds.
        if(smallest == hole)
            return;

        const int tmp = a[hole];
        a[hole] = a[smallest];
        a[smallest] = tmp;
        hole = smallest;
    }
}
// Produces the two fixture files used by the rest of the program.
//   "data"      - the values 1..MAXNUM in shuffled order, one per line
//   "sort_data" - the same values in ascending order, one per line
// The global array `a` is used as scratch space and is left sorted on return.
// The shuffle is seeded from the current time, so "data" differs between runs.
// If either file cannot be opened, a message is written to std::cerr and the
// function returns early.
void generate_num()
{
    std::srand(unsigned(std::time(NULL)));
    for(int i = 0;i < MAXNUM;++i)
    {
        a[i] = i + 1;
    }
    // Swap every slot with a randomly chosen slot; the result is still a
    // permutation of 1..MAXNUM.
    for(int i = 0;i < MAXNUM;++i)
    {
        const int j = std::rand() % MAXNUM;
        std::swap(a[i],a[j]);
    }

    std::ofstream out("data");
    if(!out)
    {
        std::cerr << "generate_num: cannot open data for writing" << '\n';
        return;
    }
    // '\n' rather than endl: endl would flush the stream after every line.
    for(int i = 0;i < MAXNUM;++i)
    {
        out << a[i] << '\n';
    }
    out.close();

    out.open("sort_data");
    if(!out)
    {
        std::cerr << "generate_num: cannot open sort_data for writing" << '\n';
        return;
    }
    std::sort(a,a + MAXNUM);
    for(int i = 0;i < MAXNUM;++i)
    {
        out << a[i] << '\n';
    }
    out.close();
}
// Writes the min-heap heap[0..size) to `out` in ascending order, one value per
// line. The heap contents are consumed in the process.
static void drain_heap(int heap[],int size,std::ofstream& out)
{
    while(size > 0)
    {
        out << heap[0] << '\n';
        // Move the last element to the root, shrink, and restore heap order.
        heap[0] = heap[--size];
        adjust_heap(heap,0,size);
    }
}

// Splits the integers in `data_file` into sorted run files using replacement
// selection on the global `buffer` (at most MEMORY values held at once).
//   data_file - path of a text file of whitespace-separated integers
//   returns   - number of run files written; they are named "0", "1", ... in
//               the current directory. Returns 0 when no value could be read.
// Each run file holds its values in ascending order, one per line.
int generate_runs(const char* data_file)
{
    std::ifstream in(data_file);

    // Initial fill. Count only successful reads so that an input shorter than
    // MEMORY is still sorted instead of being dropped or padded with a bogus value.
    int filled = 0;
    while(filled < MEMORY && in >> buffer[filled])
        ++filled;
    if(filled == 0)
        return 0;

    std::ofstream out;
    char filename[20];
    int file_count = 0;
    // buffer[0..size) is the heap of the current run; buffer[size..filled)
    // parks values that must wait for the next run.
    int size = filled;
    for(;;)
    {
        std::snprintf(filename,sizeof(filename),"%d",file_count++);
        out.open(filename);
        std::make_heap(buffer,buffer + size,std::greater<int>());

        int t;
        while(size > 0 && in >> t)
        {
            out << buffer[0] << '\n';
            if(t < buffer[0])
            {
                // Smaller than the value just written: it cannot join this
                // run, so park it behind the shrinking heap.
                buffer[0] = buffer[--size];
                buffer[size] = t;
            }
            else
            {
                // Still fits in the current run: it replaces the root.
                buffer[0] = t;
            }
            // One sift-down is enough; rebuilding the heap here is much slower.
            adjust_heap(buffer,0,size);
        }

        if(size == 0)
        {
            // Current run is complete; every buffered value belongs to the next one.
            out.close();
            size = filled;
            continue;
        }

        // Input exhausted: flush what is left of the current run ...
        const int parked = filled - size;
        drain_heap(buffer,size,out);
        out.close();

        // ... and, if any values were parked, emit them as one final run.
        if(parked > 0)
        {
            std::snprintf(filename,sizeof(filename),"%d",file_count++);
            out.open(filename);
            std::make_heap(buffer + size,buffer + filled,std::greater<int>());
            drain_heap(buffer + size,parked,out);
            out.close();
        }
        return file_count;
    }
}
// K-way merges the sorted run files "0", "1", ..., "<runs-1>" (read from the
// current directory) into `out_file`, one value per line in ascending order.
//   runs     - number of run files to merge; when <= 0 an empty output file
//              is still created
//   out_file - path of the merged output file
// Run files that are missing or empty simply contribute no values.
void merger_sort(int runs,const char* out_file)
{
    std::ofstream out(out_file);
    if(runs <= 0)
        return;

    std::vector<std::ifstream> in(runs);

    // Min-heap of (value, run index). Carrying the run index with each value
    // identifies the stream to advance without comparing values, which would
    // pick the wrong stream when the data contains duplicates.
    typedef std::pair<int,int> Entry;
    std::priority_queue<Entry,std::vector<Entry>,std::greater<Entry> > heads;

    char filename[20];
    for(int i = 0;i < runs;++i)
    {
        std::snprintf(filename,sizeof(filename),"%d",i);
        in[i].open(filename);
        int first;
        // Only seed the heap with values that were actually read.
        if(in[i] >> first)
            heads.push(Entry(first,i));
    }

    while(!heads.empty())
    {
        const Entry smallest = heads.top();
        heads.pop();
        out << smallest.first << '\n';

        // Refill from the run that supplied the value just written; an
        // exhausted run drops out of the merge.
        int next;
        if(in[smallest.second] >> next)
            heads.push(Entry(next,smallest.second));
    }
    // Streams and the heap are released by their destructors.
}
// Compares the merged output file "merge" with the reference file "sort_data".
// Returns true only when both files can be opened and contain the same sequence
// of integers with the same length. A missing file, a differing value, or one
// file ending before the other all yield false.
bool check()
{
    std::ifstream merged("merge");
    std::ifstream expected("sort_data");
    if(!merged || !expected)
        return false;

    int got = 0;
    int want = 0;
    for(;;)
    {
        const bool have_got = static_cast<bool>(merged >> got);
        const bool have_want = static_cast<bool>(expected >> want);
        // When a read fails, the files match only if both ran out together.
        if(!have_got || !have_want)
            return have_got == have_want;
        if(got != want)
            return false;
    }
}
// Driver: generates the test data, sorts it externally (run generation followed
// by a k-way merge into "merge"), then reports whether the result matches the
// reference file. Elapsed wall-clock seconds are printed for both phases.
int main()
{
    // Whole seconds elapsed since `start`.
    auto seconds_since = [](time_t start) { return time(NULL) - start; };

    time_t phase_start = time(NULL);
    cout << "开始产生随机数" << endl;
    generate_num();
    cout << "随机数产生完毕,花费时间 " << seconds_since(phase_start) << endl;

    cout << "归并排序开始" << endl;
    phase_start = time(NULL);
    const int run_count = generate_runs("data");
    merger_sort(run_count,"merge");
    cout << "归并排序完毕,花费时间 " << seconds_since(phase_start) << endl;

    const bool sorted_ok = check();
    cout << "检查结果为" << (sorted_ok ? "正确" : "错误") << endl;
}