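// External Sorting
// Uses a loser tree to generate sorted runs, uses files to simulate the disk, and merges the runs with a k-way merge. The input buffer size, the output buffer size and the merge order k can all be chosen by the user.
// Loser tree reference: "Building a loser tree (tree of losers)".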
#include<iostream>
#include<fstream>
#include<vector>
#include<string>
#include<queue>
#include<stdlib.h>
#include<time.h>
using namespace std;
int DISK_NUM = 0; // Running total of simulated disk accesses; main() increments it around its buffer fills and flushes and prints it at the end.
const int INF = 0x3f3f3f; // Sentinel (4144959) meaning "no data": default for Sequece fields, padding for File buffers, and the value main() compares against to detect exhausted input.
// A value tagged with a sequence number, used as the player type of the
// loser tree that generates sorted runs.
// Ordering is lexicographic on (num, val): a smaller `num` always wins, and
// `val` only breaks ties between equal `num`.
struct Sequece {
    int val; // the data value; INF marks an empty slot
    int num; // sequence (run) number; INF for an empty slot

    // True when *this orders before or equal to `a` under (num, val).
    // Const-qualified so that const objects and temporaries can be compared.
    bool operator<= (const Sequece &a) const {
        if (num != a.num)
            return num <= a.num;
        return val <= a.val;
    }

    // True when *this orders strictly after `a` under (num, val);
    // the exact complement of operator<=.
    bool operator> (const Sequece &a) const {
        if (num != a.num)
            return num > a.num;
        return val > a.val;
    }

    // A default-constructed element is the (INF, INF) sentinel.
    Sequece() : val(INF), num(INF) {}
};
// In-memory stand-in for one file on disk together with its private input
// buffer. All values of the file are loaded into `data` by initData();
// initBuffer() then moves them into `buffer` one buffer-load at a time.
struct File {
    std::string filename;     // name of the file on disk
    std::queue<int> data;     // values of the file that have not been moved into `buffer` yet
    int fileLength = 0;       // number of values held in `data` when initData() last returned
    int bufferIndex = 0;      // position of the next unread slot in `buffer`
    std::vector<int> buffer;  // this file's own buffer; owns its storage, so refills and copies cannot leak

    // Reads every integer from `filename` into `data` and records the count.
    // If the file cannot be opened nothing is read and only the count is updated.
    void initData() {
        std::ifstream in(filename);
        int value;
        while (in >> value)
            data.push(value);
        fileLength = static_cast<int>(data.size());
    }

    // (Re)fills the buffer with up to `sizePerBuffer` values taken from the
    // front of `data`, pads the remaining slots with INF and rewinds
    // `bufferIndex` to 0. A non-positive size yields an empty buffer.
    void initBuffer(int sizePerBuffer) {
        const std::vector<int>::size_type slots =
            sizePerBuffer > 0 ? static_cast<std::vector<int>::size_type>(sizePerBuffer) : 0;
        buffer.assign(slots, INF); // reuses the storage instead of allocating a fresh array per refill
        bufferIndex = 0;
        for (std::vector<int>::size_type i = 0; i < slots && !data.empty(); ++i) {
            buffer[i] = data.front();
            data.pop();
        }
    }

    // Orders files by length. With std::priority_queue<File> this makes
    // top() the file with the largest fileLength.
    friend bool operator< (const File &a, const File &b) {
        return a.fileLength < b.fileLength;
    }
};
// Loser tree (tournament tree) that selects the smallest of n players.
//
// The players live in a caller-owned array indexed 1..n; the tree only stores
// player indices. T must provide operator<= and operator>, and the two must be
// exact complements of each other.
//
// Usage: construct, call initialize(), then repeatedly read theWinner() and
// call replay() on that winner with its replacement value.
template<class T>
class loserTree
{
private:
    std::vector<int> tree;  // tree[0] = overall winner; tree[p], p >= 1 = loser of the match at internal node p
    T *player;              // external nodes (players), 1-based; not owned
    std::vector<int> temp;  // temp[p] = winner of the match at internal node p
    int numOfPlayer;        // number of external nodes
    int lowExt;             // number of external nodes attached to the lowest level
    int offset;             // 2*s - 1, where s is the index of the leftmost node on the lowest internal level
    // Index of the smaller of the two players; x wins ties.
    int winner(int x, int y) const { return player[x] <= player[y] ? x : y; }
    // Index of the larger of the two players; y loses ties.
    int loser(int x, int y) const { return player[x] > player[y] ? x : y; }
    void play(int gamepoint, int leftPlayer, int rightPlayer);
public:
    // Creates an empty tree sized for n players. Every member is given a
    // defined value, so theWinner() returns 0 ("no winner") until
    // initialize() has been called.
    loserTree(int n)
        : tree((n > 0 ? n : 0) + 1, 0),
          player(nullptr),
          temp((n > 0 ? n : 0) + 1, 0),
          numOfPlayer(n > 0 ? n : 0),
          lowExt(0),
          offset(0) {}

    // Debug dump: the players referenced by tree[0..n-1], the raw tree array,
    // then every player's value. Requires T to expose `val` and `num`, and
    // must only be called after initialize().
    void output() const {
        std::cout << "test:" << std::endl;
        for (int i = 0; i < numOfPlayer; i++)
        {
            std::cout << player[tree[i]].val << ":" << player[tree[i]].num << " ";
        }
        std::cout << std::endl;
        for (int i = 0; i < numOfPlayer; i++)
            std::cout << tree[i] << " ";
        std::cout << std::endl;
        for (int i = 1; i <= numOfPlayer; i++)
            std::cout << player[i].val << " ";
        std::cout << std::endl;
    }
    void initialize(T *thePlayers, int n);
    void replay(int thePlayer, T value);
    // Index (1-based) of the current overall winner, or 0 if the tree is empty.
    int theWinner() const { return tree[0]; }
};

// Builds the tree for thePlayers[1..n].
// The internal arrays are resized here, so n may differ from the count given
// to the constructor. n < 1 (or a null array) produces an empty tree; n == 1
// makes the only player the winner.
template<class T>
void loserTree<T>::initialize(T *thePlayers, int n)
{
    player = thePlayers;
    numOfPlayer = (n > 0 && thePlayers != nullptr) ? n : 0;
    tree.assign(numOfPlayer + 1, 0);
    temp.assign(numOfPlayer + 1, 0);
    lowExt = 0;
    offset = 0;
    if (numOfPlayer == 0)
        return; // empty tree: theWinner() reports 0

    // s = index of the leftmost node on the lowest internal level
    // (the largest power of two with s <= n - 1, or 1 when n <= 2).
    int s = 1;
    while (2 * s <= n - 1)
        s *= 2;
    lowExt = 2 * (n - s);
    offset = 2 * s - 1;

    if (n == 1) {
        // No match can be played; the general code below would read temp[0].
        temp[1] = 1;
        tree[1] = 1;
        tree[0] = 1;
        return;
    }

    int i;
    // Matches between the external nodes on the lowest level.
    for (i = 2; i <= lowExt; i += 2)
        play((i + offset) / 2, i - 1, i);
    // Remaining external nodes.
    if (n % 2 == 1) {
        // Odd n: one match pairs an internal node's winner with an external node.
        play(n / 2, temp[n - 1], lowExt + 1);
        i = lowExt + 3;
    }
    else
        i = lowExt + 2;
    // i is now the leftmost remaining external node.
    for (; i <= n; i += 2)
        play((i - lowExt + n - 1) / 2, i - 1, i);
    // The winner of the root match is the overall winner.
    tree[0] = temp[1];
}

// Plays the match at internal node p between players `left` and `right`, then
// keeps playing parent matches for as long as p is a right child (its left
// sibling has already been decided at that point).
template<class T>
void loserTree<T>::play(int p, int left, int right)
{
    tree[p] = loser(left, right);
    temp[p] = winner(left, right);
    while (p % 2 == 1 && p > 1)
    {
        tree[p / 2] = loser(temp[p - 1], temp[p]);
        temp[p / 2] = winner(temp[p - 1], temp[p]);
        p /= 2; // move up
    }
}

// Replaces the value of player `thePlayer` with `newValue` and replays the
// matches on the path from that player to the root.
// Only the stored loser of each node is consulted, so the result is a correct
// tournament only when `thePlayer` is the current overall winner.
// An index outside 1..n, or a call before initialize(), is ignored.
template<class T>
void loserTree<T>::replay(int thePlayer, T newValue)
{
    if (player == nullptr || thePlayer < 1 || thePlayer > numOfPlayer)
        return;
    player[thePlayer] = newValue;
    // First match node on the path to the root.
    int matchPoint;
    if (thePlayer <= lowExt) // player hangs off the lowest level
        matchPoint = (thePlayer + offset) / 2;
    else
        matchPoint = (thePlayer - lowExt + numOfPlayer - 1) / 2;
    // Replay upwards: the survivor meets the loser stored at each node.
    for (; matchPoint >= 1; matchPoint /= 2)
    {
        const int stored = tree[matchPoint];
        const int newLoser = loser(stored, thePlayer);
        temp[matchPoint] = winner(stored, thePlayer);
        tree[matchPoint] = newLoser;
        thePlayer = temp[matchPoint];
    }
    tree[0] = temp[1];
}
void randomGenerateFile() {
ofstream FILE("testSpace.txt", ios::out);
srand((unsigned)time(nullptr));
//1000000
for (int i = 0; i < 20; i++)
FILE << rand() << " ";
FILE.close();
}
int main(void) {
randomGenerateFile();
//50000 小数据20->35
int Space = 35; //内存总大小
int kx = 1, inputBuffer = 20000, outputBuffer = 15000;
int k = 60; //设置归并路数
cout << "输入缓冲区大小: ";
cin >> inputBuffer;
cout << "输出缓冲区大小: ";
cin >> outputBuffer;
cout << "归并路数: ";
cin >> k;
//输入输出的缓冲区大小
int freeSpace = Space - (kx * inputBuffer + outputBuffer); //剩余的缓冲区总大小(用于排序)
int inputBufferAll = kx * inputBuffer;
//开辟各部分空间
int *input = new int[inputBufferAll];
int *output = new int[outputBuffer];
Sequece *free = new Sequece[freeSpace + 1]; //1表示下标从1开始
//从文件中读入数据,生成顺串文件
ifstream allPlayerFile("testSpace.txt"); //绑定所有选手的文件
int sequeceNum = 1;
//先初始化
//=======================//
DISK_NUM++;
//=======================//
for (int i = 0; i<inputBufferAll; i++) {
//向输入缓冲区填充数据
bool flag = true;
if (allPlayerFile >> input[i])
flag = false;
if(flag) input[i] = INF;
}
int inputIndex = 0;
for (int i = 1; i <= freeSpace; i++) {
free[i].val = input[inputIndex++]; //初始添加至外部节点
if (free[i].val == INF) free[i].num = INF;
else free[i].num = 1;
}
loserTree<Sequece> generateSequece(freeSpace);
generateSequece.initialize(free, freeSpace);
//处理输出缓冲区
int outputIndex = 0;
while (true) {
int wi = generateSequece.theWinner(); //取出一个赢家
//输出缓冲区满,则刷新输出缓冲区
if (outputIndex >= outputBuffer) {
//=======================//
DISK_NUM++;
//=======================//
string ofn = "inputSequece_";
string bfn = ".txt";
ofstream outputFile;
outputFile.open(ofn + to_string(sequeceNum) + bfn, ios::out | ios::app);
for (int i = 0; i < outputIndex; i++)
outputFile << output[i] << " ";
outputIndex = 0;
if (free[wi].num != sequeceNum) sequeceNum++; //刷新缓冲区时序号不同,总串数+1
outputFile.close();
}
//刷新输入缓冲区
if (free[wi].val == INF) {
//清空输出缓冲区
if (outputIndex != 0) {
//=======================//
DISK_NUM++;
//=======================//
string ofn = "inputSequece_";
string bfn = ".txt";
ofstream outputFile;
outputFile.open(ofn + to_string(sequeceNum) + bfn, ios::out | ios::app);
for (int i = 0; i < outputIndex; i++)
outputFile << output[i] << " ";
outputIndex = 0;
sequeceNum++;
outputFile.close();
}
input[0] = INF;
if (allPlayerFile >> input[0]) ;
if (input[0] == INF) {
allPlayerFile.close();
break; //文件输入结束
}
//向输入缓冲区填充数据
//=======================//
DISK_NUM++;
//=======================//
for (int i = 1; i < inputBufferAll; i++) {
bool flag = true;
if (allPlayerFile >> input[i])
flag = false;
if (flag) input[i] = INF;
}
inputIndex = 0;
for (int i = 1; i <= freeSpace; i++) {
free[i].val = input[inputIndex++]; //初始添加至外部节点
if (free[i].val == INF) free[i].num = INF;
else free[i].num = sequeceNum;
}
generateSequece.initialize(free, freeSpace);
continue; //千万注意不能少!!!
}
//判断是否输出顺串文件
if (free[wi].num != sequeceNum) {
//=======================//
DISK_NUM++;
//=======================//
string ofn = "inputSequece_";
string bfn = ".txt";
ofstream outputFile;
outputFile.open(ofn + to_string(sequeceNum) + bfn, ios::out | ios::app);
for (int i = 0; i < outputIndex; i++)
outputFile << output[i] << " ";
outputIndex = 0;
sequeceNum++;
outputFile.close();
}
output[outputIndex++] = free[wi].val;
//向缓冲区取下一个替换的玩家
Sequece temp_insert;
if (inputIndex >= inputBufferAll) {
temp_insert.val = INF;
temp_insert.num = INF;
}
else {
temp_insert.val = input[inputIndex];
if (input[inputIndex] >= free[wi].val) {
if(input[inputIndex]==INF) temp_insert.num = INF;
else temp_insert.num = free[wi].num;
}
else
temp_insert.num = free[wi].num + 1;
inputIndex++;
}
generateSequece.replay(wi, temp_insert);
}
//开始k路归并
int inputBufferPerSize = inputBufferAll / 5;
int outputBufferSize = Space - k - inputBufferPerSize * 5;
int *merge_outputBuffer = new int[outputBufferSize];
priority_queue<File> allFilePQ; //记录当前可作为归并排序输入的文件名
string ifn = "inputSequece_";
string bfn = ".txt";
//初始化当前顺串们的所有文件到优先队列中
for (int i = 1; i <= sequeceNum - 1; i++) {
string fn = ifn + to_string(i) + bfn;
File temp_file;
temp_file.filename = fn;
temp_file.initData();
//=======================//
DISK_NUM++;
//=======================//
temp_file.initBuffer(inputBufferPerSize);
allFilePQ.push(temp_file);
}
int outputSeqNum = 1; //当前输出顺串编号
while (allFilePQ.size() != 1) {
File *processFiles = new File[k + 1]; //正在处理的文件
int fileNumK = 0; //最大值为k(最大为k路归并)
for (int i = 1; i <= k; i++) {
if (allFilePQ.size() >= 1) {
processFiles[i] = allFilePQ.top();
allFilePQ.pop();
fileNumK = i;
}
else break; //优先队列中的文件都已经遍历了
}
loserTree<int> merge(fileNumK); //k路归并的输者树
int *mergeFree = new int[fileNumK + 1]; //归并时的外部节点,下标从1开始
//初始化外部节点(全为新文件,即不需要刷新缓冲区)
for (int i = 1; i <= fileNumK; i++) {
mergeFree[i] = processFiles[i].buffer[0];
processFiles[i].bufferIndex++; //索引前进
}
merge.initialize(mergeFree, fileNumK);
//正式开始生成输出的顺串
int outputBufferIndex = 0; //输出缓冲区的索引
string ofn = "outputSequece_" + to_string(outputSeqNum) + bfn; //预备输出的文件名
ofstream outputfff(ofn);
while (true) {
int wi = merge.theWinner();
//输出缓冲区满,刷新
if (outputBufferIndex >= outputBufferSize) {
//=======================//
DISK_NUM++;
//=======================//
for (int i = 0; i < outputBufferSize; i++)
outputfff << merge_outputBuffer[i] << " ";
outputBufferIndex = 0;
}
//归并完成
if (mergeFree[wi] == INF) {
if (outputBufferIndex != 0) {
//=======================//
DISK_NUM++;
//=======================//
for (int i = 0; i < outputBufferIndex; i++)
outputfff << merge_outputBuffer[i] << " ";
outputBufferIndex = 0;
}
break;
}
//赢家写入输出缓冲区
merge_outputBuffer[outputBufferIndex++] = mergeFree[wi];
//从缓冲区取下一个元素
int i_t = processFiles[wi].bufferIndex;
if (i_t >= inputBufferPerSize) {
//单个缓冲区刷新
//=======================//
DISK_NUM++;
//=======================//
processFiles[wi].initBuffer(inputBufferPerSize);
if (processFiles[wi].buffer[0] != INF) {
mergeFree[wi] = processFiles[wi].buffer[0];
processFiles[wi].bufferIndex++;
}
else
mergeFree[wi] = INF;
}
else {
mergeFree[wi] = processFiles[wi].buffer[i_t];
processFiles[wi].bufferIndex++;
}
merge.replay(wi, mergeFree[wi]);
}
outputfff.close();
//一次归并完成,将新文件压入优先队列
outputSeqNum++;
ifstream inputfff(ofn);
File temp_file;
temp_file.filename = ofn;
temp_file.initData();
temp_file.initBuffer(inputBufferPerSize);
allFilePQ.push(temp_file);
}
string last = allFilePQ.top().filename;
cout << "[EXTERNAL SORT SUCCESSFULLY] in file : " << last << endl;
cout << "[DISK VISIT] totally: " << DISK_NUM << endl;
system("pause");
return 0;
}