package wjw.PreTrans;
import java.io.*;
import java.util.*;
import org.apache.commons.io.*;
public class MergeSort {
private static
FastQSortAlgorithm QQ = new
FastQSortAlgorithm();
private MergeSort() {
}
private static int getFileLineSize(String
fileC) throws IOException {
Reader fC = null;
try {
fC =
new FileReader(fileC);
LineIterator it = IOUtils.lineIterator(fC);
int
lineSize = 0;
while(it.hasNext()) {
it.nextLine();
lineSize++;
}
return lineSize;
} finally {
IOUtils.closeQuietly(fC);
}
}
private static String nextLine(LineIterator
iterator) {
if(iterator.hasNext())
{
return iterator.nextLine();
} else {
return null;
}
}
private static void readLines(LineIterator
iterator, List bufList, int
lines) {
for(int i = 0; i
< lines; i++) {
if(!iterator.hasNext()) {
break;
}
bufList.add(iterator.nextLine());
}
}
private static void split(String fileA,
String fileB, String fileC, int k,
int blockSize) throws
FileNotFoundException, IOException {
boolean useA =
true;
int i = 0;
List bufList = new
ArrayList(blockSize);
//大小为blockSize的缓冲区
Writer fA = null;
Writer fB = null;
Reader fC = null;
try {
fA =
new BufferedWriter(new FileWriter(fileA));
fB =
new BufferedWriter(new FileWriter(fileB));
fC =
new FileReader(fileC);
LineIterator itC = IOUtils.lineIterator(fC);
while(itC.hasNext()) {
//->读入数据块
bufList.clear();
readLines(itC, bufList, blockSize);
//
if(useA) {
IOUtils.writeLines(bufList,
"\n", fA);
} else {
IOUtils.writeLines(bufList,
"\n", fB);
}
if(++i == k) {
i = 0;
useA = !useA;
}
}
} finally {
bufList.clear();
IOUtils.closeQuietly(fA);
IOUtils.closeQuietly(fB);
IOUtils.closeQuietly(fC);
}
}
private static int copyTail(LineIterator
fileX, Writer fileY, int currRunPos,
int n)
throws IOException {
//从当前位置到归并段结束,拷贝每个数据
while(currRunPos
<= n) {
//若没有更多的数据项,则文件结束且归并段结束
if(!fileX.hasNext()) {
break;
}
//修改当前归并段位置并将数据项写入fY
currRunPos++;
IOUtils.write(fileX.nextLine() + "\n", fileY);
}
return currRunPos;
}
private static void merge(String fileA,
String fileB, String fileC, int n) throws
IOException {
//currA和currB表示在当前归并段中的位置
int currA = 1;
int currB = 1;
//分别从fA和fB中读出的数据项
String dataA,
dataB;
Reader fA = null;
Reader fB = null;
Writer fC = null;
try {
fA =
new FileReader(fileA);
fB =
new FileReader(fileB);
fC =
new BufferedWriter(new FileWriter(fileC));
LineIterator itA = IOUtils.lineIterator(fA);
LineIterator itB = IOUtils.lineIterator(fB);
dataA
= nextLine(itA);
dataB
= nextLine(itB);
for(;
; ) {
//若(dataA<=dataB),则将dataA拷贝到fC并修改当前归并段的位置
if(dataA.compareTo(dataB) <= 0)
{
IOUtils.write(dataA + "\n",
fC);
//从fA中取下一归并段,若不存在,则已到文件尾,应将fB的后续归并段拷入到fC;
//若当前位置>n,则已将所有fA的归并段拷完,应拷贝fB的后续归并段
dataA = nextLine(itA);
currA++;
if(dataA == null || currA
> n) {
IOUtils.write(dataB + "\n", fC);
currB++;
currB =
copyTail(itB, fC, currB, n);
//fA的大小>=fB的大小;若在fA的文件尾,则结束
if(dataA
== null) {
break;
} else {
//否则,应在新的归并段中,重置当前位置
currA = 1;
}
//取fB中的下一项.若不存在,则只有fA中剩余的部分要拷贝到fC,
//退出循环前将当前归并段写入fC
dataB =
nextLine(itB);
if(dataB
== null) {
IOUtils.write(dataA + "\n", fC);
currA = 2;
break;
} else {
//否则,重置fB中当前归并段
currB = 1;
}
}
} else { //否则(dataA>dataB)
IOUtils.write(dataB + "\n",
fC);
//从fB中取下一归并段,若不存在,则已到文件尾,应将fA的后续归并段拷入到fC;
//若当前位置>n,则已将所有fB的归并段拷完,应拷贝fA的后续归并段
dataB = nextLine(itB);
currB++;
if(dataB == null || currB
> n) {
IOUtils.write(dataA + "\n", fC);
currA++;
currA =
copyTail(itA, fC, currA, n);
//若fB中没有更多项,则置fA的当前位置,准备拷贝fA中的最后归并段到fC中
if(dataB
== null) {
currA = 1;
break;
} else {
//否则,置fB的当前位置,并从fA中读入数据
currB = 1;
if((dataA = nextLine(itA)) == null) {
break;
} else {
currA = 1;
}
}
}
}
}
//
//将fA中可能存在的剩余的归并段写入fC中(注:fA的长度时>=fB的)
if(dataA != null && dataB == null)
{
currA = copyTail(itA, fC, currA, n);
}
} finally {
IOUtils.closeQuietly(fA);
IOUtils.closeQuietly(fB);
IOUtils.closeQuietly(fC);
}
}
public static void sort(String
fileSource,String fileOut, int blockSize,boolean removeDuple)
throws IOException {
String fileA =
File.createTempFile("wjw", null).getAbsolutePath();
String fileB =
File.createTempFile("wjw", null).getAbsolutePath();
int mergeIndex =
1;
int lineSize =
getFileLineSize(fileSource);
int k = 1;
int n = k *
blockSize;
boolean useA =
true;
List list = new
ArrayList(blockSize);
Writer fA = null;
Writer fB = null;
Reader fC = null;
try {
fA =
new BufferedWriter(new FileWriter(fileA));
fB =
new BufferedWriter(new FileWriter(fileB));
fC =
new FileReader(fileSource);
LineIterator itC = IOUtils.lineIterator(fC);
if(lineSize <= blockSize) {
//对于小文件,从fC读入数据,直接排序后写回文件中
readLines(itC, list, lineSize);
Collections.sort(list);
IOUtils.closeQuietly(fC);
FileUtils.writeLines(new File(fileOut), "GBK",
list, "\n");
list.clear();
if(removeDuple) {
removeDuple(fileOut);
}
return;
}
//->第一次分割,合并
System.out.println("第:"+mergeIndex+"分割,合并");
while(itC.hasNext()) {
list.clear();
readLines(itC, list, blockSize);
Collections.sort(list);
if(useA) {
IOUtils.writeLines(list,
"\n", fA);
} else {
IOUtils.writeLines(list,
"\n", fB);
}
useA = !useA;
}
list.clear();
IOUtils.closeQuietly(fA);
IOUtils.closeQuietly(fB);
IOUtils.closeQuietly(fC);
merge(fileA, fileB, fileOut, blockSize);
mergeIndex++;
//
//->将当前归并段大小加倍,循环进行
k = k
* 2;
n = k
* blockSize;
while(n < lineSize) {
//当n>=文件大小时,fC仅剩一个已排好序的归并段
System.out.println("第:"+mergeIndex+"分割,合并");
split(fileA, fileB, fileOut, k,
blockSize);
merge(fileA, fileB, fileOut, n);
mergeIndex++;
k = k * 2;
n = k * blockSize;
}
//->将当前归并段大小加倍,循环进行
} finally {
IOUtils.closeQuietly(fA);
IOUtils.closeQuietly(fB);
IOUtils.closeQuietly(fC);
(new
File(fileA)).delete();
(new
File(fileB)).delete();
}
if(removeDuple) {
removeDuple(fileOut);
}
}
private static void removeDuple(String
fileC) throws IOException {
System.out.println("去重");
Reader fC = null;
Writer fTemp =
null;
File tempFile =
File.createTempFile("wjw", null);
try {
fC =
new FileReader(fileC);
fTemp
= new BufferedWriter(new FileWriter(tempFile));
String tmpA = "";
String tmpB = "";
LineIterator itC = IOUtils.lineIterator(fC);
while(itC.hasNext()) {
tmpB = itC.nextLine();
if(tmpB.compareTo(tmpA) != 0) {
IOUtils.write(tmpB + "\n",
fTemp);
tmpA = tmpB;
}
}
} finally {
IOUtils.closeQuietly(fTemp);
IOUtils.closeQuietly(fC);
}
File cFile = new
File(fileC);
if(cFile.delete())
{
if(tempFile.renameTo(cFile)) {
tempFile.delete();
}
}
}
public static String formatSecond(long
seconds) {
long h = seconds
/(60*60);
StringBuffer sb = new
StringBuffer();
sb.append(h+"小时");
seconds =
seconds%(60*60);
long c = seconds
/60;
sb.append(c+"分");
sb.append(seconds
`+"秒");
return
sb.toString();
}
public static void main(String args[])
{
try {
String fileName = "d:/ESort.txt";
int
blockSize = 100000;
long
c1 = System.currentTimeMillis();
MergeSort.sort(fileName,fileName + "_SORT", blockSize,true);
long
c2 = (System.currentTimeMillis() - c1) / 1000;
System.out.println("耗时:"+formatSecond(c2));
} catch(IOException ex)
{
ex.printStackTrace();
}
}
}