1.生成原始数据文件
package www;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.Random;
/**
 * Generates the input file for the external merge sort demo.
 *
 * <p>Writes 50,000,000 random integers in the range [0, 100000000) to
 * {@code t1.txt} in the current directory, one value per line, using the
 * platform line separator.
 */
public class Test {
    /** Name of the generated file. */
    private static final String OUTPUT_FILE = "t1.txt";
    /** Total number of lines to write (5 batches of 10,000,000 in the original layout). */
    private static final int VALUE_COUNT = 5 * 10_000_000;
    /** Exclusive upper bound of the generated values. */
    private static final int VALUE_BOUND = 100_000_000;

    /**
     * Creates (or overwrites) the data file.
     *
     * @param args ignored
     * @throws Exception if the file cannot be created or written
     */
    public static void main(String[] args) throws Exception {
        Random r = new Random();
        // try-with-resources flushes and closes the writer even when a write fails.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(OUTPUT_FILE))) {
            // Values are streamed straight to the writer; buffering them in a
            // 40 MB array first only cost memory and did not change the output.
            for (int i = 0; i < VALUE_COUNT; i++) {
                bw.write(String.valueOf(r.nextInt(VALUE_BOUND)));
                bw.newLine();
            }
        }
    }
}
2.归并文件算法实现
package www;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
/**
 * External merge sort for a text file that holds one integer per line.
 *
 * <p>The input is first split into sorted run files named {@code 0_<n>.txt}
 * inside the working directory. Runs are then merged pairwise, pass after
 * pass; the files produced by pass {@code k} are named {@code k_<n>.txt}.
 * Sorting is finished when a pass finds exactly one file. Files that were
 * used as merge inputs are left in the working directory.
 */
public class Test2 {
    /** Capacity in bytes of every file read/write buffer. */
    static int BLOCK_SIZE = 33 * 1024 * 1024;
    /** Working directory that receives the run files and the merge results. */
    static String DIR_PATH = "temp";
    /** Path of the file to sort. */
    static String root = "t1.txt";
    /** Number of integers sorted in memory for each run file. */
    static int SPLIT_CHUNK_SIZE = 6500000;

    /**
     * Empties the working directory and sorts {@link #root} into it.
     *
     * @param args ignored
     * @throws IllegalStateException if the working directory cannot be listed
     */
    public static void main(String[] args) {
        System.out.println("====清空工作文件夹!====");
        File dir = new File(DIR_PATH);
        dir.mkdir();
        File[] listFiles = dir.listFiles();
        if (listFiles == null) {
            // listFiles() returns null when the path is not a readable directory.
            throw new IllegalStateException("Working directory is not usable: " + dir.getAbsolutePath());
        }
        for (File file : listFiles) {
            if (!file.delete()) {
                System.err.println("Cannot delete stale file: " + file.getAbsolutePath());
            }
        }
        merge_sort(root, DIR_PATH);
    }

    /**
     * Sorts the integers of a text file using the given working directory.
     *
     * @param root    path of the file to sort
     * @param workDir existing directory for the intermediate and final files
     */
    static void merge_sort(String root, String workDir) {
        // Phase 1: split the input into sorted runs.
        splitBigFile(root, workDir);
        // Phase 2: merge the runs until a single file is left.
        mergeFile(workDir);
    }

    /**
     * Repeatedly merges the files of the current pass in pairs until a pass
     * consists of a single file, which is the sorted result.
     *
     * @param workDir directory that contains the run files
     */
    private static void mergeFile(String workDir) {
        System.out.println("====开始归并数据====");
        printFreeMemory();
        File wdir = new File(workDir);
        int mergeTime = 0;
        while (true) {
            String fileHeader = mergeTime + "_";
            // Match on the prefix only, so "0_" never picks up a name such as "10_1.txt".
            FileFilter fileFilter = pathname -> pathname.getName().startsWith(fileHeader);
            File[] listFiles = wdir.listFiles(fileFilter);
            if (listFiles == null || listFiles.length == 0) {
                // Empty input or unusable directory: without this exit the loop never ends.
                System.err.println("==== no files to merge in " + wdir.getAbsolutePath() + " ====");
                break;
            }
            if (listFiles.length == 1) {
                System.err.println("====结束归并,最后合成文件为:" + listFiles[0].getName() + " ====");
                printFile(listFiles[0].getAbsolutePath(), false);
                break;
            }
            mergeTime = mergeTime + 1;
            int mergeNewTime = 1;
            for (int i = 0; i + 1 < listFiles.length; i += 2) {
                merge(listFiles[i].getAbsolutePath(), listFiles[i + 1].getAbsolutePath(),
                        workDir + File.separator + mergeTime + "_" + mergeNewTime + ".txt");
                ++mergeNewTime;
            }
            if (listFiles.length % 2 != 0) {
                // An odd file has no partner: promote it unchanged to the next pass.
                File last = listFiles[listFiles.length - 1];
                File newFile = new File(workDir + File.separator + mergeTime + "_" + mergeNewTime + ".txt");
                System.err.println("==== 单数文件块,提升为下步骤归并块 ===");
                System.out.println(last.getName() + " --> " + newFile.getName());
                if (!last.renameTo(newFile)) {
                    // A failed rename would silently drop this run from the result.
                    throw new IllegalStateException("Cannot rename " + last + " to " + newFile);
                }
                printFreeMemory();
            }
        }
        System.out.println("====归并数据结束====");
    }

    /**
     * Merges two sorted integer files into one sorted file.
     *
     * @param in_path1 first sorted input file
     * @param in_path2 second sorted input file
     * @param out_path file to create (overwritten if it exists)
     * @throws UncheckedIOException if a file cannot be opened, read or written
     */
    static void merge(String in_path1, String in_path2, String out_path) {
        System.out.println("====开始归并数据块文件====");
        System.out.println("====in1 :" + in_path1 + " in2:" + in_path2 + " out:" + out_path);
        printFreeMemory();
        try (InputBuffer ib1 = new InputBuffer(BLOCK_SIZE, in_path1);
                InputBuffer ib2 = new InputBuffer(BLOCK_SIZE, in_path2);
                OutputBuffer ob = new OutputBuffer(BLOCK_SIZE, out_path)) {
            Integer r1 = ib1.read();
            Integer r2 = ib2.read();
            // Standard two-way merge; on ties the value from the first input goes first.
            while (r1 != null && r2 != null) {
                if (r1.intValue() <= r2.intValue()) {
                    ob.write(r1);
                    r1 = ib1.read();
                } else {
                    ob.write(r2);
                    r2 = ib2.read();
                }
            }
            // At most one of the inputs still has values; copy the rest through.
            for (; r1 != null; r1 = ib1.read()) {
                ob.write(r1);
            }
            for (; r2 != null; r2 = ib2.read()) {
                ob.write(r2);
            }
        }
        System.out.println("====归并数据块结束====");
        printFreeMemory();
    }

    /**
     * Block-buffered reader that returns the integers of a text file one line
     * at a time. Lines may end with CR, LF or CR LF; empty lines are skipped.
     */
    static class InputBuffer implements AutoCloseable {
        private static final byte CR = 0x0d;
        private static final byte LF = 0x0a;
        /** Longest line, in bytes, that {@link #read()} accepts. */
        private static final int MAX_LINE_BYTES = 48;

        // Background on NIO buffers: http://ifeve.com/buffers/
        ByteBuffer array;
        String file_path;
        FileChannel inChannel;
        RandomAccessFile randomAccessFile;
        CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder();
        /** Result of the most recent channel read; -1 once end of file was reached. */
        int nextFlag = 0;
        /** Scratch space for the bytes of the line currently being read. */
        private final ByteBuffer lineBuffer = ByteBuffer.allocate(MAX_LINE_BYTES);

        /**
         * Opens a file for reading.
         *
         * @param capacity  size in bytes of the read buffer, must be positive
         * @param file_path file to read
         * @throws IllegalArgumentException if {@code capacity} is not positive
         * @throws UncheckedIOException     if the file cannot be opened
         */
        public InputBuffer(int capacity, String file_path) {
            if (capacity <= 0) {
                throw new IllegalArgumentException("capacity must be positive: " + capacity);
            }
            array = ByteBuffer.allocate(capacity);
            array.flip(); // start empty so the first read() loads a block
            this.file_path = file_path;
            try {
                // Read-only: mode "rw" would create a missing input as an empty file.
                randomAccessFile = new RandomAccessFile(new File(file_path), "r");
            } catch (IOException e) {
                throw new UncheckedIOException("Cannot open " + file_path + " for reading", e);
            }
            inChannel = randomAccessFile.getChannel();
        }

        /**
         * Reads the next integer.
         *
         * @return the next value, or {@code null} when the end of the file is reached
         * @throws NumberFormatException if a line is not a valid integer
         * @throws IllegalStateException if a line is longer than 48 bytes
         * @throws UncheckedIOException  if reading fails or the bytes are not valid UTF-8
         */
        public Integer read() {
            lineBuffer.clear();
            while (true) {
                if (!array.hasRemaining()) {
                    nextFlag = readFileDate();
                    if (nextFlag == -1) {
                        break; // end of file; a last line without terminator is still returned
                    }
                    continue;
                }
                byte b = array.get();
                if (b == CR || b == LF) {
                    if (lineBuffer.position() == 0) {
                        // Blank line, or the LF that follows the CR of the previous line.
                        continue;
                    }
                    break;
                }
                if (!lineBuffer.hasRemaining()) {
                    throw new IllegalStateException(
                            "Line longer than " + MAX_LINE_BYTES + " bytes in " + file_path);
                }
                lineBuffer.put(b);
            }
            lineBuffer.flip();
            if (!lineBuffer.hasRemaining()) {
                return null;
            }
            return Integer.parseInt(decoderByte(lineBuffer));
        }

        /** Decodes the bytes of one line as UTF-8 text. */
        private String decoderByte(ByteBuffer temp) {
            try {
                return utf8Decoder.decode(temp).toString();
            } catch (CharacterCodingException e) {
                throw new UncheckedIOException("Malformed bytes in " + file_path, e);
            }
        }

        /**
         * Reports whether the end of the file has not been reached yet.
         *
         * @return {@code false} once a read hit the end of the file
         */
        public boolean hashNext() {
            return nextFlag != -1;
        }

        /**
         * Loads the next block of the file into {@link #array}.
         *
         * @return number of bytes read, or -1 at end of file
         */
        private int readFileDate() {
            array.clear();
            try {
                return inChannel.read(array);
            } catch (IOException e) {
                throw new UncheckedIOException("Cannot read " + file_path, e);
            } finally {
                // Always switch back to read mode so stale bytes are never exposed.
                array.flip();
            }
        }

        /** Closes the file and releases the buffer; calling it again has no effect. */
        @Override
        public void close() {
            if (array == null) {
                return;
            }
            array = null;
            try {
                randomAccessFile.close(); // also closes inChannel
            } catch (IOException e) {
                // Nothing was written through this handle, so a failed close is only reported.
                System.err.println("Cannot close " + file_path + ": " + e);
            }
        }
    }

    /**
     * Block-buffered writer that stores integers as text, one per line,
     * each terminated by CR LF.
     */
    static class OutputBuffer implements AutoCloseable {
        /** Longest record: "-2147483648" followed by CR LF. */
        private static final int MAX_RECORD_BYTES = 13;

        ByteBuffer array;
        String file_path;
        FileChannel outChannel;
        RandomAccessFile randomAccessFile;
        byte[] RN = new byte[] { 0x0d, 0x0a };

        /**
         * Creates the output file, discarding any previous content.
         *
         * @param capacity  size in bytes of the write buffer; values smaller than
         *                  one record are rounded up to one record
         * @param file_path file to write
         * @throws IllegalArgumentException if {@code capacity} is negative
         * @throws UncheckedIOException     if the file cannot be opened
         */
        public OutputBuffer(int capacity, String file_path) {
            if (capacity < 0) {
                throw new IllegalArgumentException("capacity must not be negative: " + capacity);
            }
            array = ByteBuffer.allocate(Math.max(capacity, MAX_RECORD_BYTES));
            this.file_path = file_path;
            RandomAccessFile file = null;
            try {
                file = new RandomAccessFile(new File(file_path), "rw");
                // "rw" keeps existing bytes; truncate so no stale tail survives.
                file.setLength(0);
            } catch (IOException e) {
                if (file != null) {
                    try {
                        file.close();
                    } catch (IOException ignored) {
                        // The open failure below is the error worth reporting.
                    }
                }
                throw new UncheckedIOException("Cannot open " + file_path + " for writing", e);
            }
            randomAccessFile = file;
            outChannel = file.getChannel();
        }

        /**
         * Appends one value followed by CR LF, flushing the buffer to disk first
         * when the record does not fit.
         *
         * @param val value to write
         * @throws UncheckedIOException if flushing fails
         */
        public void write(int val) {
            byte[] digits = String.valueOf(val).getBytes(StandardCharsets.US_ASCII);
            if (array.remaining() < digits.length + RN.length) {
                writeDisk();
            }
            array.put(digits);
            array.put(RN);
        }

        /** Writes the buffered bytes to the file and empties the buffer. */
        private void writeDisk() {
            array.flip();
            try {
                // A single write() call is not guaranteed to drain the buffer.
                while (array.hasRemaining()) {
                    outChannel.write(array);
                }
            } catch (IOException e) {
                throw new UncheckedIOException("Cannot write " + file_path, e);
            }
            array.clear();
        }

        /**
         * Flushes pending data, forces it to disk and closes the file; calling it
         * again has no effect.
         *
         * @throws UncheckedIOException if the data cannot be written
         */
        @Override
        public void close() {
            if (array == null) {
                return;
            }
            // Closing the RandomAccessFile also closes outChannel.
            try (RandomAccessFile file = randomAccessFile) {
                writeDisk();
                outChannel.force(true);
            } catch (IOException e) {
                throw new UncheckedIOException("Cannot flush " + file_path, e);
            } finally {
                array = null;
            }
        }
    }

    /**
     * Splits the input into sorted run files {@code 0_0.txt}, {@code 0_1.txt}, ...
     * of at most {@link #SPLIT_CHUNK_SIZE} values each.
     *
     * @param src     file to split
     * @param workDir directory that receives the run files
     * @throws UncheckedIOException if a file cannot be opened, read or written
     */
    static void splitBigFile(String src, String workDir) {
        printFreeMemory();
        System.out.println("====开始分割文件====");
        System.err.println("====原始文件:" + new File(src).length() / 1024 / 1024 + "====");
        int loopTime = 0;
        String fileHerader = "0_";
        int[] temp = new int[Math.max(1, SPLIT_CHUNK_SIZE)];
        int count = 0;
        try (InputBuffer ib1 = new InputBuffer(BLOCK_SIZE, src)) {
            Integer read;
            while ((read = ib1.read()) != null) {
                temp[count++] = read;
                if (count == temp.length) {
                    writeSortedChunk(temp, count, new File(workDir, fileHerader + loopTime + ".txt"));
                    ++loopTime;
                    count = 0;
                }
            }
        }
        if (count != 0) {
            // Remaining values that did not fill a whole chunk.
            writeSortedChunk(temp, count, new File(workDir, fileHerader + loopTime + ".txt"));
        }
        System.out.println("====分割文件结束====");
        printFreeMemory();
    }

    /** Sorts the first {@code count} values and writes them to {@code target} as one run file. */
    private static void writeSortedChunk(int[] values, int count, File target) {
        sort(values, 0, count - 1);
        try (OutputBuffer ob = new OutputBuffer(BLOCK_SIZE, target.getAbsolutePath())) {
            for (int k = 0; k < count; k++) {
                ob.write(values[k]);
            }
        }
        System.out.println("====分割文件:" + target.getName() + " ====");
        printFreeMemory();
    }

    /**
     * Sorts the inclusive range {@code a[low..hight]} into ascending order.
     *
     * <p>Delegates to {@link Arrays#sort(int[], int, int)}. The previous
     * recursive quicksort used the first element as pivot and recursed once per
     * element on sorted or all-equal input, overflowing the stack on large runs.
     *
     * @param a     array to sort in place
     * @param low   index of the first element of the range
     * @param hight index of the last element of the range; nothing happens when
     *              it is not greater than {@code low}
     */
    public static void sort(int a[], int low, int hight) {
        if (low >= hight) {
            return;
        }
        Arrays.sort(a, low, hight + 1);
    }

    /** Prints the JVM's currently free heap memory in megabytes. */
    static void printFreeMemory() {
        System.out.println(" free memory:" + Runtime.getRuntime().freeMemory() / 1024 / 1024 + " mb");
    }

    /**
     * Counts the values in a file and prints the total.
     *
     * @param filePath     file to read
     * @param showDetailed when {@code true}, every value is printed as well
     * @throws UncheckedIOException if the file cannot be opened or read
     */
    static void printFile(String filePath, boolean showDetailed) {
        int i = 0;
        try (InputBuffer ib = new InputBuffer(BLOCK_SIZE * 3, filePath)) {
            Integer read;
            while ((read = ib.read()) != null) {
                if (showDetailed) {
                    System.out.println(read);
                }
                i++;
            }
        }
        System.out.println("共:" + i + "行");
    }
}