如何在内存不足的情况下对一个超大文件进行排序
这里分三个步骤:分割(先把大文件分割成若干个内存能够容纳的小文件)、排序(对每个小文件分别排序,我这里是先在内存中排好序再写入小文件)、合并(利用多路归并+败者树把有序的小文件合并成一个有序的大文件)
1.分割+排序
/*
* 将大文件分割成小文件
* 排序好之后写入小文件
*
* */
public class Test2Main {
public static void main(String[] args) {
long start = System.currentTimeMillis();
StringBuilder builder = new StringBuilder(1024 * 1024 * 8);
int a = 0;
//这里每个文件存多少数据根据自己实际情况而定,我这里的数据是1024*1024*128,我一共分为了16个小文件.
int[] arr = new int[1024 * 1024 * 8];
try {
BufferedReader br = new BufferedReader
(new FileReader("syudy\\src\\L2_end\\file2\\jm.txt"));
String len;
int i = 0;
while ((len=br.readLine())!=null){
//每次只存你定好的数据量进数组
arr[a++] = Integer.parseInt(len);
//由于我把文件分为十六个,所以if语句中只进去十六次,刚好由i来生成十六个小文件
if (a == 1024 * 1024 * 8) {
//调用快速排序的方法对每个小文件排序
int[] k = KuaiSu.kuaisu(arr, 0, arr.length - 1);
i++;
for (int i1 = 0; i1 < k.length; i1++) {
builder.append(k[i1]);
builder.append("\n");
}
BufferedWriter bw = new BufferedWriter
(new FileWriter("syudy\\src\\L2_end\\file2\\" + i + ".txt"));
bw.write(builder.toString());
builder.setLength(0);
a = 0;
bw.close();
}
}
br.close();
}catch(Exception e){
e.printStackTrace();
}
快速排序
/*
*
* 快速排序
* */
public class KuaiSu {
public static int[] kuaisu(int[] arr,int start,int end){
int tou = arr[start];
int i = start;
int j = end;
while (i<j){
if (i<j && arr[i]<tou){
i++;
}
if (i<j && arr[j]>tou){
j--;
}
if (i<j && arr[i] == arr[j]){
i++;
}else {
int temp = arr[i];
arr[i] = arr[j];
arr[j] = temp;
}
}
if (i-1>start){
arr = kuaisu(arr,start,i-1);
}
if (j+1<end){
arr = kuaisu(arr,j+1,end);
}
return arr;
}
}
2.多路归并+败者树
1.败者树的实现
/**
 * Loser tree (tournament tree) over a list of leaves, used to repeatedly pick the
 * smallest leaf during a k-way merge.
 *
 * <p>Leaf {@code s} hangs below internal node {@code (s + size) / 2}. Each internal
 * node {@code tree[1..size-1]} stores the index of the leaf that lost the match
 * played at that node, and {@code tree[0]} stores the index of the overall winner,
 * i.e. the smallest leaf according to {@code compareTo}.
 *
 * @param <T> leaf type; leaves are ordered by their natural ordering
 */
public class FailedTree<T extends Comparable<? super T>> {
    /** Marks an internal node that has not received a contestant yet. */
    private static final int EMPTY = -1;

    /** {@code tree[0]} is the winner; {@code tree[t]}, t &gt; 0, is the loser at node t. */
    private int[] tree;
    private int size;
    private final ArrayList<T> leaves;

    /**
     * Builds the tree over a copy of {@code initValues}, so that later calls to
     * {@link #del(int)} do not modify the caller's list.
     *
     * @param initValues initial leaves; may be empty but must not contain {@code null}
     * @throws NullPointerException if {@code initValues} or any element is {@code null}
     */
    public FailedTree(ArrayList<T> initValues) {
        this.leaves = new ArrayList<>(initValues);
        for (T leaf : this.leaves) {
            if (leaf == null) {
                throw new NullPointerException("leaves must not contain null");
            }
        }
        rebuild();
    }

    /** Recomputes the whole tree from the current leaves. */
    private void rebuild() {
        size = leaves.size();
        tree = new int[size];
        for (int i = 0; i < size; i++) {
            tree[i] = EMPTY;
        }
        for (int i = size - 1; i >= 0; i--) {
            adjust(i);
        }
    }

    /**
     * Replays the matches on the path from leaf {@code s} up to the root. At every
     * node the larger contestant stays behind as the loser and the smaller one
     * moves up; whoever reaches the top is recorded as the winner.
     */
    private void adjust(int s) {
        int t = (s + size) / 2;
        while (t > 0) {
            // While the tree is being built, the first contestant to reach an
            // EMPTY node is parked there, and EMPTY keeps travelling upwards.
            if (s != EMPTY
                    && (tree[t] == EMPTY
                        || leaves.get(s).compareTo(leaves.get(tree[t])) > 0)) {
                int loser = s;
                s = tree[t];
                tree[t] = loser;
            }
            t /= 2;
        }
        tree[0] = s;
    }

    /**
     * Replaces leaf {@code s} with a string value. Kept for existing callers.
     *
     * <p>The value is force-cast to {@code T}, so this is only type-safe when the
     * leaves really are strings; prefer {@link #replaceLeaf(Comparable, int)}.
     *
     * @param leaf the new value for the leaf
     * @param s    index of the leaf to replace
     */
    @SuppressWarnings("unchecked") // unchecked by design: legacy String-only entry point
    public void add(String leaf, int s) {
        replaceLeaf((T) leaf, s);
    }

    /**
     * Replaces leaf {@code s} with {@code leaf} and restores the tree.
     *
     * <p>When {@code s} is the current winner (the normal case in a merge) only the
     * path from that leaf to the root is replayed. Replaying a single path is not
     * valid for any other leaf, so in that case the tree is rebuilt from scratch.
     *
     * @param leaf the new value; must not be {@code null}
     * @param s    index of the leaf to replace
     * @throws NullPointerException      if {@code leaf} is {@code null}
     * @throws IndexOutOfBoundsException if {@code s} is not a valid leaf index
     */
    public void replaceLeaf(T leaf, int s) {
        if (leaf == null) {
            throw new NullPointerException("leaf must not be null");
        }
        leaves.set(s, leaf);
        if (s == tree[0]) {
            adjust(s);
        } else {
            rebuild();
        }
    }

    /**
     * Removes leaf {@code s}. Leaves after it shift down by one index and the tree
     * is rebuilt for the remaining leaves.
     *
     * @param s index of the leaf to remove
     * @throws IndexOutOfBoundsException if {@code s} is not a valid leaf index
     */
    public void del(int s) {
        leaves.remove(s);
        rebuild();
    }

    /**
     * Returns the value currently stored in leaf {@code s}.
     *
     * @param s index of the leaf
     * @return the leaf value
     */
    public T getLeaf(int s) {
        return leaves.get(s);
    }

    /**
     * Returns the index of the smallest leaf.
     *
     * @return the winning leaf index, or {@code null} when the tree has no leaves
     */
    public Integer getWinner() {
        return size > 0 ? Integer.valueOf(tree[0]) : null;
    }
}
2.将多个有序的小文件合并成一个有序的大文件
/**
 * Step 2 of the external sort: k-way merge of the sorted chunk files into one
 * sorted output file, using a loser tree to pick the next smallest value.
 */
public class GuiBing {
    /** The merged, fully sorted output file. */
    private static final String OUTPUT_FILE = "syudy\\src\\L2_end\\file2\\he.txt";

    /** Number of decimal digits needed for the largest shifted key, 2^32 - 1. */
    private static final int KEY_WIDTH = 10;

    /**
     * Merges the given sorted files (one integer per line, ascending) into
     * {@link #OUTPUT_FILE}.
     *
     * <p>A {@code null} list, an empty list, or a list with a single file is
     * nothing to merge: the method returns without creating the output file.
     * Within each file, an empty line is treated like the end of that file.
     * I/O errors are reported on standard error; every reader that was opened
     * and the output writer are closed in all cases.
     *
     * @param list the sorted input files
     * @throws NumberFormatException if a line is not a valid {@code int}
     */
    public static void merge(ArrayList<File> list) {
        if (list == null || list.size() <= 1) {
            return;
        }
        int fileSize = list.size();
        // Every reader ever opened, so that all of them get closed in finally.
        ArrayList<BufferedReader> opened = new ArrayList<>(fileSize);
        try (BufferedWriter out = new BufferedWriter(new FileWriter(OUTPUT_FILE))) {
            // inputList.get(i) always feeds leaf i of the tree.
            ArrayList<BufferedReader> inputList = new ArrayList<>(fileSize);
            ArrayList<String> leaves = new ArrayList<>(fileSize);
            for (File file : list) {
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(new FileInputStream(file)));
                opened.add(reader);
                String first = reader.readLine();
                if (isExhausted(first)) {
                    // An empty file contributes no leaf at all.
                    continue;
                }
                inputList.add(reader);
                leaves.add(toSortableKey(first));
            }

            FailedTree<String> failedTree = new FailedTree<>(leaves);
            Integer winner = failedTree.getWinner();
            while (winner != null) {
                int s = winner;
                out.write(Integer.toString(fromSortableKey(failedTree.getLeaf(s))));
                out.newLine();

                // Refill the winning leaf from the file it came from.
                String next = inputList.get(s).readLine();
                if (isExhausted(next)) {
                    // Remove reader and leaf together so their indices stay aligned.
                    inputList.remove(s);
                    failedTree.del(s);
                } else {
                    failedTree.add(toSortableKey(next), s);
                }
                winner = failedTree.getWinner();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            for (BufferedReader reader : opened) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Returns whether {@code line} marks the end of an input file. */
    private static boolean isExhausted(String line) {
        return line == null || line.equals("");
    }

    /**
     * Encodes the integer on {@code line} as a fixed-width decimal string whose
     * lexicographic order equals the numeric order of the integers.
     *
     * <p>The tree only accepts string leaves for replacement and compares them
     * with {@code String.compareTo}; comparing the raw text would sort "10"
     * before "9" and mishandle negative numbers. Shifting by
     * {@code Integer.MIN_VALUE} makes every key non-negative, and zero-padding to
     * {@link #KEY_WIDTH} digits makes string order match numeric order.
     */
    private static String toSortableKey(String line) {
        long shifted = (long) Integer.parseInt(line.trim()) - Integer.MIN_VALUE;
        String digits = Long.toString(shifted);
        StringBuilder key = new StringBuilder(KEY_WIDTH);
        for (int pad = KEY_WIDTH - digits.length(); pad > 0; pad--) {
            key.append('0');
        }
        return key.append(digits).toString();
    }

    /** Inverse of {@link #toSortableKey(String)}. */
    private static int fromSortableKey(String key) {
        return (int) (Long.parseLong(key) + Integer.MIN_VALUE);
    }
}
上面的多路归并实现可能还存在一些问题,
不过这份代码可以直接运行。
这样就解决了内存不足时对大文件排序的问题,在大数据场景中应该经常会遇到这类问题。
《编程珠玑》这本书中也讨论过这个问题,虽然我还没有读过,但推荐去看一看。