java对大文件的分割、排序、合并(多路归并+败者树)

如何在内存不够的情况下,对一个超大文件进行排序

我们这里使用三步骤:分割(首先将大文件分割成内存够的小文件)、排序(对每个小文件进行排序,我这里是写入的时候就排好序了)、合并(利用多路归并+败者树实现)

1.分割+排序

/*
 * 将大文件分割成小文件
 * 排序好之后写入小文件
 */
public class Test2Main {
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        StringBuilder builder = new StringBuilder(1024 * 1024 * 8);
        int a = 0;
        //这里每个文件存多少数据根据自己实际情况而定,我这里的数据是1024*1024*128,我一共分为了16个小文件.
        int[] arr = new int[1024 * 1024 * 8];
        try {
            BufferedReader br = new BufferedReader
                    (new FileReader("syudy\\src\\L2_end\\file2\\jm.txt"));
            String len;
            int i = 0;
            while ((len=br.readLine())!=null){
            //每次只存你定好的数据量进数组
                arr[a++] = Integer.parseInt(len);
                //由于我把文件分为十六个,所以if语句中只进去十六次,刚好由i来生成十六个小文件
                if (a == 1024 * 1024 * 8) {
                //调用快速排序的方法对每个小文件排序
                    int[] k = KuaiSu.kuaisu(arr, 0, arr.length - 1);
                    i++;
                    for (int i1 = 0; i1 < k.length; i1++) {
                        builder.append(k[i1]);
                        builder.append("\n");
                    }
                    BufferedWriter bw = new BufferedWriter
                            (new FileWriter("syudy\\src\\L2_end\\file2\\" + i + ".txt"));
                    bw.write(builder.toString());
                    builder.setLength(0);
                    a = 0;
                    bw.close();
                }
            }
            br.close();
        }catch(Exception e){
            e.printStackTrace();
        }

快速排序

/*
*
* 快速排序
* */

public class KuaiSu {
    public static int[] kuaisu(int[] arr,int start,int end){
        int tou  = arr[start];
        int i = start;
        int j = end;
        while (i<j){
            if (i<j && arr[i]<tou){
                i++;
            }
            if (i<j && arr[j]>tou){
                j--;
            }
            if (i<j && arr[i] == arr[j]){
                i++;
            }else {
                int temp = arr[i];
                arr[i] = arr[j];
                arr[j] = temp;
            }
        }
        if (i-1>start){
            arr = kuaisu(arr,start,i-1);
        }
        if (j+1<end){
            arr = kuaisu(arr,j+1,end);
        }
        return arr;
    }
}

2.多路归并+败者树

1.败者树的实现

/*
* 败者树的实现
* */

/**
 * Loser tree (tournament tree) over a list of comparable leaves, used to pick
 * the smallest of k candidates during a k-way merge.
 *
 * <p>Internal node {@code tree[t]} (for {@code t >= 1}) stores the index of the
 * leaf that lost the match played at that node; {@code tree[0]} stores the
 * index of the overall winner, i.e. the smallest leaf. The value {@code -1}
 * marks an internal node that has not been played yet while the tree is being
 * built.
 *
 * <p>The tree works directly on the list passed to the constructor: that list
 * is modified by {@link #add} and {@link #del}.
 *
 * @param <T> leaf type, ordered by its natural ordering
 */
public class FailedTree<T extends Comparable<? super T>> {
    /** Marker for an internal node that holds no loser yet. */
    private static final int EMPTY = -1;

    private int[] tree;
    private int size;
    private final ArrayList<T> leaves;

    /**
     * Builds a loser tree over the given leaves.
     *
     * @param initValues initial leaves; the list is kept by reference, not copied
     */
    public FailedTree(ArrayList<T> initValues) {
        this.leaves = initValues;
        rebuild();
    }

    /** Rebuilds every internal node from the current contents of the leaf list. */
    private void rebuild() {
        size = leaves.size();
        tree = new int[size];
        for (int i = 0; i < size; i++) {
            tree[i] = EMPTY;
        }
        for (int i = size - 1; i >= 0; i--) {
            adjust(i);
        }
    }

    /**
     * Replays the matches on the path from leaf {@code s} up to the root.
     * At every node the larger contender stays behind as the loser and the
     * smaller one moves up; the final survivor becomes the winner.
     */
    private void adjust(int s) {
        int t = (s + size) / 2;
        while (t > 0) {
            // s < 0 means the EMPTY marker is travelling upwards; it always
            // "wins" so that unplayed nodes stay unplayed until a real leaf
            // arrives during construction.
            if (s >= 0 && (tree[t] == EMPTY
                    || leaves.get(s).compareTo(leaves.get(tree[t])) > 0)) {
                int loser = s;
                s = tree[t];
                tree[t] = loser;
            }
            t /= 2;
        }
        tree[0] = s;
    }

    /**
     * Replaces the leaf at index {@code s} and re-plays its matches.
     *
     * <p>NOTE: the parameter is a {@code String} regardless of {@code T}. The
     * cast below is unchecked, so this method is only safe when the tree
     * actually holds {@code String} leaves; with any other leaf type a later
     * comparison will fail with {@code ClassCastException}.
     *
     * @param leaf new value for the leaf
     * @param s    index of the leaf to replace
     */
    @SuppressWarnings("unchecked")
    public void add(String leaf, int s) {
        leaves.set(s, (T) (Object) leaf);
        adjust(s);
    }

    /**
     * Removes the leaf at index {@code s} and rebuilds the tree. Leaves after
     * {@code s} shift down by one index.
     *
     * @param s index of the leaf to remove
     */
    public void del(int s) {
        leaves.remove(s);
        rebuild();
    }

    /**
     * Returns the leaf stored at index {@code s}.
     *
     * @param s leaf index
     * @return the leaf value
     */
    public T getLeaf(int s) {
        return leaves.get(s);
    }

    /**
     * Returns the index of the current winner (smallest leaf).
     *
     * @return the winner's leaf index, or {@code null} when the tree is empty
     */
    public Integer getWinner() {
        if (tree.length == 0) {
            return null;
        }
        return tree[0];
    }
}

2.将多个有序的小文件合并成 一个有序的大文件

/*
* 多路归并
*
* */
/**
 * K-way merge of sorted chunk files into one sorted output file, using a
 * {@code FailedTree} (loser tree) to pick the next smallest value.
 */
public class GuiBing {
    /**
     * Width of the sortable key: the largest shifted value, 2^32 - 1, has
     * ten decimal digits.
     */
    private static final int KEY_WIDTH = 10;

    /**
     * Merges the given files, each holding one integer per line in ascending
     * order, into {@code he.txt}.
     *
     * <p>The loser tree orders its leaves as Strings, and plain decimal
     * Strings sort lexicographically ("10" before "9"). Each value is
     * therefore stored in the tree as a fixed-width, zero-padded key whose
     * lexicographic order equals the numeric order, and is converted back to
     * a plain number when written.
     *
     * <p>Blank lines are skipped and empty input files are ignored. Nothing is
     * written when the list is {@code null} or holds fewer than two files.
     * I/O errors are reported on stderr and end the merge.
     *
     * @param list sorted input files
     * @throws NumberFormatException if a non-blank line is not a valid int
     */
    public static void merge(ArrayList<File> list) {
        if (list == null || list.size() <= 1) {
            return;
        }
        // Readers that are still open; index i always matches leaf i of the tree.
        ArrayList<BufferedReader> inputList = new ArrayList<>(list.size());
        try (BufferedWriter out = new BufferedWriter(
                new FileWriter("syudy\\src\\L2_end\\file2\\he.txt"))) {
            ArrayList<String> leaves = new ArrayList<>(list.size());
            for (File file : list) {
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(new FileInputStream(file)));
                // Register first so the finally block closes it if reading fails.
                inputList.add(reader);
                String key = nextKey(reader);
                if (key == null) {
                    // Empty file: it contributes no leaf.
                    inputList.remove(inputList.size() - 1);
                    reader.close();
                } else {
                    leaves.add(key);
                }
            }

            FailedTree<String> failedTree = new FailedTree<>(leaves);
            Integer s = failedTree.getWinner();
            while (s != null) {
                int winner = s;
                out.write(decode(failedTree.getLeaf(winner)));
                out.newLine();

                BufferedReader reader = inputList.get(winner);
                String key = nextKey(reader);
                if (key == null) {
                    // Source exhausted: drop reader and leaf together so the
                    // indices of the two lists stay aligned.
                    inputList.remove(winner);
                    reader.close();
                    failedTree.del(winner);
                } else {
                    failedTree.add(key, winner);
                }
                s = failedTree.getWinner();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            for (BufferedReader reader : inputList) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Reads the next non-blank line and returns its sortable key, or
     * {@code null} at end of file.
     */
    private static String nextKey(BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            String trimmed = line.trim();
            if (!trimmed.isEmpty()) {
                return encode(Integer.parseInt(trimmed));
            }
        }
        return null;
    }

    /**
     * Maps an int to a ten-digit, zero-padded decimal String. The value is
     * shifted into the range 0..2^32-1 first, so negative numbers sort before
     * positive ones.
     */
    private static String encode(int value) {
        String digits = Long.toString((long) value - Integer.MIN_VALUE);
        StringBuilder key = new StringBuilder(KEY_WIDTH);
        for (int pad = KEY_WIDTH - digits.length(); pad > 0; pad--) {
            key.append('0');
        }
        return key.append(digits).toString();
    }

    /** Inverse of {@link #encode}: turns a key back into the plain decimal number. */
    private static String decode(String key) {
        return Long.toString(Long.parseLong(key) + Integer.MIN_VALUE);
    }
}

上面多路归并中好像有点问题,
这个代码可以直接跑起来
这样就解决了内存不足时对大文件进行排序的问题,在大数据场景中应该会遇到这样的问题
在《编程珠玑》这本书中有这个问题,虽然我没有看过,可以去看一看

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值