排序 Java 大文件排序

话不多说,直接上代码

排序Java代码实现

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;

/**
 * External merge sort demo for a large text file that holds one {@code long} value per line.
 *
 * <p>The pipeline driven by {@link #main(String[])} is: generate random data, split it into
 * chunk files of at most {@code MAX_ROW} lines, sort every chunk in memory, then k-way merge
 * the sorted chunks into a single output file.
 *
 * @author Adrian
 * @date 2020/12/8 16:02
 */
public class SelfSort {
    /**
     * Number of rows written by {@link #randomData(String)} (one million).
     */
    private static final long GENERATE_ROW = 1000000L;

    /**
     * Maximum number of rows stored in each chunk file produced by the split step.
     */
    private static final int MAX_ROW = 10000;

    /**
     * Working directory for the generated, chunk and result files.
     */
    private static final String BASE_FOLDER = "d:/test/";

    /**
     * Line terminator used in every file this class writes.
     */
    private static final String LINE_END = "\r\n";

    /**
     * Runs the whole demo: clean the working folder, generate data, split, sort the chunks,
     * merge them into {@code sort.txt} and finally remove the chunk files.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        deleteFolderFiles(BASE_FOLDER);
        String bigTxt = BASE_FOLDER + "big.txt";
        String splitFolder = BASE_FOLDER + "/folder/";
        String outSortFile = BASE_FOLDER + "sort.txt";

        List<File> files;
        try {
            randomData(bigTxt);
            files = splitDataToSaveFile(bigTxt, splitFolder);
            sortSmallFiles(files);
        } catch (IOException e) {
            e.printStackTrace();
            // Without a complete set of sorted chunks a merge would produce a wrong result,
            // so clean up and stop instead of continuing with missing data.
            deleteFolderFiles(splitFolder);
            return;
        }

        List<SmallFileInfo> fileInfos = getSmallFileBufferedReader(files);
        try {
            mergeSort(outSortFile, fileInfos);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeReaders(fileInfos);
        }
        deleteFolderFiles(splitFolder);
    }

    /**
     * Closes the reader of every entry, reporting (but not propagating) close failures.
     */
    private static void closeReaders(List<SmallFileInfo> fileInfos) {
        for (SmallFileInfo info : fileInfos) {
            BufferedReader reader = info.getReader();
            if (reader == null) {
                continue;
            }
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Deletes the direct children of a folder, creating the folder first if it is missing.
     *
     * <p>This is best-effort and not recursive: a child that cannot be deleted (for example a
     * non-empty sub-directory) is reported on {@code System.err} and left in place.
     *
     * @param folderPath path of the folder to empty
     */
    public static void deleteFolderFiles(String folderPath) {
        File folder = new File(folderPath);
        if (!folder.isDirectory() && !folder.mkdirs()) {
            System.err.println("Directory '" + folder + "' could not be created");
            return;
        }
        File[] children = folder.listFiles();
        if (children == null) {
            return;
        }
        for (File child : children) {
            if (!child.delete()) {
                System.err.println("File '" + child + "' could not be deleted");
            }
        }
    }

    /**
     * Appends {@code GENERATE_ROW} random non-negative long values, one per line, to a file.
     *
     * @param randFilePath path of the file to write; missing parent directories are created
     * @throws IOException if the path is a directory, is not writable, its parent directory
     *                     cannot be created, or writing fails
     */
    public static void randomData(String randFilePath) throws IOException {
        System.out.println("=========开始生成数据==========");
        long startTime = System.currentTimeMillis();

        // Validate and prepare the target BEFORE opening the stream; opening first would fail
        // with a less helpful error when the parent directory does not exist yet.
        File file = new File(randFilePath);
        if (file.exists()) {
            if (file.isDirectory()) {
                throw new IOException("File '" + file + "' exists but is a directory");
            }
            if (!file.canWrite()) {
                throw new IOException("File '" + file + "' cannot be written to");
            }
        } else {
            ensureParentDirectory(file);
        }

        try (BufferedWriter writer = openWriter(file, true)) {
            Random random = new Random();
            for (long i = 0; i < GENERATE_ROW; i++) {
                // Clearing the sign bit always yields a non-negative value; negating a negative
                // number does not, because -Long.MIN_VALUE overflows back to Long.MIN_VALUE.
                long value = random.nextLong() & Long.MAX_VALUE;
                writer.write(Long.toString(value));
                writer.write(LINE_END);
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("生成结束,耗时:" + (endTime - startTime) / 1000 + "秒");
    }

    /**
     * Splits a text file into chunk files of at most {@code MAX_ROW} lines each.
     *
     * <p>Chunk files are named {@code <n><sourceFileName>} with {@code n} starting at 1 and are
     * overwritten if they already exist. Every line is trimmed before it is written.
     *
     * @param sourceFilePath      file to split
     * @param targetDirectoryPath directory receiving the chunk files; created if missing
     * @return the chunk files in the order they were written; empty if the source has no lines
     * @throws IOException if the source is missing, is a directory or unreadable, the target
     *                     is not a usable directory, or reading/writing fails
     */
    public static List<File> splitDataToSaveFile(String sourceFilePath, String targetDirectoryPath) throws IOException {
        long startTime = System.currentTimeMillis();
        System.out.println("=========开始分割文件==========");
        File sourceFile = new File(sourceFilePath);
        File targetDirectory = new File(targetDirectoryPath);
        if (sourceFile.exists()) {
            if (sourceFile.isDirectory()) {
                throw new IOException("File '" + sourceFile + "' exists but is a directory");
            }
            // The source is only read here, so readability is the relevant permission.
            if (!sourceFile.canRead()) {
                throw new IOException("File '" + sourceFile + "' cannot be read");
            }
        }
        if (targetDirectory.exists()) {
            if (!targetDirectory.isDirectory()) {
                throw new IOException("File '" + targetDirectory + "' exists but is not a directory");
            }
        } else if (!targetDirectory.mkdirs()) {
            throw new IOException("Directory '" + targetDirectory + "' could not be created");
        }

        List<File> fileList = new ArrayList<>();
        // A missing source surfaces here as FileNotFoundException and is propagated to the
        // caller rather than being hidden behind an empty result.
        try (BufferedReader reader = openReader(sourceFile)) {
            StringBuilder chunk = new StringBuilder();
            int rowsInChunk = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                chunk.append(line.trim()).append(LINE_END);
                rowsInChunk++;
                if (rowsInChunk == MAX_ROW) {
                    fileList.add(writeChunk(chunk, targetDirectory, fileList.size() + 1, sourceFile.getName()));
                    rowsInChunk = 0;
                }
            }
            // Flush the trailing partial chunk, if any.
            if (rowsInChunk > 0) {
                fileList.add(writeChunk(chunk, targetDirectory, fileList.size() + 1, sourceFile.getName()));
            }
        }

        long endTime = System.currentTimeMillis();
        System.out.println("分割文件结束,耗时:" + (endTime - startTime) / 1000 + "秒");
        return fileList;
    }

    /**
     * Writes the buffered chunk to its numbered file and clears the buffer for reuse.
     */
    private static File writeChunk(StringBuilder chunk, File directory, int chunkNo, String sourceName) throws IOException {
        File chunkFile = new File(directory, chunkNo + sourceName);
        writeFile(chunk.toString(), chunkFile, false);
        chunk.setLength(0);
        return chunkFile;
    }

    /**
     * Sorts the contents of every chunk file in ascending numeric order, in place.
     *
     * <p>Each file is loaded fully into memory, so files must be small enough to fit. Blank
     * lines are ignored. A file is only rewritten after it has been read completely, so a read
     * failure never truncates it.
     *
     * @param files chunk files to sort; {@code null} is treated as empty
     * @throws IOException           if a file cannot be read or rewritten
     * @throws NumberFormatException if a non-blank line is not a valid long
     */
    public static void sortSmallFiles(List<File> files) throws IOException {
        long startTime = System.currentTimeMillis();
        System.out.println("=========开始对所有小文件内部排序==========");
        if (files != null) {
            for (File file : files) {
                List<Long> values = new ArrayList<>();
                try (BufferedReader reader = openReader(file)) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        String trimmed = line.trim();
                        if (!trimmed.isEmpty()) {
                            values.add(Long.parseLong(trimmed));
                        }
                    }
                }
                Collections.sort(values);
                String sorted = values.stream().map(String::valueOf).collect(Collectors.joining(LINE_END));
                writeFile(sorted, file, false);
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("所有小文件内部排序结束,耗时:" + (endTime - startTime) / 1000 + "秒");
    }

    /**
     * Merges already-sorted chunk files into one ascending output file (k-way merge).
     *
     * <p>A min-heap holds the current head value of every input. The smallest is written out
     * and replaced by the next value from the same input until all inputs are exhausted, so
     * the number of rows written equals the number of rows actually available. Output is
     * appended to {@code outSortFile} if that file already exists.
     *
     * @param outSortFile path of the result file; missing parent directories are created
     * @param fileInfos   one entry per sorted chunk, each with an open reader; {@code null} is
     *                    treated as empty
     * @throws IOException           if the output path is a directory, cannot be created, or
     *                               writing fails
     * @throws NumberFormatException if an input line is not a valid long
     */
    public static void mergeSort(String outSortFile, List<SmallFileInfo> fileInfos) throws IOException {
        long startTime = System.currentTimeMillis();
        System.out.println("=========开始对所有小文件合并并排序==========");
        File outFile = new File(outSortFile);
        if (outFile.isDirectory()) {
            throw new IOException("File '" + outFile + "' exists but is a directory");
        }
        ensureParentDirectory(outFile);

        PriorityQueue<MergeEntry> heads = new PriorityQueue<>();
        if (fileInfos != null) {
            for (SmallFileInfo fileInfo : fileInfos) {
                offerNext(heads, fileInfo);
            }
        }

        // One writer for the whole merge; reopening the file for every row is needlessly slow.
        try (BufferedWriter writer = openWriter(outFile, true)) {
            MergeEntry smallest;
            while ((smallest = heads.poll()) != null) {
                writer.write(Long.toString(smallest.value));
                writer.write(LINE_END);
                offerNext(heads, smallest.source);
            }
        }

        long endTime = System.currentTimeMillis();
        System.out.println("所有小文件合并排序结束,耗时:" + (endTime - startTime) / 1000 + "秒");
    }

    /**
     * Reads the next non-blank value from {@code source} and pushes it onto the heap; does
     * nothing when {@code readNextValue()} returns {@code null} (no more lines).
     */
    private static void offerNext(PriorityQueue<MergeEntry> heads, SmallFileInfo source) {
        String raw;
        while ((raw = source.readNextValue()) != null) {
            // The value is captured below, so mark it consumed right away; otherwise the next
            // readNextValue() call would hand back the same line again.
            source.setValueIsUse(true);
            String trimmed = raw.trim();
            if (!trimmed.isEmpty()) {
                heads.offer(new MergeEntry(Long.parseLong(trimmed), source));
                return;
            }
        }
    }

    /**
     * Heap element of the merge: a value plus the input it was read from. Ordered by value
     * only, so the natural ordering is deliberately not consistent with {@code equals}.
     */
    private static final class MergeEntry implements Comparable<MergeEntry> {
        private final long value;
        private final SmallFileInfo source;

        private MergeEntry(long value, SmallFileInfo source) {
            this.value = value;
            this.source = source;
        }

        @Override
        public int compareTo(MergeEntry other) {
            return Long.compare(this.value, other.value);
        }
    }

    /**
     * Opens a reader on every existing file and wraps it in a {@link SmallFileInfo} that is
     * ready for {@link #mergeSort(String, List)}. The caller is responsible for closing the
     * readers.
     *
     * <p>Files that do not exist or cannot be opened are skipped.
     *
     * @param files files to open; {@code null} is treated as empty
     * @return one descriptor per successfully opened file, in input order
     */
    public static List<SmallFileInfo> getSmallFileBufferedReader(List<File> files) {
        List<SmallFileInfo> fileInfos = new ArrayList<>();
        if (files == null) {
            return fileInfos;
        }
        for (File file : files) {
            if (!file.exists()) {
                continue;
            }
            BufferedReader reader;
            try {
                reader = openReader(file);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
                continue;
            }
            SmallFileInfo info = new SmallFileInfo();
            info.setLineNum(0L);
            info.setValue(null);
            // "Used" means there is no pending value, so the first read advances the reader.
            info.setValueIsUse(true);
            info.setReader(reader);
            info.setFileAbsolutePath(file.getAbsolutePath());
            fileInfos.add(info);
        }
        return fileInfos;
    }

    /**
     * Writes text to a file in one go.
     *
     * @param text     content to write
     * @param file     target file
     * @param isAppend {@code true} to append, {@code false} to overwrite
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeFile(String text, File file, boolean isAppend) throws IOException {
        try (BufferedWriter writer = openWriter(file, isAppend)) {
            writer.write(text);
        }
    }

    /**
     * Creates the parent directory of {@code file} when it does not exist yet.
     */
    private static void ensureParentDirectory(File file) throws IOException {
        File parent = file.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            throw new IOException("File '" + file + "' could not be created");
        }
    }

    /**
     * Opens a buffered UTF-8 reader on {@code file}.
     */
    private static BufferedReader openReader(File file) throws FileNotFoundException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
    }

    /**
     * Opens a buffered UTF-8 writer on {@code file}, appending when {@code append} is true.
     */
    private static BufferedWriter openWriter(File file, boolean append) throws FileNotFoundException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, append), StandardCharsets.UTF_8));
    }
}

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Objects;

/**
 * 文件信息
 *
 * @author Adrian
 * @date 2020/12/9 10:12
 */
public class SmallFileInfo {

    /**
     * 改行value是否使用
     */
    private Boolean valueIsUse;

    /**
     * 当前行号
     */
    private Long lineNum;

    /**
     * 当前一行值
     */
    private String value;

    /**
     * 文件绝对路径
     */
    private String fileAbsolutePath;

    /**
     * BufferedReader引用
     */
    private BufferedReader reader;

    public String readNextValue(){
        //判断上一轮读取的值是否使用
        if (this.valueIsUse) {
            try {
                String line = null;
                if ((line = this.reader.readLine()) != null) {
                    this.value = line;
//                    System.out.println("当前读取行数为:" + this.lineNum + ",value为:" + this.value);
                    this.lineNum++;
                    this.valueIsUse = false;
                    return this.value;
                }
                //读完了
                else {
                    return null;
                }
            } catch (IOException e) {
                e.printStackTrace();
                return null;
            }
        }
        //否则返回上一轮读取的值
        else {
            return this.value;
        }
    }

    public Boolean getValueIsUse() {
        return valueIsUse;
    }

    public void setValueIsUse(Boolean valueIsUse) {
        this.valueIsUse = valueIsUse;
    }

    public Long getLineNum() {
        return lineNum;
    }

    public void setLineNum(Long lineNum) {
        this.lineNum = lineNum;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }

    public String getFileAbsolutePath() {
        return fileAbsolutePath;
    }

    public void setFileAbsolutePath(String fileAbsolutePath) {
        this.fileAbsolutePath = fileAbsolutePath;
    }

    public BufferedReader getReader() {
        return reader;
    }

    public void setReader(BufferedReader reader) {
        this.reader = reader;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || this.getClass() != obj.getClass()) {
            return false;
        }
        SmallFileInfo fileInfo = (SmallFileInfo) obj;
        return this.fileAbsolutePath.equals(fileInfo.getFileAbsolutePath());
    }

    @Override
    public int hashCode() {
        return Objects.hash(this.fileAbsolutePath);
    }
}

再来说下排序原理

整体思路是外部归并排序:先把大文件按固定行数拆分成多个小文件,再把每个小文件读入内存各自排好序,最后同时打开所有小文件,每次取各文件当前行中的最小值写入结果文件(多路归并),直到所有小文件都读完。

如图:(图片来源于网络,如果侵权,请联系我删除)
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

其他说明

  1. 该方法是线程不安全的
  2. 当前按 Long 数值排序;若数据是有规律的复杂字符串,请自行修改解析和比较方式
  3. 代码为按照解决思路自行编写,自行测试OK,若有BUG请联系我
  4. 代码还可以继续精简,若有更好的实现方式,请提出
  5. 执行代码耗时较长,测试时,建议十万数据量
  6. 如需转载,请注明来源
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值