排序 Java 大文件排序
话不多说,直接上代码
排序Java代码实现
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;
/**
 * External merge sort for a large text file holding one {@code long} per line.
 *
 * <p>The pipeline run by {@link #main(String[])} is: generate random data, split it into
 * chunk files of at most {@code MAX_ROW} lines, sort every chunk in memory, then k-way merge
 * the sorted chunks into a single output file.
 *
 * <p>This class keeps no shared mutable state of its own, but it works on fixed paths below
 * {@code BASE_FOLDER}, so concurrent runs against the same folder interfere with each other.
 *
 * @author Adrian
 * @date 2020/12/8 16:02
 */
public class SelfSort {
    /**
     * Number of rows written by {@link #randomData(String)}.
     */
    private static final long GENERATE_ROW = 1000000L;
    /**
     * Maximum number of lines stored in each chunk file.
     */
    private static final int MAX_ROW = 10000;
    /**
     * Working folder that holds the input, the chunk files and the sorted output.
     */
    private static final String BASE_FOLDER = "d:/test/";
    /**
     * Line terminator used for every file this class writes.
     */
    private static final String LINE_END = "\r\n";

    /**
     * Runs the full generate / split / sort / merge pipeline below {@code BASE_FOLDER}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        deleteFolderFiles(BASE_FOLDER);
        String bigTxt = BASE_FOLDER + "big.txt";
        String splitFolder = BASE_FOLDER + "/folder/";
        String outSortFile = BASE_FOLDER + "sort.txt";
        List<File> files;
        try {
            randomData(bigTxt);
            files = splitDataToSaveFile(bigTxt, splitFolder);
            sortSmallFiles(files);
        } catch (IOException e) {
            e.printStackTrace();
            // Without a complete set of sorted chunks a merge would produce wrong output,
            // so stop here; the chunk files are left in place for inspection.
            return;
        }
        List<SmallFileInfo> fileInfos = getSmallFileBufferedReader(files);
        try {
            mergeSort(outSortFile, fileInfos);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            for (SmallFileInfo info : fileInfos) {
                BufferedReader reader = info.getReader();
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        deleteFolderFiles(splitFolder);
    }

    /**
     * Creates the folder if it is missing and deletes its direct children.
     *
     * <p>Deletion is best effort: children that cannot be deleted (for example non-empty
     * sub-folders) are left untouched and no error is reported.
     *
     * @param folderPath path of the folder to empty
     */
    public static void deleteFolderFiles(String folderPath) {
        File folder = new File(folderPath);
        if (!folder.isDirectory()) {
            folder.mkdirs();
        }
        File[] children = folder.listFiles();
        if (children == null) {
            return;
        }
        for (File child : children) {
            child.delete();
        }
    }

    /**
     * Appends {@code GENERATE_ROW} random non-negative {@code long} values to a file,
     * one per line. Missing parent folders are created first.
     *
     * @param randFilePath path of the file to write; existing content is kept
     * @throws IOException if the path is a directory, is not writable, its parent folder
     *                     cannot be created, or writing fails
     */
    public static void randomData(String randFilePath) throws IOException {
        System.out.println("=========开始生成数据==========");
        long startTime = System.currentTimeMillis();
        File file = new File(randFilePath);
        // Validate before opening: opening the stream first would fail on a missing parent
        // folder before it could be created.
        ensureWritable(file);
        Random random = new Random();
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file, true), StandardCharsets.UTF_8))) {
            for (long i = 0; i < GENERATE_ROW; i++) {
                // Clearing the sign bit is non-negative for every input; negating
                // Long.MIN_VALUE would overflow and stay negative.
                long value = random.nextLong() & Long.MAX_VALUE;
                writer.write(Long.toString(value));
                writer.write(LINE_END);
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("生成结束,耗时:" + (endTime - startTime) / 1000 + "秒");
    }

    /**
     * Splits a text file into chunk files of at most {@code MAX_ROW} lines each.
     *
     * <p>Chunks are named {@code <n><sourceFileName>} with {@code n} starting at 1, and every
     * line is trimmed before it is written.
     *
     * @param sourceFilePath      file to split
     * @param targetDirectoryPath folder receiving the chunk files; created when missing
     * @return the chunk files in creation order; empty when the source has no lines
     * @throws IOException if the source cannot be read, the target is not a usable folder,
     *                     or a chunk cannot be written
     */
    public static List<File> splitDataToSaveFile(String sourceFilePath, String targetDirectoryPath) throws IOException {
        long startTime = System.currentTimeMillis();
        System.out.println("=========开始分割文件==========");
        List<File> fileList = new ArrayList<>();
        File sourceFile = new File(sourceFilePath);
        File targetDirectory = new File(targetDirectoryPath);
        if (sourceFile.isDirectory()) {
            throw new IOException("File '" + sourceFile + "' exists but is a directory");
        }
        if (sourceFile.exists() && !sourceFile.canRead()) {
            throw new IOException("File '" + sourceFile + "' cannot be read");
        }
        if (targetDirectory.exists()) {
            if (!targetDirectory.isDirectory()) {
                throw new IOException("File '" + targetDirectory + "' exists but is not a directory");
            }
        } else if (!targetDirectory.mkdirs() && !targetDirectory.isDirectory()) {
            throw new IOException("Directory '" + targetDirectory + "' could not be created");
        }
        try (BufferedReader reader = newReader(sourceFile)) {
            StringBuilder chunk = new StringBuilder();
            int linesInChunk = 0;
            int fileNum = 1;
            String line;
            while ((line = reader.readLine()) != null) {
                chunk.append(line.trim()).append(LINE_END);
                linesInChunk++;
                if (linesInChunk == MAX_ROW) {
                    fileList.add(writeChunk(chunk, targetDirectory, fileNum, sourceFile.getName()));
                    fileNum++;
                    linesInChunk = 0;
                }
            }
            // Flush the final, partially filled chunk.
            if (linesInChunk > 0) {
                fileList.add(writeChunk(chunk, targetDirectory, fileNum, sourceFile.getName()));
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("分割文件结束,耗时:" + (endTime - startTime) / 1000 + "秒");
        return fileList;
    }

    /**
     * Sorts the content of every given file in ascending order, in place.
     *
     * <p>Each file is read fully into memory, so files are expected to be chunk-sized.
     * Blank lines are dropped. A file is only rewritten after it has been read completely.
     *
     * @param files files containing one {@code long} per line
     * @throws IOException           if a file cannot be read or rewritten
     * @throws NumberFormatException if a non-blank line is not a valid {@code long}
     */
    public static void sortSmallFiles(List<File> files) throws IOException {
        long startTime = System.currentTimeMillis();
        System.out.println("=========开始对所有小文件内部排序==========");
        for (File file : files) {
            List<Long> values = new ArrayList<>();
            try (BufferedReader reader = newReader(file)) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String trimmed = line.trim();
                    if (!trimmed.isEmpty()) {
                        values.add(Long.parseLong(trimmed));
                    }
                }
            }
            Collections.sort(values);
            String sorted = values.stream().map(String::valueOf).collect(Collectors.joining(LINE_END));
            writeFile(sorted, file, false);
        }
        long endTime = System.currentTimeMillis();
        System.out.println("所有小文件内部排序结束,耗时:" + (endTime - startTime) / 1000 + "秒");
    }

    /**
     * Merges already sorted chunk files into one ascending output file.
     *
     * <p>A min-heap holds the current head value of every chunk. The smallest head is written,
     * then replaced by the next value from the same chunk, until all chunks are exhausted.
     * The output is appended to, so existing content of the output file is kept.
     *
     * @param outSortFile path of the output file; parent folders are created when missing
     * @param fileInfos   cursors over the sorted chunk files, each with an open reader
     * @throws IOException           if the output file cannot be created or written
     * @throws NumberFormatException if a chunk contains a non-blank line that is not a valid {@code long}
     */
    public static void mergeSort(String outSortFile, List<SmallFileInfo> fileInfos) throws IOException {
        long startTime = System.currentTimeMillis();
        System.out.println("=========开始对所有小文件合并并排序==========");
        File outFile = new File(outSortFile);
        ensureWritable(outFile);
        Comparator<Map.Entry<SmallFileInfo, Long>> byValue = Map.Entry.comparingByValue();
        PriorityQueue<Map.Entry<SmallFileInfo, Long>> heads =
                new PriorityQueue<>(Math.max(1, fileInfos.size()), byValue);
        for (SmallFileInfo info : fileInfos) {
            offerNextValue(heads, info);
        }
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outFile, true), StandardCharsets.UTF_8))) {
            Map.Entry<SmallFileInfo, Long> smallest;
            while ((smallest = heads.poll()) != null) {
                writer.write(smallest.getValue().toString());
                writer.write(LINE_END);
                SmallFileInfo source = smallest.getKey();
                // Mark the value as consumed so the cursor advances on the next read.
                source.setValueIsUse(true);
                offerNextValue(heads, source);
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("所有小文件合并排序结束,耗时:" + (endTime - startTime) / 1000 + "秒");
    }

    /**
     * Opens a reader for every existing file and wraps it in a fresh {@link SmallFileInfo}.
     *
     * <p>The caller owns the returned readers and must close them.
     *
     * @param files chunk files to open; {@code null} is treated as empty
     * @return one cursor per file that exists and could be opened
     */
    public static List<SmallFileInfo> getSmallFileBufferedReader(List<File> files) {
        List<SmallFileInfo> fileInfos = new ArrayList<>();
        if (files == null) {
            return fileInfos;
        }
        for (File file : files) {
            if (!file.exists()) {
                continue;
            }
            BufferedReader reader;
            try {
                reader = newReader(file);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
                continue;
            }
            SmallFileInfo info = new SmallFileInfo();
            info.setLineNum(0L);
            info.setValue(null);
            // "Used" means the next readNextValue() call fetches a new line.
            info.setValueIsUse(true);
            info.setReader(reader);
            info.setFileAbsolutePath(file.getAbsolutePath());
            fileInfos.add(info);
        }
        return fileInfos;
    }

    /**
     * Reads the next non-blank value from a cursor and adds it to the heap;
     * does nothing when the cursor has no more values.
     */
    private static void offerNextValue(PriorityQueue<Map.Entry<SmallFileInfo, Long>> heads, SmallFileInfo info) {
        String line = info.readNextValue();
        while (line != null && line.trim().isEmpty()) {
            // Skip blank lines instead of letting them stall the cursor.
            info.setValueIsUse(true);
            line = info.readNextValue();
        }
        if (line != null) {
            heads.add(new AbstractMap.SimpleImmutableEntry<>(info, Long.valueOf(line.trim())));
        }
    }

    /**
     * Verifies that a file can be opened for writing, creating parent folders when needed.
     */
    private static void ensureWritable(File file) throws IOException {
        if (file.exists()) {
            if (file.isDirectory()) {
                throw new IOException("File '" + file + "' exists but is a directory");
            }
            if (!file.canWrite()) {
                throw new IOException("File '" + file + "' cannot be written to");
            }
        } else {
            File parent = file.getParentFile();
            if (parent != null && !parent.exists() && !parent.mkdirs()) {
                throw new IOException("File '" + file + "' could not be created");
            }
        }
    }

    /**
     * Writes the buffered chunk to its numbered file, clears the buffer and returns the file.
     */
    private static File writeChunk(StringBuilder chunk, File targetDirectory, int fileNum, String sourceName) throws IOException {
        File chunkFile = new File(targetDirectory, fileNum + sourceName);
        writeFile(chunk.toString(), chunkFile, false);
        chunk.setLength(0);
        return chunkFile;
    }

    /**
     * Opens a buffered UTF-8 reader on a file.
     */
    private static BufferedReader newReader(File file) throws FileNotFoundException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
    }

    /**
     * Writes text to a file as UTF-8.
     *
     * @param text     content to write
     * @param file     target file
     * @param isAppend {@code true} to append, {@code false} to overwrite
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeFile(String text, File file, boolean isAppend) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file, isAppend), StandardCharsets.UTF_8))) {
            writer.write(text);
        }
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Objects;
/**
* 文件信息
*
* @author Adrian
* @date 2020/12/9 10:12
*/
public class SmallFileInfo {
/**
* 改行value是否使用
*/
private Boolean valueIsUse;
/**
* 当前行号
*/
private Long lineNum;
/**
* 当前一行值
*/
private String value;
/**
* 文件绝对路径
*/
private String fileAbsolutePath;
/**
* BufferedReader引用
*/
private BufferedReader reader;
public String readNextValue(){
//判断上一轮读取的值是否使用
if (this.valueIsUse) {
try {
String line = null;
if ((line = this.reader.readLine()) != null) {
this.value = line;
// System.out.println("当前读取行数为:" + this.lineNum + ",value为:" + this.value);
this.lineNum++;
this.valueIsUse = false;
return this.value;
}
//读完了
else {
return null;
}
} catch (IOException e) {
e.printStackTrace();
return null;
}
}
//否则返回上一轮读取的值
else {
return this.value;
}
}
public Boolean getValueIsUse() {
return valueIsUse;
}
public void setValueIsUse(Boolean valueIsUse) {
this.valueIsUse = valueIsUse;
}
public Long getLineNum() {
return lineNum;
}
public void setLineNum(Long lineNum) {
this.lineNum = lineNum;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
public String getFileAbsolutePath() {
return fileAbsolutePath;
}
public void setFileAbsolutePath(String fileAbsolutePath) {
this.fileAbsolutePath = fileAbsolutePath;
}
public BufferedReader getReader() {
return reader;
}
public void setReader(BufferedReader reader) {
this.reader = reader;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null || this.getClass() != obj.getClass()) {
return false;
}
SmallFileInfo fileInfo = (SmallFileInfo) obj;
return this.fileAbsolutePath.equals(fileInfo.getFileAbsolutePath());
}
@Override
public int hashCode() {
return Objects.hash(this.fileAbsolutePath);
}
}
再来说下排序原理
如图:(图片来源于网络,如果侵权,请联系我删除)
其他说明
- 该方法是线程不安全的
- 当前示例按 Long 类型排序;若数据是有规律的复杂字符串,自行修改解析和比较方式即可
- 代码为按照解决思路自行编写,自行测试OK,若有BUG请联系我
- 代码还可以继续精简,若有更好的实现方式,请提出
- 执行代码耗时较长,测试时,建议十万数据量
- 如需转载,请注明来源