1. 生成用于测试排序的CSV文件
常量类 Const.java
package zhangchao.externalsort;
/**
 * File locations and data-set size shared by the data generator and the sorting programs.
 *
 * <p>This class only holds constants, so it is final and cannot be instantiated.
 */
public final class Const {
    /** Path of the unsorted source CSV file. */
    public static final String ORIGIN_FILE = "D:\\testTemp\\origin_file.csv";

    /** Number of records to generate. */
    public static final int MAX_ITEMS = 100 * 10000;

    /** Path of the output file. */
    public static final String OUT_FILE = "D:\\testTemp\\out_file.csv";

    /** Path of the temporary intermediate file. */
    public static final String TEMP_MIDDLE_FILE = "D:\\testTemp\\temp_middle_file.txt";

    private Const() {
        // constants holder; no instances
    }
}
创建没有排序的CSV文件。CreateData.java
package zhangchao.preparedata;
import java.io.*;
import java.util.Random;
import java.util.UUID;
import zhangchao.externalsort.Const;
/**
 * Creates an unsorted CSV data file.
 *
 * @author zhangchao
 */
public class CreateData {
    /** Characters that generated names are drawn from: a-z, A-Z, 0-9 and a few Chinese characters. */
    private static final String[] NAME_CHARS = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
        "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "个", "好", "二", "黑", "科", "技", "地", "就", "里", "吗", "看", "图", "遇",
        "啊", "吧", "版", "不", "别", "把", "被", "帮", "办", "过", "及", "奶", "胡"
    };

    /** One generator shared by all rows instead of a new instance per call. */
    private static final Random RANDOM = new Random();

    /**
     * Generates a random string of 100 to 199 characters taken from {@link #NAME_CHARS}.
     *
     * @return the generated string
     */
    private static String genName() {
        int size = 100 + RANDOM.nextInt(100);
        StringBuilder sb = new StringBuilder(size);
        for (int i = 0; i < size; i++) {
            sb.append(NAME_CHARS[RANDOM.nextInt(NAME_CHARS.length)]);
        }
        return sb.toString();
    }

    /**
     * Writes a header line followed by {@code Const.MAX_ITEMS} random records
     * ({@code id,name,price}) to {@code Const.ORIGIN_FILE}, encoded as UTF-8.
     * An existing file is overwritten.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        File file = new File(Const.ORIGIN_FILE);
        // FileOutputStream creates the file or truncates an existing one, so no explicit
        // delete/createNewFile is required; try-with-resources flushes and closes the writer.
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file), "UTF-8"))) {
            // CSV header
            bw.write("id,name,price\r\n");
            for (int i = 0; i < Const.MAX_ITEMS; i++) {
                String id = UUID.randomUUID().toString().replace("-", "");
                String name = genName();
                double price = RANDOM.nextDouble() * 100.0 + 0.01;
                StringBuilder sb = new StringBuilder();
                sb.append(id).append(",").append(name).append(",").append(price)
                        .append("\r\n");
                bw.write(sb.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2. 做预备工作,编写基础类
为了方便读取CSV文件的每条记录,我们需要编写DTO,用来接收记录的各个属性。
ItemDto.java
package zhangchao.externalsort;
import java.math.BigDecimal;
/**
 * Data transfer object for one record of the CSV file.
 *
 * @author zhangchao
 */
public class ItemDto {
    /** Primary key. */
    private String id;
    /** Display name. */
    private String name;
    /** Price. */
    private BigDecimal price;
    /** Position of the record in the file. */
    private Long filePosition;

    public String getId() {
        return this.id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public BigDecimal getPrice() {
        return this.price;
    }

    public void setPrice(BigDecimal price) {
        this.price = price;
    }

    public Long getFilePosition() {
        return this.filePosition;
    }

    public void setFilePosition(Long filePosition) {
        this.filePosition = filePosition;
    }

    /**
     * Renders all four fields in the form
     * {@code ItemDto [id=..., name=..., price=..., filePosition=...]}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ItemDto [id=");
        text.append(this.id);
        text.append(", name=").append(this.name);
        text.append(", price=").append(this.price);
        text.append(", filePosition=").append(this.filePosition);
        text.append("]");
        return text.toString();
    }
}
我们还需要编写一个类来传递生成临时中间文件的结果,类名是 TempMiddleFileResult 。
package zhangchao.externalsort;
/**
 * Outcome of creating the temporary intermediate file.
 *
 * @author zhangchao
 */
public class TempMiddleFileResult {
    /** Whether the file was created successfully; {@code false} until set. */
    private boolean succeeded;
    /** Total number of lines; {@code 0} until set. */
    private int lineCount;

    public boolean getFlag() {
        return this.succeeded;
    }

    public void setFlag(boolean flag) {
        this.succeeded = flag;
    }

    public int getLines() {
        return this.lineCount;
    }

    public void setLines(int lines) {
        this.lineCount = lines;
    }
}
检查输出文件是否正确排序的工具类 CheckResult :
package zhangchao.externalsort;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
/**
 * Checks that the records in {@code Const.OUT_FILE} are in non-decreasing price order.
 */
public class CheckResult {
    /**
     * Prints {@code true} when every record's price is greater than or equal to the price of
     * the record before it. Prints {@code false} when the file is missing, when it contains
     * no line after an optional header, or when two adjacent records are out of order (the
     * offending pair is printed first). Blank lines and lines starting with {@code "id,"}
     * are skipped.
     *
     * @param args not used
     */
    public static void main(String args[]){
        File file = new File(Const.OUT_FILE);
        if (!file.exists()) {
            System.out.print(false);
            return;
        }
        // try-with-resources closes the reader (and the underlying stream) on every exit path.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String str = br.readLine();
            // A completely empty file yields null here; guard before calling startsWith.
            if (null != str && str.startsWith("id,")) {
                str = br.readLine();
            }
            if (null == str) {
                System.out.print(false);
                return;
            }
            ItemDto pre = null;
            ItemDto current = null;
            while (null != str) {
                str = str.trim();
                if (str.length() > 0 && !str.startsWith("id,")) {
                    pre = current;
                    String arr[] = str.split(",");
                    current = new ItemDto();
                    try {
                        current.setId(arr[0]);
                        current.setName(arr[1]);
                        current.setPrice(new BigDecimal(arr[2]));
                    } catch (RuntimeException e) {
                        // Show the malformed line (too few columns or an unparsable price)
                        // before letting the failure propagate.
                        System.out.println(str);
                        throw e;
                    }
                    if (null != pre) {
                        BigDecimal prePrice = pre.getPrice();
                        BigDecimal currPrice = current.getPrice();
                        if (currPrice.compareTo(prePrice) < 0) {
                            System.out.println("pre=" + pre);
                            System.out.println("current=" + current);
                            System.out.println("currPrice.compareTo(prePrice) < 0");
                            System.out.println(false);
                            return;
                        }
                    }
                }
                str = br.readLine();
            }
            System.out.print(true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
3. 编写内部排序的类,用于做对比。
为了与外部排序做对比,我们编写了使用内部排序的 Test1 类。Test1 类把原文件的内容全部读取到内存中再进行排序,当文件过大时就会因为内存不足而报错。
Test1.java
package zhangchao.externalsort;
import java.util.List;
import java.util.ArrayList;
import java.io.*;
import java.math.BigDecimal;
import zhangchao.preparedata.CreateData;
/**
 * In-memory baseline: loads every record of {@code Const.ORIGIN_FILE}, sorts the records by
 * price and writes them to {@code Const.OUT_FILE}.
 */
public class Test1 {
    /**
     * Reads the whole source file into a list, sorts it by ascending price and writes the
     * result (with a header line) as UTF-8. Prints {@code "No file"} and returns when the
     * source file does not exist. If reading fails, nothing is written.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        File file = new File(Const.ORIGIN_FILE);
        if (!file.exists()) {
            System.out.println("No file");
            return;
        }

        List<ItemDto> itemDtoList = new ArrayList<ItemDto>();
        // Reader and writer live in separate try-with-resources blocks, so a failure while
        // closing one can no longer prevent the other from being flushed and closed.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String str;
            while ((str = br.readLine()) != null) {
                str = str.trim();
                // Skip blank lines and the header line.
                if (str.isEmpty() || str.startsWith("id,")) {
                    continue;
                }
                String arr[] = str.split(",");
                ItemDto itemDto = new ItemDto();
                itemDto.setId(arr[0]);
                itemDto.setName(arr[1]);
                itemDto.setPrice(new BigDecimal(arr[2]));
                itemDtoList.add(itemDto);
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        itemDtoList.sort((o1, o2) -> o1.getPrice().compareTo(o2.getPrice()));

        File outFile = new File(Const.OUT_FILE);
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outFile), "UTF-8"))) {
            bw.write("id,name,price\r\n");
            for (ItemDto itemDto : itemDtoList) {
                StringBuilder outputSb = new StringBuilder();
                outputSb.append(itemDto.getId()).append(",")
                        .append(itemDto.getName()).append(",")
                        .append(itemDto.getPrice().toString()).append("\r\n");
                bw.write(outputSb.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
当原文件过大,Test1 类可能会报下面的错:
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOfRange(Unknown Source)
at java.lang.String.<init>(Unknown Source)
at java.lang.String.substring(Unknown Source)
at java.lang.String.split(Unknown Source)
at java.lang.String.split(Unknown Source)
at zhangchao.externalsort.Test1.main(Test1.java:31)
4. 外部排序
假定文件内容有一百万行,大体思路如下:
第一步:生成临时中间文件。文件内容由一百万个字符0或1组成,每个字符代表一条数据的删除状态:0表示未删除,1表示已删除。字符在临时文件中的位置(第几个字符)对应该条数据在原文件中的位置(第几条记录)。
第二步:共进行100轮处理,每轮完整扫描一次原文件,从尚未删除的记录中选出价格最小的一万条。扫描的时候,先用前一万条未删除的记录创建一个最大堆,之后的记录只要小于堆顶,就替换掉堆顶,然后调整堆。当读取文件结束后,把堆排序,输出到新的CSV文件中;同时更改临时中间文件中的删除标识,凡是输出到新的CSV文件中的记录都要改为已删除。
下面是代码,ExternalSort.java
package zhangchao.externalsort;
import java.io.*;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
/**
 * External sort of a CSV file on disk, ordered by ascending price.
 *
 * <p>A temporary flag file holds one byte per data record ({@code '0'} = not yet written to
 * the output, {@code '1'} = already written). The source file is scanned repeatedly; each
 * scan keeps the cheapest records that are still pending in a bounded max-heap, appends them
 * in sorted order to the output file and marks them as written.
 *
 * @author zhangchao
 */
public class ExternalSort {
    /** Flag byte (ASCII '0'): the record has not been written to the output yet. */
    private static final byte FLAG_PENDING = 48;

    /** Flag byte (ASCII '1'): the record has already been written to the output. */
    private static final byte FLAG_WRITTEN = 49;

    /** Charset used for reading the source file and writing the output file. */
    private static final String CHARSET_NAME = "UTF-8";

    /** Header line of the CSV files. */
    private static final String CSV_HEADER = "id,name,price\r\n";

    /** Orders records by ascending price. */
    private static final Comparator<ItemDto> BY_PRICE =
            (o1, o2) -> o1.getPrice().compareTo(o2.getPrice());

    /**
     * Tells whether a trimmed line is a data record, i.e. neither blank nor the header.
     *
     * @param trimmedLine a line that has already been trimmed
     * @return {@code true} for a data record
     */
    private static boolean isDataLine(String trimmedLine) {
        return trimmedLine.length() > 0 && !trimmedLine.startsWith("id,");
    }

    /**
     * Creates the temporary flag file: one {@code '0'} byte per data record of the source.
     *
     * @param originFilePath path of the source file
     * @param tempMiddleFilePath path of the temporary flag file
     * @return result object; its flag is {@code true} only when the file was fully written,
     *         and its line count is the number of data records found
     */
    private static TempMiddleFileResult createTempMiddleFile(final String originFilePath, final String tempMiddleFilePath) {
        TempMiddleFileResult result = new TempMiddleFileResult();
        File tempMiddleFile = new File(tempMiddleFilePath);
        if (tempMiddleFile.exists()) {
            tempMiddleFile.delete();
        }
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(new FileInputStream(originFilePath), CHARSET_NAME));
             OutputStream flagOut = new BufferedOutputStream(new FileOutputStream(tempMiddleFile))) {
            int lines = 0;
            String str;
            while ((str = br.readLine()) != null) {
                if (isDataLine(str.trim())) {
                    flagOut.write(FLAG_PENDING);
                    lines++;
                }
            }
            // Flush before reporting success so a failed write cannot be reported as success.
            flagOut.flush();
            result.setLines(lines);
            result.setFlag(true);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Turns the first {@code listSize} elements of the list into a max-heap by inserting
     * them one at a time and sifting each new element up.
     *
     * @param list list to heapify in place
     * @param listSize number of leading elements that form the heap
     * @param comparator ordering; the greatest element ends up at index 0
     * @param <T> element type
     */
    private static <T> void createHeap(List<T> list, int listSize, Comparator<? super T> comparator) {
        // Element 0 alone is already a heap; add the others starting from index 1.
        for (int i = 1; i < listSize; i++) {
            int newIndex = i;
            while (newIndex > 0) {
                int parentIndex = (newIndex - 1) >> 1;
                T parent = list.get(parentIndex);
                T newNode = list.get(newIndex);
                if (comparator.compare(newNode, parent) <= 0) {
                    // Not greater than its parent: no need to move further up.
                    break;
                }
                list.set(parentIndex, newNode);
                list.set(newIndex, parent);
                newIndex = parentIndex;
            }
        }
    }

    /**
     * Restores the max-heap property after the top element has been replaced, by sifting
     * the new top down.
     *
     * @param heap the heap, with a new element at index 0
     * @param heapSize number of leading elements that form the heap
     * @param comparator ordering used by the heap
     * @param <T> element type
     */
    private static <T> void adjustHeapAfterSetTop(List<T> heap, int heapSize,
            Comparator<? super T> comparator) {
        if (heapSize <= 0) {
            return;
        }
        int currentIndex = 0;
        T current = heap.get(0);
        while (true) {
            int leftIndex = (currentIndex << 1) + 1;
            if (leftIndex >= heapSize) {
                // No children: done.
                break;
            }
            int rightIndex = leftIndex + 1;
            // Pick the greater child; the right child wins ties.
            int maxIndex = leftIndex;
            if (rightIndex < heapSize
                    && comparator.compare(heap.get(leftIndex), heap.get(rightIndex)) <= 0) {
                maxIndex = rightIndex;
            }
            T max = heap.get(maxIndex);
            if (comparator.compare(max, current) <= 0) {
                break;
            }
            heap.set(currentIndex, max);
            heap.set(maxIndex, current);
            currentIndex = maxIndex;
        }
    }

    /**
     * Scans the whole source file and returns up to {@code everyLines} pending records with
     * the lowest prices (unsorted).
     *
     * <p>Flags are consumed in record order, so the flag file is read sequentially through a
     * buffered stream instead of seeking once per record.
     *
     * @param originFilePath path of the source file
     * @param tempMiddleFilePath path of the temporary flag file
     * @param everyLines maximum number of records to keep
     * @return the selected records, each carrying its record index as file position
     * @throws IOException if reading fails or the flag file has fewer flags than records
     */
    private static List<ItemDto> collectSmallest(String originFilePath, String tempMiddleFilePath,
            int everyLines) throws IOException {
        List<ItemDto> smallest = new ArrayList<ItemDto>();
        boolean isHeap = false; // whether the list has been heapified yet
        try (BufferedReader brOrigin = new BufferedReader(
                     new InputStreamReader(new FileInputStream(originFilePath), CHARSET_NAME));
             InputStream flags = new BufferedInputStream(new FileInputStream(tempMiddleFilePath))) {
            long index = 0;
            String originStr;
            while ((originStr = brOrigin.readLine()) != null) {
                originStr = originStr.trim();
                if (!isDataLine(originStr)) {
                    continue;
                }
                int flag = flags.read();
                if (flag < 0) {
                    throw new EOFException("flag file ended before record " + index);
                }
                // Only records that have not been written yet take part in this pass.
                if (FLAG_PENDING == flag) {
                    String arr[] = originStr.split(",");
                    ItemDto itemDto = new ItemDto();
                    itemDto.setId(arr[0]);
                    itemDto.setName(arr[1]);
                    itemDto.setPrice(new BigDecimal(arr[2]));
                    itemDto.setFilePosition(index);
                    if (smallest.size() < everyLines) {
                        smallest.add(itemDto);
                    } else {
                        // The list is full: heapify it once, on first overflow.
                        if (!isHeap) {
                            createHeap(smallest, smallest.size(), BY_PRICE);
                            isHeap = true;
                        }
                        // A record cheaper than the heap top replaces it; the sift-down
                        // touches at most one node per heap level.
                        if (BY_PRICE.compare(itemDto, smallest.get(0)) < 0) {
                            smallest.set(0, itemDto);
                            adjustHeapAfterSetTop(smallest, smallest.size(), BY_PRICE);
                        }
                    }
                }
                index++;
            }
        }
        return smallest;
    }

    /**
     * Appends the given records to the output file and then marks them as written in the
     * flag file. The header line is written only when the output file is still empty.
     *
     * @param batch records to write, already sorted
     * @param tempMiddleFilePath path of the temporary flag file
     * @param outFile output file, opened in append mode
     * @throws IOException if writing the output or updating the flags fails
     */
    private static void writeBatch(List<ItemDto> batch, String tempMiddleFilePath, File outFile)
            throws IOException {
        boolean needHeader = outFile.length() == 0L;
        // The output is written with an explicit charset: FileWriter would use the platform
        // default and could corrupt non-ASCII names that were read as UTF-8.
        try (BufferedWriter bw = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(outFile, true), CHARSET_NAME));
             RandomAccessFile raf = new RandomAccessFile(tempMiddleFilePath, "rw")) {
            if (needHeader) {
                bw.write(CSV_HEADER);
            }
            for (ItemDto item : batch) {
                StringBuilder sb = new StringBuilder();
                sb.append(item.getId()).append(",").append(item.getName()).append(",")
                        .append(item.getPrice()).append("\r\n");
                bw.write(sb.toString());
            }
            // Push the rows out before flagging them, so a record is never marked as
            // written while its row is still sitting in the buffer.
            bw.flush();
            for (ItemDto item : batch) {
                raf.seek(item.getFilePosition());
                raf.write(FLAG_WRITTEN);
            }
        }
    }

    /**
     * Runs one pass: finds the cheapest pending records, appends them in sorted order to
     * the output file and marks them as written.
     *
     * @param originFilePath path of the source file
     * @param tempMiddleFilePath path of the temporary flag file
     * @param outFile output file
     * @param everyLines number of records handled per pass; must be positive
     */
    private static void sortFewLines(final String originFilePath, final String tempMiddleFilePath,
            File outFile, final int everyLines) {
        if (everyLines <= 0) {
            throw new IllegalArgumentException("everyLines must be positive: " + everyLines);
        }
        try {
            List<ItemDto> itemDtoList = collectSmallest(originFilePath, tempMiddleFilePath, everyLines);
            if (!itemDtoList.isEmpty()) {
                itemDtoList.sort(BY_PRICE);
                writeBatch(itemDtoList, tempMiddleFilePath, outFile);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Sorts the source file by price using repeated heap-selection passes.
     *
     * @param originFilePath path of the unsorted source file
     * @param tempMiddleFilePath path where the temporary flag file is created
     * @param outFilePath path of the sorted output file; an existing file is replaced
     */
    public static void sortByHeap(final String originFilePath, final String tempMiddleFilePath,
            final String outFilePath) {
        TempMiddleFileResult tempMiddleFileResult = createTempMiddleFile(originFilePath,
                tempMiddleFilePath);
        if (!tempMiddleFileResult.getFlag()) {
            System.out.println("创建中间文件失败!");
            return;
        }

        // Create a fresh output file. A stale file that cannot be removed counts as a
        // failure, because later passes append to whatever is there.
        File outFile = new File(outFilePath);
        boolean outFileFlag = false;
        if (!outFile.exists() || outFile.delete()) {
            try {
                outFile.createNewFile();
                outFileFlag = true;
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (!outFileFlag) {
            System.out.println("创建输出文件失败!");
            return;
        }

        // Number of records handled per pass.
        final int everyLines = 10000;
        int lines = tempMiddleFileResult.getLines();
        System.out.println("lines=" + lines);
        // Number of passes: lines / everyLines, rounded up.
        int times = lines / everyLines + (lines % everyLines == 0 ? 0 : 1);
        for (int i = 0; i < times; i++) {
            System.out.println("i in times = " + i);
            sortFewLines(originFilePath, tempMiddleFilePath, outFile, everyLines);
        }
    }

    /**
     * Sorts {@code Const.ORIGIN_FILE} into {@code Const.OUT_FILE} and prints the elapsed time
     * in minutes and seconds.
     *
     * @param args not used
     */
    public static void main(String args[]) {
        long t1 = System.currentTimeMillis();
        sortByHeap(Const.ORIGIN_FILE, Const.TEMP_MIDDLE_FILE, Const.OUT_FILE);
        long t2 = System.currentTimeMillis();
        long diss = t2 - t1;
        long min = diss / 1000L / 60L;
        long sec = (diss - (min * 60L * 1000L)) / 1000L;
        System.out.print(min + " 分钟 " + sec + " 秒");
    }
}
输出结果:
Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
lines=1000000
i in times = 0
i in times = 1
i in times = 2
i in times = 3
i in times = 4
i in times = 5
i in times = 6
i in times = 7
i in times = 8
i in times = 9
i in times = 10
i in times = 11
i in times = 12
i in times = 13
i in times = 14
i in times = 15
i in times = 16
i in times = 17
i in times = 18
i in times = 19
i in times = 20
i in times = 21
i in times = 22
i in times = 23
i in times = 24
i in times = 25
i in times = 26
i in times = 27
i in times = 28
i in times = 29
i in times = 30
i in times = 31
i in times = 32
i in times = 33
i in times = 34
i in times = 35
i in times = 36
i in times = 37
i in times = 38
i in times = 39
i in times = 40
i in times = 41
i in times = 42
i in times = 43
i in times = 44
i in times = 45
i in times = 46
i in times = 47
i in times = 48
i in times = 49
i in times = 50
i in times = 51
i in times = 52
i in times = 53
i in times = 54
i in times = 55
i in times = 56
i in times = 57
i in times = 58
i in times = 59
i in times = 60
i in times = 61
i in times = 62
i in times = 63
i in times = 64
i in times = 65
i in times = 66
i in times = 67
i in times = 68
i in times = 69
i in times = 70
i in times = 71
i in times = 72
i in times = 73
i in times = 74
i in times = 75
i in times = 76
i in times = 77
i in times = 78
i in times = 79
i in times = 80
i in times = 81
i in times = 82
i in times = 83
i in times = 84
i in times = 85
i in times = 86
i in times = 87
i in times = 88
i in times = 89
i in times = 90
i in times = 91
i in times = 92
i in times = 93
i in times = 94
i in times = 95
i in times = 96
i in times = 97
i in times = 98
i in times = 99
11 分钟 10 秒