【181】Java8实现单线程外部排序

最新推荐文章于 2023-02-19 20:37:42 发布

zhangchao19890805

最新推荐文章于 2023-02-19 20:37:42 发布

阅读量299

点赞数

分类专栏： JAVA 算法文章标签： java 算法开发语言

本文链接：https://blog.csdn.net/zhangchao19890805/article/details/128489433

版权

JAVA 同时被 2 个专栏收录

58 篇文章 1 订阅

订阅专栏

算法

24 篇文章 0 订阅

订阅专栏

1. 生成用于测试排序的CSV文件

常量类 Const.java

package zhangchao.externalsort;

/**
 * Shared configuration constants for the sorting demo programs.
 * This class only holds constants and cannot be instantiated or extended.
 */
public final class Const {
	/** Path of the original, unsorted data file. */
	public static final String ORIGIN_FILE = "D:\\testTemp\\origin_file.csv";

	/** How many data rows to generate. */
	public static final int MAX_ITEMS = 100 * 10000;

	/** Path of the output file. */
	public static final String OUT_FILE = "D:\\testTemp\\out_file.csv";

	/** Path of the temporary intermediate file. */
	public static final String TEMP_MIDDLE_FILE = "D:\\testTemp\\temp_middle_file.txt";

	private Const() {
		// Constants holder: there is nothing to instantiate.
	}
}

创建没有排序的CSV文件。CreateData.java

package zhangchao.preparedata;

import java.io.*;
import java.util.Random;
import java.util.UUID;

import zhangchao.externalsort.Const;

/**
 * Creates an unsorted CSV data file with the columns {@code id,name,price}.
 * @author zhangchao
 *
 */
public class CreateData {

	/** Characters a generated name is assembled from: a-z, A-Z, 0-9 and some CJK characters. */
	private static final String[] NAME_CHARS = {
		"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
		"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
		"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
		"N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
		"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
		"个", "好", "二", "黑", "科", "技", "地", "就", "里", "吗", "看", "图", "遇",
		"啊", "吧", "版", "不", "别", "把", "被", "帮", "办", "过", "及", "奶", "胡"
	};

	/** One generator for the whole run instead of a new instance per name and per row. */
	private static final Random RANDOM = new Random();

	/**
	 * Builds a random string of 100 to 199 characters drawn from {@link #NAME_CHARS}.
	 * @return the generated string
	 */
	private static String genName() {
		int size = 100 + RANDOM.nextInt(100);
		StringBuilder sb = new StringBuilder(size);
		for (int i = 0; i < size; i++) {
			sb.append(NAME_CHARS[RANDOM.nextInt(NAME_CHARS.length)]);
		}
		return sb.toString();
	}

	/**
	 * Writes the header line followed by {@code Const.MAX_ITEMS} random rows to
	 * {@code Const.ORIGIN_FILE}, encoded as UTF-8. I/O errors are printed to stderr.
	 * @param args unused
	 */
	public static void main(String[] args) {
		File file = new File(Const.ORIGIN_FILE);
		// FileOutputStream creates the file when it is missing and truncates it when it
		// exists, so no separate delete/create step is needed. try-with-resources closes
		// (and thereby flushes) the writer even when a write fails.
		try (FileOutputStream fos = new FileOutputStream(file);
				BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"))) {
			// CSV header line
			bw.write("id,name,price\r\n");

			for (int i = 0; i < Const.MAX_ITEMS; i++) {
				String id = UUID.randomUUID().toString().replace("-", "");
				String name = genName();
				double price = RANDOM.nextDouble() * 100.0 + 0.01;
				bw.write(id);
				bw.write(",");
				bw.write(name);
				bw.write(",");
				bw.write(String.valueOf(price));
				bw.write("\r\n");
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}

2. 做预备工作，编写基础类

为了方便读取CSV文件的每条记录，我们需要编写DTO，用来接受记录的各个属性。

ItemDto.java

package zhangchao.externalsort;

import java.math.BigDecimal;

/**
 * Data transfer object holding one record of the CSV file.
 * @author zhangchao
 *
 */
public class ItemDto {
	/** Primary key. */
	private String id;
	/** Name. */
	private String name;
	/** Price. */
	private BigDecimal price;
	/** Position of the record in the file. */
	private Long filePosition;

	// ---- id ----

	public String getId() {
		return this.id;
	}

	public void setId(String id) {
		this.id = id;
	}

	// ---- name ----

	public String getName() {
		return this.name;
	}

	public void setName(String name) {
		this.name = name;
	}

	// ---- price ----

	public BigDecimal getPrice() {
		return this.price;
	}

	public void setPrice(BigDecimal price) {
		this.price = price;
	}

	// ---- filePosition ----

	public Long getFilePosition() {
		return this.filePosition;
	}

	public void setFilePosition(Long filePosition) {
		this.filePosition = filePosition;
	}

	/**
	 * Renders all four fields in the form
	 * {@code ItemDto [id=..., name=..., price=..., filePosition=...]}.
	 */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("ItemDto [id=");
		text.append(this.id);
		text.append(", name=").append(this.name);
		text.append(", price=").append(this.price);
		text.append(", filePosition=").append(this.filePosition);
		text.append("]");
		return text.toString();
	}
}

我们还需要编写一个类来传递生成临时中间文件的结果，类名是 TempMiddleFileResult 。

package zhangchao.externalsort;

/**
 * Outcome of creating the temporary intermediate file.
 * @author zhangchao
 *
 */
public class TempMiddleFileResult {
	/** Whether creation succeeded; {@code false} until set. */
	private boolean success;
	/** Total number of lines of the file; {@code 0} until set. */
	private int lineCount;

	public boolean getFlag() {
		return this.success;
	}

	public int getLines() {
		return this.lineCount;
	}

	public void setFlag(boolean flag) {
		this.success = flag;
	}

	public void setLines(int lines) {
		this.lineCount = lines;
	}
}

检查输出文件是否正确排序的工具类 CheckResult ：

package zhangchao.externalsort;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;

/**
 * Checks that the rows of {@code Const.OUT_FILE} are ordered by ascending price.
 */
public class CheckResult {
	/**
	 * Prints {@code true} when every data row's price is greater than or equal to the
	 * price of the row before it. Prints {@code false} when the file is missing, holds
	 * no line after the leading header, or a row is out of order (both rows are printed
	 * first). Blank lines and header lines are skipped wherever they occur.
	 * @param args unused
	 */
	public static void main(String args[]){

		File file = new File(Const.OUT_FILE);
		if (!file.exists()) {
			System.out.print(false);
			return;
		}
		try (FileInputStream fis = new FileInputStream(file);
				BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"))) {
			String str = br.readLine();
			// readLine() returns null for an empty file, so test for null before
			// looking at the header.
			if (null != str && str.startsWith("id,")){
				str = br.readLine();
			}
			if (null == str){
				System.out.print(false);
				return;
			}
			ItemDto pre = null;
			ItemDto current = null;
			while (null != str) {
				str = str.trim();
				if (str.length() > 0 && !str.startsWith("id,")) {
					pre = current;
					String arr[] = str.split(",");
					current = new ItemDto();
					try {
						current.setId(arr[0]);
						current.setName(arr[1]);
						current.setPrice(new BigDecimal(arr[2]));
					} catch (RuntimeException e) {
						// Missing column or unparsable price: show the offending
						// line before failing.
						System.out.println(str);
						throw e;
					}

					if (null != pre) {
						BigDecimal prePrice = pre.getPrice();
						BigDecimal currPrice = current.getPrice();
						if (currPrice.compareTo(prePrice) < 0) {
							System.out.println("pre=" + pre);
							System.out.println("current=" + current);
							System.out.println("currPrice.compareTo(prePrice) < 0");
							System.out.println(false);
							return;
						}
					}
				}
				str = br.readLine();
			}
			System.out.print(true);
		} catch (IOException e) {
			e.printStackTrace();
		}

	}
}

3. 编写内部排序的类，用于做对比。

为了和内部排序做对比，而编写的Test1 类。Test1 类全部读取原文件的内容到内存中，并且进行排序，当文件过大就会因为内存不足报错。

Test1.java

package zhangchao.externalsort;

import java.util.List;
import java.util.ArrayList;
import java.io.*;
import java.math.BigDecimal;

import zhangchao.preparedata.CreateData;


/**
 * In-memory baseline: loads every record of {@code Const.ORIGIN_FILE} into a list,
 * sorts the list by price and writes it to {@code Const.OUT_FILE}. Because all
 * records are held in memory at once, a large enough input exhausts the heap.
 */
public class Test1 {

	/**
	 * Reads all data rows of a UTF-8 CSV file. Blank lines and header lines
	 * (starting with {@code "id,"}) are skipped.
	 * @param file the CSV file to read
	 * @return the parsed records in file order
	 * @throws IOException if the file cannot be read
	 */
	private static List<ItemDto> readItems(File file) throws IOException {
		List<ItemDto> itemDtoList = new ArrayList<ItemDto>();
		try (FileInputStream fis = new FileInputStream(file);
				BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"))) {
			String str = br.readLine();
			while (null != str) {
				// Trim and skip empty lines, the same way ExternalSort and CheckResult do.
				str = str.trim();
				if (str.length() > 0 && !str.startsWith("id,")) {
					String arr[] = str.split(",");
					ItemDto itemDto = new ItemDto();
					itemDto.setId(arr[0]);
					itemDto.setName(arr[1]);
					itemDto.setPrice(new BigDecimal(arr[2]));
					itemDtoList.add(itemDto);
				}
				str = br.readLine();
			}
		}
		return itemDtoList;
	}

	/**
	 * Writes the header line and then one CSV row per record, encoded as UTF-8.
	 * @param itemDtoList records to write, in output order
	 * @param outFile destination file; created or overwritten
	 * @throws IOException if the file cannot be written
	 */
	private static void writeItems(List<ItemDto> itemDtoList, File outFile) throws IOException {
		try (FileOutputStream fos = new FileOutputStream(outFile);
				BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"))) {
			bw.write("id,name,price\r\n");
			for (ItemDto itemDto : itemDtoList) {
				bw.write(itemDto.getId());
				bw.write(",");
				bw.write(itemDto.getName());
				bw.write(",");
				bw.write(itemDto.getPrice().toString());
				bw.write("\r\n");
			}
		}
	}

	/**
	 * Runs the in-memory sort. Prints {@code No file} when the input is missing;
	 * I/O errors are printed to stderr.
	 * @param args unused
	 */
	public static void main(String[] args) {
		File file = new File(Const.ORIGIN_FILE);
		if (!file.exists()) {
			System.out.println("No file");
			return;
		}
		try {
			List<ItemDto> itemDtoList = readItems(file);
			itemDtoList.sort((o1, o2) -> o1.getPrice().compareTo(o2.getPrice()));
			writeItems(itemDtoList, new File(Const.OUT_FILE));
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}

当原文件过大，Test1 类可能会报下面的错：

Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOfRange(Unknown Source)
	at java.lang.String.<init>(Unknown Source)
	at java.lang.String.substring(Unknown Source)
	at java.lang.String.split(Unknown Source)
	at java.lang.String.split(Unknown Source)
	at zhangchao.externalsort.Test1.main(Test1.java:31)

4. 外部排序

假定文件内容有一百万行，大体思路如下：

第一步：生成临时中间文件。文件内容由一百万个字符“0”或“1”组成，每个字符表示对应记录的删除状态：0表示未删除，1表示已删除。字符在临时文件中的位置与该记录在原文件中的序号（不含标题行和空行）一一对应。

第二步：分100趟处理，每趟都完整扫描一遍原文件，从尚未删除的记录中选出价格最小的一万条。每趟扫描时，先用最先读到的一万条未删除记录创建一个最大堆；之后读到的记录只要小于堆顶，就替换掉堆顶，然后调整堆。扫描结束后，把堆中的记录排序，追加输出到新的CSV文件中；同时更改临时中间文件中的删除标识，凡是已输出到新CSV文件中的记录都要改为已删除。

下面是代码，ExternalSort.java

package zhangchao.externalsort;

import java.io.*;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
 * Sorts a CSV file on disk by ascending price without loading it into memory.
 * <p>
 * A flag file keeps one byte per data row ('0' = not yet written, '1' = written).
 * Each pass scans the whole source file, keeps the {@code everyLines} cheapest
 * unwritten rows in a max-heap, appends them in sorted order to the output file
 * and marks them as written. Passes repeat until every row has been written.
 * @author zhangchao
 *
 */
public class ExternalSort {

	/** Flag byte (ASCII '0'): the row has not been written to the output yet. */
	private static final byte FLAG_PENDING = 48;

	/** Flag byte (ASCII '1'): the row has already been written to the output. */
	private static final byte FLAG_DONE = 49;

	/** Header line of the CSV files. */
	private static final String CSV_HEADER = "id,name,price\r\n";

	/** Orders records by ascending price. */
	private static final Comparator<ItemDto> BY_PRICE =
			(o1, o2) -> o1.getPrice().compareTo(o2.getPrice());

	/**
	 * Tells whether an already trimmed line is a data row, i.e. neither empty
	 * nor a header line.
	 * @param trimmedLine a line with surrounding whitespace removed
	 * @return {@code true} for data rows
	 */
	private static boolean isDataLine(String trimmedLine) {
		return trimmedLine.length() > 0 && !trimmedLine.startsWith("id,");
	}

	/**
	 * Creates the temporary flag file: one {@link #FLAG_PENDING} byte per data row
	 * of the source file.
	 * @param originFilePath path of the source file
	 * @param tempMiddleFilePath path of the flag file; created or overwritten
	 * @return result whose flag is {@code true} on success and whose line count is
	 *         the number of data rows found
	 */
	private static TempMiddleFileResult createTempMiddleFile(final String originFilePath, final String tempMiddleFilePath) {
		File originFile = new File(originFilePath);
		File tempMiddleFile = new File(tempMiddleFilePath);
		TempMiddleFileResult result = new TempMiddleFileResult();
		// FileOutputStream truncates an existing flag file. The buffered stream avoids
		// one unbuffered write per row.
		try (FileInputStream fis = new FileInputStream(originFile);
				BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
				FileOutputStream fos = new FileOutputStream(tempMiddleFile);
				BufferedOutputStream bos = new BufferedOutputStream(fos)) {
			int lines = 0;
			String str = br.readLine();
			while (null != str) {
				str = str.trim();
				if (isDataLine(str)) {
					bos.write(FLAG_PENDING);
					lines++;
				}
				str = br.readLine();
			}
			// Flush before reporting success so a failed write is not reported as success.
			bos.flush();
			result.setFlag(true);
			result.setLines(lines);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Turns the first {@code listSize} elements of the list into a max-heap
	 * (largest element according to the comparator at index 0).
	 * @param list list to heapify in place
	 * @param listSize number of leading elements that belong to the heap
	 * @param comparator ordering used for the heap
	 * @param <T> element type of the list
	 */
	private static<T> void createHeap(List<T> list, int listSize, Comparator<T> comparator) {
		// Element 0 alone is already a heap; insert the others one by one.
		for (int i = 1; i < listSize; i++) {
			int newIndex = i;
			while (newIndex > 0) {
				int parentIndex = (newIndex - 1) >> 1;
				T parent = list.get(parentIndex);
				T newNode = list.get(newIndex);
				if (comparator.compare(newNode, parent) > 0) {
					list.set(parentIndex, newNode);
					list.set(newIndex, parent);
					newIndex = parentIndex;
				} else {
					// Not larger than its parent: the node is in place, stop climbing.
					newIndex = -1;
				}
			}
		}
	}

	/**
	 * Restores the max-heap property after the top element has been replaced, by
	 * sinking the new top until neither child is larger.
	 * @param heap list holding the heap
	 * @param heapSize number of leading elements that belong to the heap
	 * @param comparator ordering used for the heap
	 * @param <T> element type of the list
	 */
	private static<T> void adjustHeapAfterSetTop(List<T> heap, int heapSize,
			Comparator<T> comparator) {
		int currentIndex = 0;
		T current = heap.get(0);
		boolean whileFlag = true;
		while (whileFlag) {
			int leftIndex = (currentIndex << 1) + 1;
			int rightIndex = (currentIndex << 1) + 2;
			if (rightIndex < heapSize) { // Both children exist.
				T left = heap.get(leftIndex);
				T right = heap.get(rightIndex);
				int maxIndex = rightIndex;
				T max = right;
				if (comparator.compare(left, right) > 0) {
					maxIndex = leftIndex;
					max = left;
				}
				if (comparator.compare(max, current) > 0) {
					heap.set(currentIndex, max);
					heap.set(maxIndex, current);
					currentIndex = maxIndex;
				} else {
					whileFlag = false;
				}
			} else if (leftIndex < heapSize) { // Only the left child exists.
				T left = heap.get(leftIndex);
				if (comparator.compare(left, current) > 0) {
					heap.set(currentIndex, left);
					heap.set(leftIndex, current);
					currentIndex = leftIndex;
				} else {
					whileFlag = false;
				}
			} else { // No children: done.
				whileFlag = false;
			}
		}
	}

	/**
	 * Parses one trimmed CSV data row.
	 * @param line row in the form {@code id,name,price}
	 * @param index zero-based index of the row among the data rows, stored as the
	 *        record's file position
	 * @return the parsed record
	 */
	private static ItemDto parseItem(String line, long index) {
		String arr[] = line.split(",");
		ItemDto itemDto = new ItemDto();
		itemDto.setId(arr[0]);
		itemDto.setName(arr[1]);
		itemDto.setPrice(new BigDecimal(arr[2]));
		itemDto.setFilePosition(index);
		return itemDto;
	}

	/**
	 * Scans the whole source file and collects up to {@code everyLines} unwritten
	 * rows with the lowest prices.
	 * @param originFile source CSV file
	 * @param tempMiddleFile flag file
	 * @param everyLines maximum number of rows to collect
	 * @return the collected rows, in no particular order
	 * @throws IOException if a file cannot be read or the flag file is shorter than
	 *         the number of data rows
	 */
	private static List<ItemDto> selectSmallest(File originFile, File tempMiddleFile,
			final int everyLines) throws IOException {
		List<ItemDto> itemDtoList = new ArrayList<ItemDto>();
		boolean isHeap = false; // Whether itemDtoList has been heapified yet.
		// Flags are consumed in the same order as the data rows, so a buffered
		// sequential read replaces a seek plus a read per row.
		try (FileInputStream fisOrigin = new FileInputStream(originFile);
				BufferedReader brOrigin = new BufferedReader(new InputStreamReader(fisOrigin, "UTF-8"));
				FileInputStream fisFlags = new FileInputStream(tempMiddleFile);
				BufferedInputStream flags = new BufferedInputStream(fisFlags)) {
			long index = 0;
			String originStr = brOrigin.readLine();
			while (null != originStr) {
				originStr = originStr.trim();
				if (isDataLine(originStr)) {
					int isDelete = flags.read();
					if (isDelete < 0) {
						throw new EOFException("flag file has no entry for data row " + index);
					}
					if (FLAG_PENDING == isDelete) {
						ItemDto itemDto = parseItem(originStr, index);
						if (itemDtoList.size() < everyLines) {
							itemDtoList.add(itemDto);
						} else {
							// The list is full: build the heap once, on first need.
							if (!isHeap) {
								createHeap(itemDtoList, itemDtoList.size(), BY_PRICE);
								isHeap = true;
							}
							// A row cheaper than the heap top evicts the top.
							// Re-adjusting the heap walks one root-to-leaf path.
							if (BY_PRICE.compare(itemDto, itemDtoList.get(0)) < 0) {
								itemDtoList.set(0, itemDto);
								adjustHeapAfterSetTop(itemDtoList, itemDtoList.size(), BY_PRICE);
							}
						}
					}
					index++;
				}
				originStr = brOrigin.readLine();
			}
		}
		return itemDtoList;
	}

	/**
	 * Appends the given rows to the output file as UTF-8 and then marks them as
	 * written in the flag file. The header line is written only when the output
	 * file is still empty, so it appears once rather than once per batch.
	 * @param batch rows to write, already in output order
	 * @param tempMiddleFile flag file
	 * @param outFile output CSV file, opened in append mode
	 * @throws IOException if writing either file fails
	 */
	private static void writeBatch(List<ItemDto> batch, File tempMiddleFile, File outFile)
			throws IOException {
		final boolean needsHeader = outFile.length() == 0;
		// An explicit UTF-8 writer instead of FileWriter, which would use the platform
		// default charset although the input is read as UTF-8.
		try (RandomAccessFile raf = new RandomAccessFile(tempMiddleFile, "rw");
				FileOutputStream fos = new FileOutputStream(outFile, true);
				BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"))) {
			if (needsHeader) {
				bw.write(CSV_HEADER);
			}
			for (ItemDto item : batch) {
				bw.write(item.getId());
				bw.write(",");
				bw.write(item.getName());
				bw.write(",");
				bw.write(item.getPrice().toString());
				bw.write("\r\n");
			}
			// Flush once per batch, then flip the flags of the rows just written.
			bw.flush();
			for (ItemDto item : batch) {
				raf.seek(item.getFilePosition());
				raf.write(FLAG_DONE);
			}
		}
	}

	/**
	 * Runs one pass: finds the cheapest unwritten rows, appends them sorted to the
	 * output file and marks them as written.
	 * @param originFilePath path of the source file
	 * @param tempMiddleFilePath path of the flag file
	 * @param outFile output file
	 * @param everyLines number of rows handled per pass
	 * @return {@code true} if the pass completed, {@code false} after an I/O error
	 *         (which is printed to stderr)
	 */
	private static boolean sortFewLines(final String originFilePath, final String tempMiddleFilePath,
			File outFile, final int everyLines) {
		File originFile = new File(originFilePath);
		File tempMiddleFile = new File(tempMiddleFilePath);
		try {
			List<ItemDto> itemDtoList = selectSmallest(originFile, tempMiddleFile, everyLines);
			if (!itemDtoList.isEmpty()) {
				itemDtoList.sort(BY_PRICE);
				writeBatch(itemDtoList, tempMiddleFile, outFile);
			}
			return true;
		} catch (IOException e) {
			e.printStackTrace();
			return false;
		}
	}

	/**
	 * Sorts the source file by price using the heap-based external sort.
	 * @param originFilePath path of the unsorted source CSV file
	 * @param tempMiddleFilePath path of the temporary flag file; created or overwritten
	 * @param outFilePath path of the sorted output CSV file; created or overwritten
	 */
	public static void sortByHeap(final String originFilePath, final String tempMiddleFilePath,
			final String outFilePath) {
		TempMiddleFileResult tempMiddleFileResult = createTempMiddleFile(originFilePath,
				tempMiddleFilePath);
		if (!tempMiddleFileResult.getFlag()) {
			System.out.println("创建中间文件失败！");
			return;
		}
		// Create the output file, or empty it if it exists: every pass appends to it.
		File outFile = new File(outFilePath);
		try {
			new FileOutputStream(outFile).close();
		} catch (IOException e) {
			e.printStackTrace();
			System.out.println("创建输出文件失败！");
			return;
		}
		// Number of rows handled per pass.
		final int everyLines = 10000;
		int lines = tempMiddleFileResult.getLines();
		System.out.println("lines=" + lines);
		int times = 0;
		if (lines % everyLines == 0) {
			times = lines / everyLines;
		} else {
			times = lines / everyLines + 1;
		}

		for (int i = 0; i < times; i++) {
			System.out.println("i in times = " + i);
			if (!sortFewLines(originFilePath, tempMiddleFilePath, outFile, everyLines)) {
				// A failed pass would only repeat; stop instead of running the rest.
				System.out.println("排序失败，已终止！");
				return;
			}
		}
	}

	/**
	 * Sorts {@code Const.ORIGIN_FILE} into {@code Const.OUT_FILE} and prints the
	 * elapsed time in minutes and seconds.
	 * @param args unused
	 */
	public static void main(String args[]) {
		long t1 = System.currentTimeMillis();
		sortByHeap(Const.ORIGIN_FILE, Const.TEMP_MIDDLE_FILE, Const.OUT_FILE);
		long t2 = System.currentTimeMillis();
		long diss = t2 - t1;
		long min = diss / 1000L / 60L;
		long sec = (diss - (min * 60L * 1000L)) / 1000L;
		System.out.print(min + " 分钟 " + sec + " 秒");
	}
}

输出结果：


Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
lines=1000000
i in times = 0
i in times = 1
i in times = 2
i in times = 3
i in times = 4
i in times = 5
i in times = 6
i in times = 7
i in times = 8
i in times = 9
i in times = 10
i in times = 11
i in times = 12
i in times = 13
i in times = 14
i in times = 15
i in times = 16
i in times = 17
i in times = 18
i in times = 19
i in times = 20
i in times = 21
i in times = 22
i in times = 23
i in times = 24
i in times = 25
i in times = 26
i in times = 27
i in times = 28
i in times = 29
i in times = 30
i in times = 31
i in times = 32
i in times = 33
i in times = 34
i in times = 35
i in times = 36
i in times = 37
i in times = 38
i in times = 39
i in times = 40
i in times = 41
i in times = 42
i in times = 43
i in times = 44
i in times = 45
i in times = 46
i in times = 47
i in times = 48
i in times = 49
i in times = 50
i in times = 51
i in times = 52
i in times = 53
i in times = 54
i in times = 55
i in times = 56
i in times = 57
i in times = 58
i in times = 59
i in times = 60
i in times = 61
i in times = 62
i in times = 63
i in times = 64
i in times = 65
i in times = 66
i in times = 67
i in times = 68
i in times = 69
i in times = 70
i in times = 71
i in times = 72
i in times = 73
i in times = 74
i in times = 75
i in times = 76
i in times = 77
i in times = 78
i in times = 79
i in times = 80
i in times = 81
i in times = 82
i in times = 83
i in times = 84
i in times = 85
i in times = 86
i in times = 87
i in times = 88
i in times = 89
i in times = 90
i in times = 91
i in times = 92
i in times = 93
i in times = 94
i in times = 95
i in times = 96
i in times = 97
i in times = 98
i in times = 99
11 分钟 10 秒