归并排序算法介绍,请参照Wikipeida
zh.wikipedia.org/wiki/%E5%BD%92%E5%B9%B6%E6%8E%92%E5%BA%8F
基本思想:
大文件分割成行数相等的两个子文件,递归(归并排序)两个子文件,直到递归到分割成的子文件低于限制行数
低于限制行数的子文件直接排序
两个排序好的子文件归并到父文件
直到最后所有排序好的父文件归并到输入的大文件并返回
之前看了网上很多示例代码,写的很不简洁, 引入了过多的临时变量i, j, k等等, 导致程序基本没法看,
只好自己写了一个,没有很关心执行效率, 只求够用,以后有机会再优化一下吧。
Performance:
输入999999行 on
Intel(R) Core(TM)2 Duo CPU @ 2.66GHz 硬盘 5400转
cost: 8951 MILLISECONDS
cost: 8 MICROSECONDS per line
JDK要求
Java 8
package com.java.sort.merge;
import com.google.common.base.Charsets;
import com.google.common.base.Stopwatch;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import com.google.common.io.Files;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.AndFileFilter;
import org.apache.commons.io.filefilter.PrefixFileFilter;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.*;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
public class FileMergeSort {
private static final Logger log = LogManager.getLogger();
private static final long total = 999999L;
private static final int limit = 99999;
private static void cleanTempFiles() {
FilenameFilter filter = new AndFileFilter(ImmutableList.of(new PrefixFileFilter("sort"), new SuffixFileFilter(".part")));
ImmutableList.copyOf(FileUtils.getTempDirectory().listFiles(filter)).forEach(File::delete);
}
private static long lineNumber(File input) throws IOException {
try (Stream<String> stream = Files.newReader(input, Charsets.UTF_8).lines()) {
return stream.count();
}
}
private static File split(File input, long from, long to) throws IOException {
File part = File.createTempFile("sort", ".part");
long lineNumber = 0L;
String line = null;
try (Stream<String> stream = Files.newReader(input, Charsets.UTF_8).lines();
Writer writer = Files.newWriter(part, Charsets.UTF_8)) {
Iterator<String> lineIterator = stream.iterator();
while (lineIterator.hasNext() && lineNumber <= to) {
line = lineIterator.next();
if (lineNumber >= from) {
writer.write(line.concat(IOUtils.LINE_SEPARATOR));
}
lineNumber++;
}
}
return part;
}
private static File merge(File source, File left, File right) throws IOException {
try (Stream<String> leftStream = Files.newReader(left, Charsets.UTF_8).lines();
Stream<String> rightStream = Files.newReader(right, Charsets.UTF_8).lines();
Writer writer = Files.newWriter(source, Charsets.UTF_8)) {
PeekingIterator<String> leftPeekingIterator = Iterators.peekingIterator(leftStream.iterator());
PeekingIterator<String> rightPeekingIterator = Iterators.peekingIterator(rightStream.iterator());
String leftLine, rightLine;
writer.write("");
while (leftPeekingIterator.hasNext() && rightPeekingIterator.hasNext()) {
leftLine = leftPeekingIterator.peek();
rightLine = rightPeekingIterator.peek();
if (leftLine.compareTo(rightLine) < 0) {
writer.append(leftLine.concat(IOUtils.LINE_SEPARATOR));
leftPeekingIterator.next();
} else {
writer.append(rightLine.concat(IOUtils.LINE_SEPARATOR));
rightPeekingIterator.next();
}
}
while (leftPeekingIterator.hasNext()) {
writer.append(leftPeekingIterator.next().concat(IOUtils.LINE_SEPARATOR));
}
while (rightPeekingIterator.hasNext()) {
writer.append(rightPeekingIterator.next().concat(IOUtils.LINE_SEPARATOR));
}
}
return source;
}
private static File inMemorySort(File input) throws IOException {
TreeSet<String> lines = new TreeSet<>(String::compareTo);
try (BufferedReader reader = Files.newReader(input, Charsets.UTF_8);) {
String line;
while ((line = reader.readLine()) != null) {
lines.add(line);
}
}
FileUtils.writeLines(input, lines);
return input;
}
public static File mergeSort(File input) throws IOException {
long total = lineNumber(input);
if (total <= limit) {
return inMemorySort(input);
}
long half = total / 2L;
File left = mergeSort(split(input, 0, half));
File right = mergeSort(split(input, half + 1, total));
return merge(input, left, right);
}
@BeforeClass
public static void init() throws IOException {
cleanTempFiles();
int minLength = String.valueOf(total).length();
try (Writer writer = Files.newWriter(new File("z:\\long.txt"), Charsets.UTF_8)) {
writer.write("");
for (long i = total; i > 0L; i--) {
writer.append(Strings.padStart(String.valueOf(i), minLength, '0').concat(IOUtils.LINE_SEPARATOR));
}
}
}
@AfterClass
public static void clean() {
cleanTempFiles();
}
@Test
public void testSort() throws IOException {
Stopwatch watch = Stopwatch.createStarted();
File sorted = mergeSort(new File("z:\\long.txt"));
watch.stop();
log.info(String.format("cost: %s MILLISECONDS", watch.elapsed(TimeUnit.MILLISECONDS)));
log.info(String.format("cost: %s MICROSECONDS per line", watch.elapsed(TimeUnit.MICROSECONDS) / total));
}
}
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.java.app</groupId> <artifactId>sample</artifactId> <version>1.0-SNAPSHOT</version> <dependencies> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.4</version> </dependency> <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> <version>18.0</version> </dependency> <dependency> <groupId>org.apache.logging.log4j</groupId> <artifactId>log4j-api</artifactId> <version>2.1</version> </dependency> <dependency> <groupId>org.apache.logging.log4j</groupId> <artifactId>log4j-core</artifactId> <version>2.1</version> </dependency> <dependency> <groupId>org.apache.logging.log4j</groupId> <artifactId>log4j-jcl</artifactId> <version>2.1</version> </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.12</version> <scope>test</scope> </dependency> </dependencies> </project>