import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Counts how often each word occurs in a text file and prints the words ordered by
 * descending frequency.
 *
 * <p>The file is consumed chunk by chunk through {@link MappedBiggerFileReader}, so only one
 * chunk plus the word that straddles a chunk boundary is buffered at any time.
 */
public class CountWordsOfArticle {

    /** Word separators: the punctuation set {@code , . ; ! ?} plus any whitespace. */
    private static final Pattern WORD_SEPARATORS = Pattern.compile("[,.;!?\\s]+");

    /**
     * Prints one {@code "<word> <count>"} line per distinct word of the file to standard output,
     * most frequent word first; words with equal counts are listed alphabetically.
     *
     * <p>Words are compared case-insensitively (lower-cased with {@link Locale#ROOT}). The file
     * is decoded as UTF-8; chunks are only cut at ASCII separator bytes, which never occur
     * inside a multi-byte UTF-8 sequence, so no character is split across chunks.
     *
     * @param fileName  path of the text file to analyse
     * @param arraySize chunk size in bytes used for reading; must be positive
     * @throws IOException              if the file cannot be opened, mapped or read
     * @throws IllegalArgumentException if {@code arraySize} is not positive
     */
    public void countWordsOfArticle(String fileName, int arraySize) throws IOException {
        File file = new File(fileName);
        if (!file.exists()) {
            System.out.println("该文件不存在");
            return;
        }
        Map<String, Integer> counts = new HashMap<>();
        // Bytes of the (possibly incomplete) word at the end of the previous chunk.
        ByteArrayOutputStream pending = new ByteArrayOutputStream();
        try (MappedBiggerFileReader reader = new MappedBiggerFileReader(fileName, arraySize)) {
            while (reader.read() != -1) {
                byte[] chunk = reader.getArray();
                int cut = lastSeparatorIndex(chunk);
                if (cut < 0) {
                    // No separator in this chunk: the current word is still growing.
                    pending.write(chunk, 0, chunk.length);
                    continue;
                }
                // Everything up to the last separator consists of complete words.
                pending.write(chunk, 0, cut);
                tally(new String(pending.toByteArray(), StandardCharsets.UTF_8), counts);
                pending.reset();
                pending.write(chunk, cut + 1, chunk.length - cut - 1);
            }
        }
        // The file may end without a trailing separator.
        tally(new String(pending.toByteArray(), StandardCharsets.UTF_8), counts);
        printByFrequency(counts);
    }

    /** Returns the index of the last separator byte in {@code data}, or -1 if there is none. */
    private static int lastSeparatorIndex(byte[] data) {
        for (int i = data.length - 1; i >= 0; i--) {
            if (isSeparator(data[i])) {
                return i;
            }
        }
        return -1;
    }

    /** Byte-level counterpart of {@link #WORD_SEPARATORS}; every separator is a single ASCII byte. */
    private static boolean isSeparator(byte b) {
        switch (b) {
            case ',':
            case '.':
            case ';':
            case '!':
            case '?':
            case ' ':
            case '\t':
            case '\n':
            case 0x0B:
            case '\f':
            case '\r':
                return true;
            default:
                return false;
        }
    }

    /** Adds every non-empty, lower-cased word of {@code text} to {@code counts}. */
    private static void tally(String text, Map<String, Integer> counts) {
        for (String token : WORD_SEPARATORS.split(text)) {
            // split() yields an empty leading token when the text starts with a separator.
            if (!token.isEmpty()) {
                counts.merge(token.toLowerCase(Locale.ROOT), 1, Integer::sum);
            }
        }
    }

    /** Prints the entries by descending count, breaking ties by word so the output is stable. */
    private static void printByFrequency(Map<String, Integer> counts) {
        Comparator<Map.Entry<String, Integer>> order =
                Map.Entry.<String, Integer>comparingByValue().reversed()
                        .thenComparing(Map.Entry.<String, Integer>comparingByKey());
        List<Map.Entry<String, Integer>> result = counts.entrySet().stream()
                .sorted(order)
                .collect(Collectors.toList());
        result.forEach(item -> System.out.println(item.getKey() + " " + item.getValue()));
    }

    /**
     * Reads a file of any size in fixed-size chunks through memory-mapped regions.
     *
     * <p>NOTE: although this class extends {@link InputStream}, {@link #read()} keeps its
     * chunk-oriented meaning: it loads the next chunk (available through {@link #getArray()})
     * and returns the chunk length, not a single byte value. Do not hand an instance to code
     * that expects the generic {@code InputStream} contract (e.g. {@code InputStreamReader}).
     */
    public class MappedBiggerFileReader extends InputStream {
        private MappedByteBuffer[] mappedBufArray;
        private int count = 0;
        private int number;
        private FileInputStream fileIn;
        private long fileLength;
        private int arraySize;
        private byte[] array;

        /**
         * Opens the file and maps it into consecutive regions of at most
         * {@code Integer.MAX_VALUE} bytes each.
         *
         * @param fileName  path of the file to read
         * @param arraySize maximum number of bytes returned per {@link #read()}; must be positive
         * @throws IOException              if the file cannot be opened or mapped
         * @throws IllegalArgumentException if {@code arraySize} is not positive
         */
        public MappedBiggerFileReader(String fileName, int arraySize) throws IOException {
            if (arraySize <= 0) {
                // A zero-sized chunk would make read() return 0 forever.
                throw new IllegalArgumentException("arraySize must be positive: " + arraySize);
            }
            this.arraySize = arraySize;
            this.fileIn = new FileInputStream(fileName);
            try {
                FileChannel fileChannel = fileIn.getChannel();
                this.fileLength = fileChannel.size();
                this.number = (int) Math.ceil((double) fileLength / (double) Integer.MAX_VALUE);
                this.mappedBufArray = new MappedByteBuffer[number]; // one entry per mapped region
                long preLength = 0;
                long regionSize = (long) Integer.MAX_VALUE; // size of the region being mapped
                for (int i = 0; i < number; i++) {
                    if (fileLength - preLength < (long) Integer.MAX_VALUE) {
                        regionSize = fileLength - preLength; // the last region is shorter
                    }
                    mappedBufArray[i] =
                            fileChannel.map(FileChannel.MapMode.READ_ONLY, preLength, regionSize);
                    preLength += regionSize; // start offset of the next region
                }
            } catch (IOException | RuntimeException e) {
                // Do not leak the descriptor when sizing or mapping fails.
                try {
                    fileIn.close();
                } catch (IOException closeFailure) {
                    e.addSuppressed(closeFailure);
                }
                throw e;
            }
        }

        /**
         * Loads the next chunk into the array returned by {@link #getArray()}.
         *
         * @return the number of bytes in the chunk, or -1 once every region is exhausted
         */
        @Override
        public int read() throws IOException {
            if (count >= number) {
                return -1;
            }
            MappedByteBuffer current = mappedBufArray[count];
            int remaining = current.remaining();
            if (remaining > arraySize) {
                array = new byte[arraySize];
                current.get(array);
                return arraySize;
            }
            // Final chunk of this region: take what is left and move on to the next region.
            array = new byte[remaining];
            current.get(array);
            count++;
            return remaining;
        }

        /** Closes the underlying file stream and drops the last chunk. */
        @Override
        public void close() throws IOException {
            fileIn.close();
            array = null;
        }

        /** Returns the chunk loaded by the most recent {@link #read()}, or null after close. */
        public byte[] getArray() {
            return array;
        }

        /** Returns the file size in bytes as reported when the file was opened. */
        public long getFileLength() {
            return fileLength;
        }
    }
}
1. 文件字节流——测试代码如下:
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * Reads a file in fixed-size chunks through a {@link BufferedInputStream}.
 *
 * <p>Each {@link #read()} loads the next chunk, which is then available via {@link #getArray()}.
 */
public class StreamFileReader implements AutoCloseable {
    private BufferedInputStream fileIn;
    private long fileLength;
    private int arraySize;
    private byte[] array;
    /** Reusable read target so that a full-size scratch array is not allocated per call. */
    private byte[] scratch;

    /**
     * Opens the file for chunked reading.
     *
     * @param fileName  path of the file to read
     * @param arraySize chunk size in bytes (also used as the stream buffer size); must be positive
     * @throws IOException              if the file cannot be opened or its size cannot be read
     * @throws IllegalArgumentException if {@code arraySize} is not positive
     */
    public StreamFileReader(String fileName, int arraySize) throws IOException {
        if (arraySize <= 0) {
            // Reject before opening the file so no descriptor is leaked.
            throw new IllegalArgumentException("arraySize must be positive: " + arraySize);
        }
        FileInputStream rawIn = new FileInputStream(fileName);
        try {
            // available() is an int estimate and is wrong for files over 2 GiB;
            // the channel reports the real size as a long.
            this.fileLength = rawIn.getChannel().size();
        } catch (IOException | RuntimeException e) {
            try {
                rawIn.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.fileIn = new BufferedInputStream(rawIn, arraySize);
        this.arraySize = arraySize;
        this.scratch = new byte[arraySize];
    }

    /**
     * Loads the next chunk into the array returned by {@link #getArray()}.
     *
     * @return the number of bytes read, or -1 at end of file
     */
    public int read() throws IOException {
        int bytes = fileIn.read(scratch); // stage the data in the scratch array
        if (bytes == -1) {
            return -1;
        }
        array = new byte[bytes]; // exactly as long as the data that was read
        System.arraycopy(scratch, 0, array, 0, bytes);
        return bytes;
    }

    /** Closes the underlying stream and drops the last chunk. */
    @Override
    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    /** Returns the chunk loaded by the most recent {@link #read()}, or null after close. */
    public byte[] getArray() {
        return array;
    }

    /** Returns the file size in bytes as reported when the file was opened. */
    public long getFileLength() {
        return fileLength;
    }

    /**
     * Times a full sequential read and prints the elapsed nanoseconds.
     *
     * @param args optional: {@code args[0]} is the file to read; defaults to the original
     *             benchmark path when absent
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : "/home/zfh/movie.mkv";
        StreamFileReader reader = new StreamFileReader(path, 65536);
        long start;
        long end;
        try {
            start = System.nanoTime();
            while (reader.read() != -1) {
                // drain the file; only the elapsed time matters
            }
            end = System.nanoTime();
        } finally {
            reader.close();
        }
        System.out.println("StreamFileReader: " + (end - start));
    }
}
2. 文件通道——测试代码如下:
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
/**
 * Reads a file in fixed-size chunks through a {@link FileChannel} and a heap
 * {@link ByteBuffer}.
 *
 * <p>Each {@link #read()} loads the next chunk, which is then available via {@link #getArray()}.
 */
public class ChannelFileReader implements AutoCloseable {
    private FileInputStream fileIn;
    private ByteBuffer byteBuf;
    private long fileLength;
    private int arraySize;
    private byte[] array;

    /**
     * Opens the file for chunked reading.
     *
     * @param fileName  path of the file to read
     * @param arraySize chunk size in bytes; must be positive
     * @throws IOException              if the file cannot be opened or its size cannot be read
     * @throws IllegalArgumentException if {@code arraySize} is not positive
     */
    public ChannelFileReader(String fileName, int arraySize) throws IOException {
        if (arraySize <= 0) {
            // A zero-capacity buffer makes every channel read return 0, so callers looping
            // until -1 would never terminate; reject before opening the file.
            throw new IllegalArgumentException("arraySize must be positive: " + arraySize);
        }
        this.arraySize = arraySize;
        this.byteBuf = ByteBuffer.allocate(arraySize);
        this.fileIn = new FileInputStream(fileName);
        try {
            this.fileLength = fileIn.getChannel().size();
        } catch (IOException | RuntimeException e) {
            try {
                fileIn.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Loads the next chunk into the array returned by {@link #getArray()}.
     *
     * @return the number of bytes read, or -1 at end of file
     */
    public int read() throws IOException {
        FileChannel fileChannel = fileIn.getChannel();
        int bytes = fileChannel.read(byteBuf); // fill the ByteBuffer from the channel
        if (bytes == -1) {
            return -1;
        }
        array = new byte[bytes]; // exactly as long as the data that was read
        byteBuf.flip();
        byteBuf.get(array); // copy the buffered bytes out
        byteBuf.clear();    // make the buffer ready for the next channel read
        return bytes;
    }

    /** Closes the underlying stream (and with it the channel) and drops the last chunk. */
    @Override
    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    /** Returns the chunk loaded by the most recent {@link #read()}, or null after close. */
    public byte[] getArray() {
        return array;
    }

    /** Returns the file size in bytes as reported when the file was opened. */
    public long getFileLength() {
        return fileLength;
    }

    /**
     * Times a full sequential read and prints the elapsed nanoseconds.
     *
     * @param args optional: {@code args[0]} is the file to read; defaults to the original
     *             benchmark path when absent
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : "/home/zfh/movie.mkv";
        ChannelFileReader reader = new ChannelFileReader(path, 65536);
        long start;
        long end;
        try {
            start = System.nanoTime();
            while (reader.read() != -1) {
                // drain the file; only the elapsed time matters
            }
            end = System.nanoTime();
        } finally {
            reader.close();
        }
        System.out.println("ChannelFileReader: " + (end - start));
    }
}
3. 内存文件映射——测试代码如下:
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
/**
 * Reads a file of any size in fixed-size chunks through memory-mapped regions.
 *
 * <p>A single mapping is limited to {@code Integer.MAX_VALUE} bytes, so the file is split into
 * consecutive regions of at most that size. Each {@link #read()} loads the next chunk, which is
 * then available via {@link #getArray()}.
 */
public class MappedBiggerFileReader implements AutoCloseable {
    private MappedByteBuffer[] mappedBufArray;
    private int count = 0;
    private int number;
    private FileInputStream fileIn;
    private long fileLength;
    private int arraySize;
    private byte[] array;

    /**
     * Opens the file and maps all of its regions read-only.
     *
     * @param fileName  path of the file to read
     * @param arraySize maximum number of bytes returned per {@link #read()}; must be positive
     * @throws IOException              if the file cannot be opened or mapped
     * @throws IllegalArgumentException if {@code arraySize} is not positive
     */
    public MappedBiggerFileReader(String fileName, int arraySize) throws IOException {
        if (arraySize <= 0) {
            // A zero-sized chunk would make read() return 0 forever.
            throw new IllegalArgumentException("arraySize must be positive: " + arraySize);
        }
        this.arraySize = arraySize;
        this.fileIn = new FileInputStream(fileName);
        try {
            FileChannel fileChannel = fileIn.getChannel();
            this.fileLength = fileChannel.size();
            this.number = (int) Math.ceil((double) fileLength / (double) Integer.MAX_VALUE);
            this.mappedBufArray = new MappedByteBuffer[number]; // one entry per mapped region
            long preLength = 0;
            long regionSize = (long) Integer.MAX_VALUE; // size of the region being mapped
            for (int i = 0; i < number; i++) {
                if (fileLength - preLength < (long) Integer.MAX_VALUE) {
                    regionSize = fileLength - preLength; // the last region is shorter
                }
                mappedBufArray[i] =
                        fileChannel.map(FileChannel.MapMode.READ_ONLY, preLength, regionSize);
                preLength += regionSize; // start offset of the next region
            }
        } catch (IOException | RuntimeException e) {
            // Do not leak the descriptor when sizing or mapping fails.
            try {
                fileIn.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Loads the next chunk into the array returned by {@link #getArray()}.
     *
     * @return the number of bytes in the chunk, or -1 once every region is exhausted
     */
    public int read() throws IOException {
        if (count >= number) {
            return -1;
        }
        MappedByteBuffer current = mappedBufArray[count];
        int remaining = current.remaining();
        if (remaining > arraySize) {
            array = new byte[arraySize];
            current.get(array);
            return arraySize;
        }
        // Final chunk of this region: take what is left and move on to the next region.
        array = new byte[remaining];
        current.get(array);
        count++;
        return remaining;
    }

    /** Closes the underlying file stream and drops the last chunk. */
    @Override
    public void close() throws IOException {
        fileIn.close();
        array = null;
    }

    /** Returns the chunk loaded by the most recent {@link #read()}, or null after close. */
    public byte[] getArray() {
        return array;
    }

    /** Returns the file size in bytes as reported when the file was opened. */
    public long getFileLength() {
        return fileLength;
    }

    /**
     * Times a full sequential read and prints the elapsed nanoseconds.
     *
     * @param args optional: {@code args[0]} is the file to read; defaults to the original
     *             benchmark path when absent
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : "/home/zfh/movie.mkv";
        MappedBiggerFileReader reader = new MappedBiggerFileReader(path, 65536);
        long start;
        long end;
        try {
            start = System.nanoTime();
            while (reader.read() != -1) {
                // drain the file; only the elapsed time matters
            }
            end = System.nanoTime();
        } finally {
            reader.close();
        }
        System.out.println("MappedBiggerFileReader: " + (end - start));
    }
}
运行结果比较
用上面三种方法读取 1GB 文件,运行结果如下(单位:纳秒,由 System.nanoTime() 计时):
StreamFileReader: 11494900386
ChannelFileReader: 11329346316
MappedBiggerFileReader: 11169097480
读取 10GB 文件,运行结果如下(单位:纳秒):
StreamFileReader: 194579779394
ChannelFileReader: 190430242497
MappedBiggerFileReader: 186923035795
原文链接:https://blog.csdn.net/xiaofeng10330111/article/details/87958174