目标:使用单线程与多线程分别进行词频计算
随机生成数据文件代码:
@Test
public void gen() throws IOException {
    // Generates ~60 MB of test data: 10 million random 5-letter words over 'a'..'e',
    // each followed by a single space.
    var random = new Random();
    // NOTE(review): the counting code below reads "test" (lower case). The original
    // wrote "Test", which only works on case-insensitive file systems — use the same
    // name everywhere.
    var fileName = new File("test");
    var startTime = System.currentTimeMillis();
    // try-with-resources guarantees the stream is flushed and closed even if a write fails.
    try (var out = new BufferedOutputStream(new FileOutputStream(fileName), 4 * 1024)) {
        for (int i = 0; i < 10_000_000; i++) {
            for (int j = 0; j < 5; j++) {
                out.write('a' + random.nextInt(5));  // random letter 'a'..'e' (was 97 + n)
            }
            out.write(' ');
        }
    }
    System.out.println(System.currentTimeMillis() - startTime);
}
单线程代码:
package com.miracle.study.wordcount;
import org.junit.jupiter.api.Test;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.StringTokenizer;
/**
* @author Miracle
* @date 2021/4/3 19:26
*/
public class WorldCount {

    /**
     * Single-threaded word count over the file "test" using classic blocking I/O.
     *
     * Fixes over the original: only the {@code len} bytes actually read are decoded
     * (the original decoded the whole buffer, re-counting stale bytes from the previous
     * read), and a token cut in half at a buffer boundary is carried into the next
     * chunk instead of being counted as two different words.
     *
     * @throws IOException if the file cannot be read
     */
    @Test
    public void countFileWorld() throws IOException {
        var buf = new byte[1024 * 8];
        var map = new HashMap<String, Integer>();
        try (var inBuffer = new BufferedInputStream(new FileInputStream("test"), 1024 * 8)) {
            var carry = "";  // possibly-incomplete token left over from the previous chunk
            int len;
            while ((len = inBuffer.read(buf)) != -1) {
                // Decode only the bytes of this read; the generated data is pure ASCII.
                var text = carry + new String(buf, 0, len, java.nio.charset.StandardCharsets.US_ASCII);
                carry = countChunk(text, map, len > 0 && Character.isWhitespace((char) buf[len - 1]));
            }
            if (!carry.isEmpty()) {
                map.merge(carry, 1, Integer::sum);  // final token of the file
            }
        }
        System.out.println(map.get("dabac"));
    }

    /**
     * NIO variant; produces the same result as {@link #countFileWorld()}.
     *
     * @throws IOException if the file cannot be read
     */
    @Test
    public void nioCountFileWorld() throws IOException {
        var buf = ByteBuffer.allocate(1024 * 8);
        var map = new HashMap<String, Integer>();
        // try-with-resources closes the channel; the explicit close() the original
        // called inside the try block was redundant.
        try (var inFileChannel = new FileInputStream("test").getChannel()) {
            var carry = "";
            while (inFileChannel.read(buf) != -1) {
                buf.flip();
                int limit = buf.limit();
                // Decode only [0, limit) — the original decoded the whole backing array.
                var text = carry + new String(buf.array(), 0, limit, java.nio.charset.StandardCharsets.US_ASCII);
                carry = countChunk(text, map, limit > 0 && Character.isWhitespace((char) buf.get(limit - 1)));
                buf.clear();
            }
            if (!carry.isEmpty()) {
                map.merge(carry, 1, Integer::sum);
            }
        }
        System.out.println(map.get("dabac"));
    }

    /**
     * Tokenizes {@code text} and counts every complete token into {@code map}.
     * If {@code endsOnBoundary} is false the last token may have been truncated by the
     * read buffer, so it is returned as carry-over instead of being counted; otherwise
     * returns "".
     */
    static String countChunk(String text, HashMap<String, Integer> map, boolean endsOnBoundary) {
        var tokenizer = new StringTokenizer(text);
        var pending = "";
        while (tokenizer.hasMoreTokens()) {
            var word = tokenizer.nextToken();
            if (!tokenizer.hasMoreTokens() && !endsOnBoundary) {
                pending = word;  // may be cut off; complete it with the next chunk
            } else {
                map.merge(word, 1, Integer::sum);
            }
        }
        return pending;
    }
}
多线程模式
知识点
- callable:创建多线程任务,需要实现 Callable 接口并重写其 call 方法,可以用 Future 异步接收返回数据。
- RandomAccessFile:支持指定位置读,指定位置写。
- ForkJoinPool:工作窃取算法,分配了与线程数相等的队列,任务平均分配到对应的队列上,并且线程完成所有任务时会从其他队列末端窃取任务执行。
callable任务:
package com.miracle.study.wordcount;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.concurrent.Callable;
/**
* @author Miracle
* @date 2021/4/3 22:26
*/
public class MulCountTask implements Callable<HashMap<String, Integer>> {

    /** Path of the file to count. */
    private final String fileName;
    /** Inclusive byte offset where this task's slice starts. */
    private final long start;
    /** Exclusive byte offset where this task's slice ends. */
    private final long end;

    /**
     * @param fileName path of the file to count
     * @param start    byte offset at which this task starts reading (inclusive)
     * @param end      byte offset at which this task stops reading (exclusive)
     */
    public MulCountTask(String fileName, long start, long end) {
        this.fileName = fileName;
        this.start = start;
        this.end = end;
    }

    /**
     * Counts word frequencies (whitespace-delimited tokens) in the [start, end) slice
     * of the file and returns the per-slice map.
     *
     * NOTE(review): a word that straddles a slice boundary is counted as two fragments;
     * the caller must align slice boundaries with whitespace. The demo's generated data
     * uses fixed 6-byte records (5 letters + a space) and 30 MB chunks, which happen to
     * align — confirm before reusing with other data.
     *
     * @return map from word to its number of occurrences within the slice
     * @throws Exception if the file cannot be opened or mapped
     */
    @Override
    public HashMap<String, Integer> call() throws Exception {
        // Open read-only: a READ_ONLY mapping does not need write access, and "rw"
        // (as in the original) would fail on a write-protected file.
        try (var channel = new RandomAccessFile(this.fileName, "r").getChannel()) {
            // Memory-map exactly this task's slice of the file.
            var mbuf = channel.map(FileChannel.MapMode.READ_ONLY, this.start, this.end - this.start);
            // The generated data is plain ASCII, so US_ASCII decoding is safe here.
            var str = StandardCharsets.US_ASCII.decode(mbuf).toString();
            var map = new HashMap<String, Integer>();
            var tokenizer = new StringTokenizer(str);
            while (tokenizer.hasMoreTokens()) {
                // merge() replaces the original containsKey/get/put sequence with one lookup.
                map.merge(tokenizer.nextToken(), 1, Integer::sum);
            }
            return map;
        }
    }
}
主线程执行
package com.miracle.study.wordcount;
import org.junit.jupiter.api.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
/**
* @author Miracle
* @date 2021/4/3 22:08
*/
public class MulWorldCount {

    /** Shared work-stealing pool, sized to the number of available processors. */
    private final ForkJoinPool pool = ForkJoinPool.commonPool();

    /**
     * Multi-threaded word count: splits the file "test" into 30 MB chunks, counts each
     * chunk in parallel via {@link MulCountTask}, then merges the per-chunk maps.
     *
     * NOTE(review): a word straddling a chunk boundary would be counted as two
     * fragments. The generated file consists of fixed 6-byte records (5 letters plus a
     * space) and 30 MB is a multiple of 6, so boundaries happen to fall between words
     * here — confirm the alignment before reusing this with other data.
     *
     * @throws ExecutionException   if a counting task failed
     * @throws InterruptedException if interrupted while waiting for a task
     */
    @Test
    public void mulCount() throws ExecutionException, InterruptedException {
        var file = new File("test");
        // Total file size in bytes.
        var fileSize = file.length();
        // Next unread byte offset.
        var position = 0L;
        // Bytes handled by each task ("chumSize" typo fixed).
        var chunkSize = 1024 * 1024 * 30;
        // Futures for the per-chunk frequency maps, in submission order.
        var tasks = new ArrayList<Future<HashMap<String, Integer>>>();
        while (position < fileSize) {
            // Each task covers [position, next); the last chunk may be shorter.
            var next = Math.min(fileSize, position + chunkSize);
            // getPath() (not getName()) keeps any directory component of the path.
            tasks.add(pool.submit(new MulCountTask(file.getPath(), position, next)));
            position = next;
        }
        var totalMap = new HashMap<String, Integer>();
        // get() blocks until each task finishes; merge the partial counts into the total.
        for (var task : tasks) {
            for (var entry : task.get().entrySet()) {
                totalMap.merge(entry.getKey(), entry.getValue(), Integer::sum);
            }
        }
        System.out.println(totalMap.get("dabac"));
    }
}