// Word-frequency counting implemented two ways (concurrent and single-threaded).
@RunWith(SpringRunner.class)
@SpringBootTest
public class WordCount {

    /** Pool that executes the per-chunk counting tasks submitted by {@link #run}. */
    final ForkJoinPool pool = ForkJoinPool.commonPool();

    /**
     * Concurrent word-frequency count over the file {@code "word"}, split into 20 MiB chunks.
     *
     * <p>Figures noted by the original author, in the order the run prints them
     * (recorded before chunk edges were aligned to whole words):
     * <pre>
     * time:11507ms
     * total:3646
     * 95676
     * </pre>
     */
    @Test
    public void count() throws Exception {
        System.out.println("processors:" + Runtime.getRuntime().availableProcessors());
        run("word", 1024 * 1024 * 20);
    }

    /**
     * Counts word frequencies in {@code fileName} by splitting it into byte ranges of at most
     * {@code chunkSize} bytes, counting each range in {@link #pool}, and merging the partial
     * results. Prints the number of tasks, the elapsed time, the number of distinct words and
     * the count recorded for the word {@code "ababb"}.
     *
     * @param fileName  path of the file to read
     * @param chunkSize nominal size in bytes of each range; must be positive
     * @throws IllegalArgumentException if {@code chunkSize} is not positive
     * @throws ExecutionException       if a counting task fails
     * @throws InterruptedException     if the calling thread is interrupted while waiting
     */
    public void run(String fileName, long chunkSize) throws ExecutionException, InterruptedException {
        if (chunkSize <= 0) {
            // A non-positive step would never advance the position and loop forever.
            throw new IllegalArgumentException("chunkSize must be positive: " + chunkSize);
        }
        long fileSize = new File(fileName).length();
        long t0 = System.currentTimeMillis();

        ArrayList<Future<HashMap<String, Integer>>> tasks = new ArrayList<>();
        long position = 0;
        while (position < fileSize) {
            // Compare against the remaining bytes so position + chunkSize cannot overflow.
            long next = chunkSize >= fileSize - position ? fileSize : position + chunkSize;
            tasks.add(pool.submit(new CountTask(fileName, position, next)));
            position = next;
        }
        System.out.format("split to %d tasks\n", tasks.size());

        HashMap<String, Integer> totalMap = new HashMap<>();
        for (Future<HashMap<String, Integer>> future : tasks) {
            mergeCounts(totalMap, future.get());
        }
        System.out.println("time:" + (System.currentTimeMillis() - t0) + "ms");
        System.out.println("total:" + totalMap.size());
        System.out.println(totalMap.get("ababb"));
    }

    /**
     * Counts the words of one byte range of a file.
     *
     * <p>The nominal range {@code [start, end)} is adjusted before reading: each edge that
     * falls inside a word is moved forward to the end of that word. Adjacent tasks created
     * with touching ranges therefore agree on where one stops and the next begins, and a word
     * that straddles a nominal edge is counted once, by the earlier task.
     */
    class CountTask implements Callable<HashMap<String, Integer>> {
        private final long start;
        private final long end;
        private final String fileName;

        /**
         * @param fileName path of the file to read
         * @param start    nominal first byte offset (inclusive)
         * @param end      nominal last byte offset (exclusive)
         */
        public CountTask(String fileName, long start, long end) {
            this.start = start;
            this.end = end;
            this.fileName = fileName;
        }

        /**
         * Maps the word-aligned range read-only, decodes it as US-ASCII and counts its words.
         *
         * @return word counts for this task's range; empty if the aligned range is empty
         * @throws IllegalArgumentException if the nominal range is negative or inverted, or if
         *                                  the aligned range is too large for a single mapping
         * @throws IOException              if the file cannot be opened or read
         */
        @Override
        public HashMap<String, Integer> call() throws Exception {
            if (this.start < 0 || this.end < this.start) {
                throw new IllegalArgumentException(
                        "invalid byte range [" + this.start + ", " + this.end + ")");
            }
            // Open read-only: "rw" would demand write permission and create a missing file.
            try (RandomAccessFile file = new RandomAccessFile(this.fileName, "r");
                 FileChannel channel = file.getChannel()) {
                long fileSize = channel.size();
                long from = alignToWordBoundary(file, this.start, fileSize);
                long to = alignToWordBoundary(file, this.end, fileSize);
                if (from >= to) {
                    // The whole nominal range sat inside one word owned by an earlier task.
                    return new HashMap<>();
                }
                MappedByteBuffer mbuf = channel.map(FileChannel.MapMode.READ_ONLY, from, to - from);
                String str = StandardCharsets.US_ASCII.decode(mbuf).toString();
                return countByString(str);
            }
        }
    }

    /**
     * Single-threaded word-frequency count over the file {@code "word"}, for comparison with
     * {@link #count()}. Reads 4 KiB at a time and decodes as US-ASCII, the same charset the
     * concurrent path uses. A word that is cut by the end of one read is held back and joined
     * with the next read so it is counted once.
     *
     * <p>Figures noted by the original author, in the order the run prints them
     * (recorded before words cut by a read boundary were rejoined):
     * <pre>
     * time:34247ms
     * 95574
     * 3905
     * </pre>
     *
     * @throws IOException if the file cannot be opened or read
     */
    @Test
    public void compare_with_single() throws IOException {
        try (BufferedInputStream in = new BufferedInputStream(new FileInputStream("word"))) {
            byte[] buf = new byte[4 * 1024];
            int len;
            HashMap<String, Integer> total = new HashMap<String, Integer>();
            String carry = "";
            long t0 = System.currentTimeMillis();
            while ((len = in.read(buf)) != -1) {
                String text = carry + new String(buf, 0, len, StandardCharsets.US_ASCII);
                // Walk back to just after the last delimiter; whatever follows may be the
                // first half of a word that continues in the next read.
                int cut = text.length();
                while (cut > 0 && !isDelimiter(text.charAt(cut - 1))) {
                    cut--;
                }
                mergeCounts(total, countByString(text.substring(0, cut)));
                carry = text.substring(cut);
            }
            // End of file: the held-back text, if any, is a complete final word.
            mergeCounts(total, countByString(carry));
            System.out.println("time:" + (System.currentTimeMillis() - t0) + "ms");
            System.out.println(total.get("ababb"));
            System.out.println(total.size());
        }
    }

    /**
     * Splits {@code str} on the default {@link StringTokenizer} delimiters and counts each token.
     *
     * @param str text to tokenize
     * @return a new map from word to number of occurrences in {@code str}
     */
    private HashMap<String, Integer> countByString(String str) {
        HashMap<String, Integer> map = new HashMap<>();
        StringTokenizer tokenizer = new StringTokenizer(str);
        while (tokenizer.hasMoreTokens()) {
            incKey(tokenizer.nextToken(), map, 1);
        }
        return map;
    }

    /**
     * Adds {@code n} to the count stored under {@code key}, starting from {@code n} when the
     * key is not present yet.
     */
    private void incKey(String key, HashMap<String, Integer> map, int n) {
        map.merge(key, n, Integer::sum);
    }

    /** Adds every count in {@code source} to the matching entry of {@code target}. */
    private void mergeCounts(HashMap<String, Integer> target, Map<String, Integer> source) {
        for (Map.Entry<String, Integer> entry : source.entrySet()) {
            incKey(entry.getKey(), target, entry.getValue());
        }
    }

    /**
     * Returns the nearest offset at or after {@code position} that does not fall inside a word.
     * Offsets at or before 0 map to 0, and offsets at or past {@code fileSize} map to
     * {@code fileSize}. The scan reads one byte at a time, which is acceptable because it
     * stops at the first delimiter.
     *
     * @param file     open file to inspect; its file pointer is moved
     * @param position byte offset to align
     * @param fileSize size of the file in bytes
     * @throws IOException if reading the file fails
     */
    private static long alignToWordBoundary(RandomAccessFile file, long position, long fileSize)
            throws IOException {
        if (position <= 0) {
            return 0;
        }
        if (position >= fileSize) {
            return fileSize;
        }
        file.seek(position - 1);
        if (isDelimiter(file.read())) {
            // The preceding byte ends any earlier word, so this offset is already clean.
            return position;
        }
        long aligned = position;
        int b;
        while ((b = file.read()) != -1 && !isDelimiter(b)) {
            aligned++;
        }
        return aligned;
    }

    /** Tells whether {@code c} is one of the default {@link StringTokenizer} delimiters. */
    private static boolean isDelimiter(int c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
    }
}