词频计算两种方式

两种实现:ForkJoinPool 并发分块统计与单线程顺序统计

词频计算两种方式

/**
 * Counts word frequencies in a whitespace-separated text file in two ways so
 * their running times can be compared:
 * <ul>
 *   <li>{@link #count()} splits the file into chunks and counts them in parallel
 *       on the common {@link ForkJoinPool};</li>
 *   <li>{@link #compare_with_single()} streams the file on the calling thread.</li>
 * </ul>
 *
 * <p>Both paths decode bytes as US-ASCII and tokenize with {@link StringTokenizer}'s
 * default delimiters (space, tab, newline, carriage return, form feed). Both
 * take care never to cut a word in half at a chunk or buffer boundary, so they
 * produce the same counts for the same file.
 */
@RunWith(SpringRunner.class)
@SpringBootTest
public class WordCount {

    final ForkJoinPool pool = ForkJoinPool.commonPool();


    // Concurrent word-frequency count.
    // Figures the original author recorded (before the boundary fix below):
    // time:11507ms
    // total:3646
    // 95676
    @Test
    public void count() throws Exception {
        System.out.println("processors:"+Runtime.getRuntime().availableProcessors());
        run("word",1024*1024*20);
    }

    /**
     * Counts the words of {@code fileName} in parallel and prints the elapsed
     * time, the number of distinct words and the count of the word "ababb".
     *
     * <p>The file is divided into consecutive byte ranges of at most
     * {@code chunkSize} bytes; each range is handed to a {@link CountTask} and
     * the per-task maps are summed on the calling thread.
     *
     * @param fileName  path of the file to read
     * @param chunkSize nominal size of each chunk in bytes; must be positive.
     *                  Each chunk is memory-mapped as a single buffer, so it must
     *                  also stay within what {@link FileChannel#map} accepts
     *                  (at most {@link Integer#MAX_VALUE} bytes per mapping)
     * @throws IllegalArgumentException if {@code chunkSize} is not positive
     * @throws ExecutionException       if any chunk task fails
     * @throws InterruptedException     if interrupted while waiting for a task
     */
    public void run(String fileName, long chunkSize) throws ExecutionException, InterruptedException {
        if (chunkSize <= 0) {
            // A non-positive step would never advance the position below.
            throw new IllegalArgumentException("chunkSize must be positive: " + chunkSize);
        }
        long fileSize = new File(fileName).length();

        long t0 = System.currentTimeMillis();
        ArrayList<Future<HashMap<String, Integer>>> tasks = new ArrayList<>();

        long position = 0;
        while (position < fileSize) {
            // Written as a subtraction so a huge chunkSize cannot overflow.
            long next = (fileSize - position <= chunkSize) ? fileSize : position + chunkSize;
            tasks.add(pool.submit(new CountTask(fileName, position, next)));
            position = next;
        }

        System.out.format("split to %d tasks\n",tasks.size());
        HashMap<String, Integer> totalMap = new HashMap<>();
        for (Future<HashMap<String, Integer>> future : tasks) {
            mergeInto(totalMap, future.get());
        }

        System.out.println("time:"+(System.currentTimeMillis() - t0)+"ms");
        System.out.println("total:"+totalMap.size());
        System.out.println(totalMap.get("ababb"));

    }

    /**
     * Counts the words in one byte range {@code [start, end)} of a file.
     *
     * <p>The nominal range is adjusted at both edges so that a word straddling
     * an edge is counted exactly once: an edge that falls inside a word is moved
     * forward to the next delimiter. Adjacent tasks apply the same rule to their
     * shared edge, so together they still cover the file exactly once.
     */
    class CountTask implements Callable<HashMap<String, Integer>> {
        private final long start;
        private final long end;
        private final String fileName;

        public CountTask(String fileName, long start, long end) {
            this.start = start;
            this.end = end;
            this.fileName = fileName;
        }

        /**
         * @return word counts for this task's (boundary-aligned) range; empty if
         *         the aligned range contains no bytes
         * @throws Exception if the file cannot be opened, read or mapped
         */
        @Override
        public HashMap<String, Integer> call() throws Exception {
            // Opened read-only: the mapping is READ_ONLY, and "rw" would create a
            // missing file and require write permission for no reason.
            try (RandomAccessFile file = new RandomAccessFile(this.fileName, "r");
                 FileChannel channel = file.getChannel()) {
                long fileSize = file.length();
                long from = alignToTokenBoundary(file, this.start, fileSize);
                long to = alignToTokenBoundary(file, this.end, fileSize);
                if (to <= from) {
                    return new HashMap<>();
                }
                // Map just this task's region of the file into memory.
                MappedByteBuffer mbuf = channel.map(
                        FileChannel.MapMode.READ_ONLY, from, to - from
                );
                // Decode the bytes to text.
                String str = StandardCharsets.US_ASCII.decode(mbuf).toString();
                return countByString(str);
            }
        }
    }


    // Single-threaded baseline.
    // Figures the original author recorded (before the boundary fix below):
    // time:34247ms
    // 95574
    // 3905
    /**
     * Counts the words of the file "word" on the calling thread, reading it in
     * 4 KiB pieces, and prints the elapsed time, the count of "ababb" and the
     * number of distinct words.
     *
     * <p>A word that is still unfinished at the end of one read is carried over
     * and prepended to the next read instead of being counted as two fragments.
     *
     * @throws IOException if the file cannot be opened or read
     */
    @Test
    public void compare_with_single() throws IOException {
        HashMap<String, Integer> total = new HashMap<String, Integer>();
        long t0;
        try (BufferedInputStream in = new BufferedInputStream(new FileInputStream("word"))) {
            byte[] buf = new byte[4 * 1024];
            String carry = "";   // trailing partial word from the previous read
            int len;
            t0 = System.currentTimeMillis();
            while ((len = in.read(buf)) != -1) {
                // Same charset as the concurrent path so both produce the same tokens.
                String text = carry + new String(buf, 0, len, StandardCharsets.US_ASCII);
                int cut = lastDelimiterIndex(text);
                if (cut < 0) {
                    // No delimiter yet: the whole text is one unfinished word.
                    carry = text;
                    continue;
                }
                mergeInto(total, countByString(text.substring(0, cut)));
                carry = text.substring(cut + 1);
            }
            // Whatever is left at end of file is a complete final word (or empty).
            mergeInto(total, countByString(carry));
        }

        System.out.println("time:" + (System.currentTimeMillis() - t0) + "ms");
        System.out.println(total.get("ababb"));
        System.out.println(total.size());
    }

    /**
     * Moves a nominal chunk edge to a position where no word is cut in half.
     *
     * @return {@code 0} or {@code fileSize} when {@code offset} lies at or beyond
     *         those ends; {@code offset} itself when the byte just before it is a
     *         delimiter; otherwise the position of the first delimiter at or
     *         after {@code offset}, or {@code fileSize} if there is none
     */
    private static long alignToTokenBoundary(RandomAccessFile file, long offset, long fileSize)
            throws IOException {
        if (offset <= 0) {
            return 0;
        }
        if (offset >= fileSize) {
            return fileSize;
        }
        long pos = offset - 1;          // start one byte early to inspect the preceding byte
        file.seek(pos);
        byte[] window = new byte[256];
        while (pos < fileSize) {
            int read = file.read(window);
            if (read < 0) {
                break;
            }
            for (int i = 0; i < read; i++) {
                if (isDelimiter(window[i])) {
                    long hit = pos + i;
                    // A delimiter right before the edge means the edge is already clean.
                    return hit == offset - 1 ? offset : hit;
                }
            }
            pos += read;
        }
        return fileSize;
    }

    /** Returns the index of the last delimiter character in {@code text}, or -1 if none. */
    private static int lastDelimiterIndex(String text) {
        for (int i = text.length() - 1; i >= 0; i--) {
            if (isDelimiter(text.charAt(i))) {
                return i;
            }
        }
        return -1;
    }

    /** Tells whether {@code c} is one of StringTokenizer's default delimiters. */
    private static boolean isDelimiter(int c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
    }

    /** Adds every count in {@code part} to the matching entry of {@code total}. */
    private void mergeInto(HashMap<String, Integer> total, HashMap<String, Integer> part) {
        for (Map.Entry<String, Integer> entry : part.entrySet()) {
            incKey(entry.getKey(), total, entry.getValue());
        }
    }

    /** Tokenizes {@code str} on the default delimiters and counts each token. */
    private HashMap<String, Integer> countByString(String str) {
        HashMap<String, Integer> map = new HashMap<>();
        StringTokenizer tokenizer = new StringTokenizer(str);
        while (tokenizer.hasMoreTokens()) {
            incKey(tokenizer.nextToken(), map, 1);
        }
        return map;
    }

    /** Increases the count stored for {@code key} by {@code n}, starting from zero if absent. */
    private void incKey(String key, HashMap<String, Integer> map, int n) {
        map.merge(key, n, Integer::sum);
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值