词频统计-Stream-flatmap的理解-chm

吃菜像吃草

已于 2023-04-16 12:19:19 修改

阅读量55

点赞数 1

文章标签： java 大数据

于 2023-03-27 00:15:57 首次发布

本文链接：https://blog.csdn.net/zsdxph/article/details/129787119

版权

有可以改进的地方请多多指教

第一版

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;

/**
 * First version of the word-frequency counter.
 *
 * <p>Reads the input text line by line, splits every line on runs of non-word
 * characters, counts each word (case-sensitively) in a {@link HashMap}, and
 * writes one {@code word: count} line per word, highest count first, to the
 * result file.
 */
public class Test {
    /**
     * Runs the word count.
     *
     * @param args unused
     * @throws IOException if the input file cannot be read or the result file
     *                     cannot be written
     */
    public static void main(String[] args) throws IOException {
        Map<String, Integer> map = new HashMap<>();
        // Files.lines keeps the underlying file open until the stream is closed,
        // so the stream is managed by try-with-resources.
        try (java.util.stream.Stream<String> lines =
                     Files.lines(Paths.get("F:\\IdeaProjects\\OnJava\\src\\alice.txt"))) {
            lines.filter(line -> line.length() != 0)
                    .flatMap(line -> Arrays.stream(line.split("\\W+")))
                    // split yields a leading empty string when a line starts
                    // with a non-word character; drop it.
                    .filter(s -> s.length() != 0)
                    .forEach(s -> map.merge(s, 1, Integer::sum));
        }
        List<String> collect = map.entrySet().stream()
                // Descending by count.
                .sorted((i, j) -> j.getValue().compareTo(i.getValue()))
                .map(i -> i.getKey() + ": " + i.getValue())
                .collect(Collectors.toList());
        Files.write(Paths.get("F:\\IdeaProjects\\OnJava\\src\\alice-result.txt"),
                collect, StandardCharsets.UTF_8);
    }
}

最终结果：

在这里插入图片描述

Alice’s Adventures in Wonderland原版链接：
https://www.gutenberg.org/files/11/

第二版

今天读到《Effective Java》，发现有以下几点可以改进：

forEach是终止操作中最没威力的，也是对Stream最不友好的。它是显式迭代，因而不适合并行。它只用于报告Stream计算的结果，而不是执行计算
静态导入Collectors类的成员，提高可读性
使用try-with-resources
试试并行流

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import static java.util.stream.Collectors.*;

import java.util.regex.Pattern;
import java.util.stream.Stream;
/**
 * Second version of the word-frequency counter.
 *
 * <p>Counts words case-insensitively with {@code groupingBy}/{@code counting}
 * on a parallel stream instead of mutating a map from {@code forEach}, and
 * closes the line stream with try-with-resources.
 */
public class Test {
    /** Word separator: one or more non-word characters. Compiled once, not once per line. */
    private static final Pattern NON_WORD = Pattern.compile("\\W+");

    /**
     * Runs the word count and writes one {@code word: count} line per word,
     * highest count first.
     *
     * @param args unused
     * @throws IOException if the input file cannot be read or the result file
     *                     cannot be written
     */
    public static void main(String[] args) throws IOException {
        List<String> collect;
        try (Stream<String> stream
                     = Files.lines(Paths.get("F:\\IdeaProjects\\OnJava\\src\\alice.txt"))) {
            collect = stream
                    .parallel()
                    .filter(line -> line.length() != 0)
                    .flatMap(NON_WORD::splitAsStream)
                    // A line starting with a separator yields a leading empty token.
                    .filter(s -> s.length() != 0)
                    // Locale.ROOT keeps the folding independent of the default
                    // locale (e.g. a Turkish locale maps "I" to a dotless "ı").
                    .collect(groupingBy(word -> word.toLowerCase(Locale.ROOT), counting()))
                    .entrySet().stream()
                    // Descending by count.
                    .sorted((n1, n2) -> n2.getValue().compareTo(n1.getValue()))
                    .map(entry -> entry.getKey() + ": " + entry.getValue())
                    .collect(toList());
        }

        Files.write(Paths.get("F:\\IdeaProjects\\OnJava\\src\\alice-result.txt"), collect, StandardCharsets.UTF_8);
    }
}

在这里插入图片描述

总结：

Stream pipeline编程范式的本质是使用无副作用的函数对象（函数副作用是指调用函数时，除了返回函数值之外，还对主调函数产生了附加的影响，例如修改全局变量（函数外的变量）或修改参数。）
forEach应该只报告Stream执行的计算结果，而不是让它执行计算
必须了解收集器。最重要的收集器工厂是toList、toSet、toMap、groupingBy和joining

关于groupingBy：

https://blog.csdn.net/zhouzhiwengang/article/details/112319054

第三版

在一个流中完成读取、处理、输出

import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import static java.util.stream.Collectors.*;

import java.util.regex.Pattern;
import java.util.stream.Stream;
/**
 * Third version of the word-frequency counter: reading, counting and writing
 * all happen inside a single try-with-resources statement.
 */
public class Test {
    /** Word separator: one or more non-word characters. Compiled once, not once per line. */
    private static final Pattern NON_WORD = Pattern.compile("\\W+");

    /**
     * Counts the words of {@code src/alice.txt} case-insensitively and writes
     * one {@code word: count} line per word, highest count first, to
     * {@code src/alice-result.txt}.
     *
     * @param args unused
     * @throws IOException if the input cannot be read or the result cannot be written
     */
    public static void main(String[] args) throws IOException {
        try (Stream<String> stream = Files.lines(Paths.get("src/alice.txt"));
             // Files.newBufferedWriter encodes as UTF-8 rather than the platform charset.
             PrintWriter pWriter = new PrintWriter(
                     Files.newBufferedWriter(Paths.get("src/alice-result.txt")))) {
            stream
                    .parallel()
                    .filter(line -> line.length() != 0)
                    .flatMap(NON_WORD::splitAsStream)
                    // A line starting with a separator yields a leading empty token.
                    .filter(s -> s.length() != 0)
                    // Locale.ROOT keeps the folding independent of the default
                    // locale (e.g. a Turkish locale maps "I" to a dotless "ı").
                    .collect(groupingBy(word -> word.toLowerCase(Locale.ROOT), counting()))
                    .entrySet().stream()
                    // Descending by count.
                    .sorted((n1, n2) -> n2.getValue().compareTo(n1.getValue()))
                    .map(entry -> entry.getKey() + ": " + entry.getValue())
                    .forEach(pWriter::println);
            // PrintWriter never throws on write failures; surface them explicitly.
            if (pWriter.checkError()) {
                throw new IOException("Failed to write src/alice-result.txt");
            }
        }
    }
}

new PrintWriter()的返回类型

关于扁平化的理解：

https://leetcode.cn/problems/Qv1Da2/?favorite=e8X3pBZi
把这个链表看成是树，child是左子树，next是右子树，扁平化其实就是按序列打印树的节点，按规则是root、child、next的顺序，也就是树的先序遍历，把所有节点存在list中，最后把next和prev连接起来，把child节点设为null就完成了

在这里插入图片描述

/*
// Definition for a Node.
class Node {
    public int val;
    public Node prev;
    public Node next;
    public Node child;
};
*/

/**
 * Flattens a multilevel doubly linked list by visiting it like a binary tree
 * ({@code child} as the left subtree, {@code next} as the right subtree) in
 * pre-order and relinking the visited nodes into a single level.
 */
class Solution {
    /**
     * Flattens the list starting at {@code head} in place.
     *
     * <p>The traversal state is local to each call, so calling this method
     * repeatedly on the same instance does not mix nodes of different lists.
     *
     * @param head first node of the multilevel list, may be {@code null}
     * @return {@code head}, now the first node of the flattened list, or
     *         {@code null} if {@code head} is {@code null}
     */
    public Node flatten(Node head) {
        if (head == null) return null;
        List<Node> order = new ArrayList<>();
        dfs(head, order);
        // Chain consecutive pre-order nodes together in both directions.
        for (int i = 0; i + 1 < order.size(); i++) {
            Node current = order.get(i);
            Node following = order.get(i + 1);
            current.next = following;
            following.prev = current;
        }
        return head;
    }

    /**
     * Appends {@code node} and everything reachable from it to {@code order}
     * in pre-order (node, child chain, next chain) and clears its child link.
     */
    private void dfs(Node node, List<Node> order) {
        if (node == null) return;
        order.add(node);
        dfs(node.child, order);
        // node.next is still the original link here; relinking happens in flatten.
        dfs(node.next, order);
        node.child = null;
    }
}

ConcurrentHashMap版本：

import java.io.*;
import java.nio.file.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.stream.*;

/**
 * This program demonstrates concurrent hash maps.
 *
 * @author Cay Horstmann
 * @version 1.0 2018-01-04
 */
public class CHMDemo {
    // Thread-safe map from word to occurrence count, shared by all worker tasks.
    public static ConcurrentHashMap<String, Long> map = new ConcurrentHashMap<>();

    /**
     * Adds all words in the given file to the concurrent hash map.
     *
     * <p>An {@link IOException} while opening the file is reported via its
     * stack trace and the file is skipped.
     *
     * @param file a file
     */
    public static void process(Path file) {
        try (var in = new Scanner(file)) {

            while (in.hasNext()) {
                String word = in.next();
                // merge updates the entry atomically.
                map.merge(word, 1L, Long::sum);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns all descendants of a given directory--see Chapters 1 and 2 of Volume II
     *
     * @param rootDir the root directory
     * @return a set of all descendants of the root directory
     * @throws IOException if the directory tree cannot be walked
     */
    public static Set<Path> descendants(Path rootDir) throws IOException {
        try (Stream<Path> entries = Files.walk(rootDir)) {
            return entries.collect(Collectors.toSet());
        }
    }

    /**
     * Counts the words of every {@code .java} file below {@code src} on a
     * fixed thread pool, then demonstrates the bulk operations
     * ({@code search}, {@code forEach}, {@code reduceValues}) of
     * {@link ConcurrentHashMap} on the resulting counts.
     *
     * @param args unused
     */
    public static void main(String[] args)
            throws InterruptedException, ExecutionException, IOException {
        // One worker thread per available processor.
        int processors = Runtime.getRuntime().availableProcessors();
        ExecutorService executor = Executors.newFixedThreadPool(processors);
        Path pathToRoot = Path.of("src");
        for (Path p : descendants(pathToRoot)) {
            if (p.getFileName().toString().endsWith(".java"))
                executor.execute(() -> process(p));
        }
        executor.shutdown();
        if (!executor.awaitTermination(10, TimeUnit.MINUTES)) {
            // Tasks may still be running, so the counts below can be incomplete.
            System.err.println("Timed out waiting for word counting to finish");
        }
        map.forEach((k, v) ->
        {
            if (v >= 10)
                System.out.println(k + " occurs " + v + " times");
        });

        // Bulk operations take a parallelism threshold: the operation runs in
        // parallel when the map is estimated to hold more elements than the
        // threshold. A threshold of Long.MAX_VALUE keeps it on a single thread.
        // search returns a key whose count exceeds 1000, or null if none does.
        String result = map.search(10, (k, v) -> v > 1000 ? k : null);
        if (result != null) {
            System.out.println(result + " ：" + map.get(result));
        } else {
            // ConcurrentHashMap.get(null) throws NullPointerException, so the
            // "no match" case must not be looked up.
            System.out.println("No word occurs more than 1000 times");
        }
        // forEach comes in two forms: a plain consumer, or a transformer
        // function followed by a consumer.
        map.forEach(1000,
                (k, v) -> k + "->" + v, //transformer
                System.out::println );//consumer
        map.forEach(100,
                (k,v) -> v > 1000 ? k + "->" + v : null,
                System.out::println); // the nulls are not passed to the consumer
        // Reduction: counts the words occurring more than 100 times by mapping
        // each such value to 1 and summing; the result is null if there are none.
        Long count = map.reduceValues(100,
                v -> v > 100 ? 1L : null,
                Long::sum);
        System.out.println(count);

    }
}