import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;
public class WordCountTopology {
/**
 * Entry point: wires the WordCount topology (spout -> split bolt -> count bolt)
 * and submits it either to a remote cluster (when a topology name is passed as
 * args[0]) or to an in-process LocalCluster for testing.
 *
 * @param args optional; args[0] = topology name for cluster submission
 * @throws Exception propagated from StormSubmitter / LocalCluster submission
 */
public static void main(String[] args) throws Exception {
    // Build the topology graph.
    TopologyBuilder builder = new TopologyBuilder();
    // Spout: produces raw sentences (no explicit parallelism).
    builder.setSpout("wordcount_spout", new WordCountSpout());
    // First bolt: splits sentences into words; shuffle grouping balances load.
    builder.setBolt("wordcount_split_bolt", new WordCountSplitBolt())
            .shuffleGrouping("wordcount_spout");
    // Second bolt: counts words; fieldsGrouping on "word" guarantees the same
    // word is always routed to the same counter task, so counts stay correct.
    builder.setBolt("wordcount_count_bolt", new WordCountTotalBolt())
            .fieldsGrouping("wordcount_split_bolt", new Fields("word"));

    StormTopology wc = builder.createTopology();
    Config conf = new Config();

    if (args.length > 0) {
        // Cluster mode: submit under the name given on the command line.
        StormSubmitter.submitTopology(args[0], conf, wc);
    } else {
        // Local mode: run for a short while, then shut down cleanly instead of
        // leaving the in-process cluster's daemon threads alive forever.
        LocalCluster localCluster = new LocalCluster();
        localCluster.submitTopology("MyStormWordCount", conf, wc);
        Utils.sleep(30000);
        localCluster.killTopology("MyStormWordCount");
        localCluster.shutdown();
    }
}
//采集数据spout组件
// Data-source spout: feeds the topology a single fixed sample sentence.
public static class WordCountSpout extends BaseRichSpout {
    // Hard-coded sample data standing in for a real stream.
    private String[] sentences = {
        "Apache Storm is a free and open source distributed realtime computation system.Apache Storm makes it easy to reliably process unbounded streams of data,doing for realtime processing what Hadoop did for batch processing."
    };
    // Output stream handle supplied by the framework in open().
    private SpoutOutputCollector outputCollector;

    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        // Called once when this spout task is initialized; keep the collector.
        this.outputCollector = collector;
    }

    @Override
    public void nextTuple() {
        // Invoked repeatedly by the Storm framework; each call emits one sentence.
        System.out.println("采集的数据是:" + sentences[0]);
        // Hand the sentence to the downstream component.
        this.outputCollector.emit(new Values(sentences[0]));
        // Sleep for a very long time so the sentence is effectively emitted once.
        Utils.sleep(300000000);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Downstream bolts read the emitted tuple via the "sentence" field.
        declarer.declare(new Fields("sentence"));
    }
}
// Splitting bolt: turns each incoming sentence into (word, 1) tuples.
public static class WordCountSplitBolt extends BaseRichBolt {
    // Output stream of this bolt, supplied by the framework in prepare().
    private OutputCollector collector;

    @Override
    public void execute(Tuple tuple) {
        // Read the sentence emitted by the spout.
        String str = tuple.getStringByField("sentence");
        // Replace the punctuation that glues words together with spaces,
        // then trim the ends.
        str = str.replace(",", " ").replace(".", " ").trim();
        // Split on runs of whitespace ("\\s+") instead of a single space:
        // punctuation replacement can produce consecutive spaces, and a
        // plain split(" ") would then emit empty-string "words".
        for (String word : str.split("\\s+")) {
            if (!word.isEmpty()) {
                // Send each word with an initial count of 1 to the counter bolt.
                this.collector.emit(new Values(word, 1));
            }
        }
    }

    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        // Called once at initialization; the collector is this bolt's output stream.
        this.collector = collector;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Each emitted tuple carries a word and its (partial) count.
        declarer.declare(new Fields("word", "count"));
    }
}
//统计单词个数
// Counting bolt: maintains running word totals and writes them to a.txt.
public static class WordCountTotalBolt extends BaseRichBolt {
    private OutputCollector collector;
    // In-memory running totals, keyed by word (one map per bolt task;
    // fieldsGrouping upstream keeps each word on a single task).
    private Map<String, Integer> result = new HashMap<>();

    @Override
    public void execute(Tuple tuple) {
        // Unpack the (word, count) tuple from the splitting bolt.
        String word = tuple.getStringByField("word");
        int count = tuple.getIntegerByField("count");
        // Accumulate: add count to the existing total, or start at count.
        result.merge(word, count, Integer::sum);
        // Print the current totals.
        System.out.println("result" + result);
        String a = "result" + result;
        // Rewrite a.txt with the latest totals. try-with-resources closes the
        // stream each time — the original leaked one FileOutputStream per tuple.
        try (PrintStream ps = new PrintStream(new FileOutputStream(new File("a.txt")))) {
            ps.append(a);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        // Forward the updated (word, total) pair downstream.
        this.collector.emit(new Values(word, result.get(word)));
    }

    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        // Called once at initialization; keep the output stream handle.
        this.collector = collector;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Each emitted tuple carries a word and its cumulative count.
        declarer.declare(new Fields("word", "count"));
    }
}
}
// Storm WordCount example (Storm 的 WordCount 案例)
// Originally published 2024-08-23 15:54:50