package org.apache.storm;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
import java.util.Map;
import java.util.Random;
//定义一个随机句子发射组件RandomSentenceSpout,它继承于BaseRichSpout
public class RandomSentenceSpout extends BaseRichSpout
{
//创建一个消息发射器(输出集合)
SpoutOutputCollector _collector;
//创建一个随机数发生器
Random _rand;
//重写第一个方法open,里面接收了三个参数,第一个是创建Topology时的配置,第二个是所有的Topology数据,包括拓扑中该任务的位置信息、任务ID等,第三个数据发射集合是用来把Spout的数据发射给bolt。
public void open(Map conf, TopologyContext context, SpoutOutputCollector collector)
{
//初始化发射器
_collector = collector;
//初始化随机数发生器
_rand = new Random();
}
//接下来重写Spout最主要的方法nextTuple,
public void nextTuple()
{
// 这个方法会不断被调用,为了降低它对CPU的消耗,当任务完成时让它sleep一下
Utils.sleep(100);
//接下来定义一个字符串数组,随意写几个句子让它发射
String[] sentences = new String[]{ "the cow jumped over the moon", "an apple a day keeps the doctor away",
"four score and seven years ago", "snow white and the seven dwarfs", "i am at two with nature" };
String sentence = sentences[_rand.nextInt(sentences.length)]; //每次随机挑选句子给sentences用来发射给下一个组件
_collector.emit(new Values(sentence)); //发射
}
/*这个函数是Storm来确定具有id标识符的此喷口发出的元组已经被完全处理。*/
public void ack(Object id)
{
System.out.println("ok:"+id);
}
//由id标识符的此喷口发出的元组未能完全处理。
public void fail(Object id)
{
System.out.println("fail:"+id);
}
public void declareOutputFields(OutputFieldsDeclarer declarer)
{
declarer.declare(new Fields("word")); //声明输出字段word
}
}
// That completes the sentence-emitting spout; next come the sentence-splitting and
// word-counting components.
// NOTE(review): a SECOND top-level file effectively begins below — the following
// imports and public class belong in their own WordCountTopology.java file; two
// public classes plus mid-file imports cannot compile as a single source file.
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.ShellBolt;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class WordCountTopology
{
public static class SplitSentence extends ShellBolt implements IRichBolt
{
OutputCollector _collector; //创建消息发射器(输出集合)
public void prepare(Map conf, TopologyContext context, OutputCollector collector)
{
_collector = collector; //初始化
}
//我们要在execute函数里实现把句子切成一个个单词的功能
public void execute(Tuple input)
{
//首先定义一个字符串存放输入的句子
String sentence=input.getString(0);
//调用切割函数,以空格为切割标识
String[] words = sentence.split("");
for(String word : words)
{
//返回一个字符串,其值是此字符串,并删除任何前导和尾随空格
word=word.trim();
if(!word.isEmpty())
{
//将所有字符转换成小写
word=word.toLowerCase();
List a = new ArrayList();
a.add(input);
//切完后把单词发射到下一个Bolt进行计数
_collector.emit(a, new Values(word));
}
}
_collector.ack(input); //确认成功处理一个tuple
}
public void declareOutputFields(OutputFieldsDeclarer declarer)
{
declarer.declare(new Fields("word")); //声明字段
}
//这个函数是声明该组件特定的配置
public Map<String, Object> getComponentConfiguration()
{
return null;
}
}
//接下来是接受上一个Bolt发来的单词然后开始计数
public static class WordCount extends BaseBasicBolt
{
//首先创建一个Map集合存放单词和它对应的出现次数
Map<String, Integer> counts = new HashMap<String, Integer>();
//接下来开始重写execute函数进行计数
public void execute(Tuple tuple, BasicOutputCollector collector)
{
String word = tuple.getString(0); //创建一个字符串存放单词
Integer count = counts.get(word); //创建一个整数count计数
if (count == null)
count = 0;
count++;
counts.put(word, count); //把单词和对应的计数放到Map集合中
collector.emit(new Values(word, count));
}
public void declareOutputFields(OutputFieldsDeclarer declarer)
{
//声明字段,用于传输识别
declarer.declare(new Fields("word", "count"));
}
}
//接下来就是构建拓扑提交器和提交拓扑了,和上个例子一样
public static void main(String[] args) throws Exception
{
//创建拓扑提交器
TopologyBuilder builder = new TopologyBuilder();
//设置Spout组件,取名spout,并行度设置为5,分配五个线程来执行它
builder.setSpout("spout", new RandomSentenceSpout(), 5);
//设置Bolt组件,取名split,接收从spout传来的数消息,消息流分组方式为随机分组,最后分配八个线程来执行它
builder.setBolt("split", new SplitSentence(), 8).shuffleGrouping("spout");
//设置第二个计数Bolt组件,取名count,接收从spout传来的数消息,消息流分组方式为按字段分组,最后分配12个线程来执行它
builder.setBolt("count", new WordCount(), 12).fieldsGrouping("split", new Fields("word"));
Config conf = new Config(); //获取配置信息并设置为自动调整模式
conf.setDebug(true);
if (args != null && args.length > 0)
{
conf.setNumWorkers(3);
StormSubmitter.submitTopologyWithProgressBar(args[0], conf, builder.createTopology());
}
else {
conf.setMaxTaskParallelism(3);
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("word-count", conf, builder.createTopology());
Thread.sleep(10000);
cluster.shutdown();
}
}
}
// (Blog-scrape residue, kept as a comment so it does not break parsing:)
// "Storm WordCount source code walkthrough" — latest recommended article
// published 2019-09-01 22:42:52.