1、功能说明
设计一个topology,用于统计文档中各个单词出现的频率。
整个topology分为三个部分:
- RandomSentenceSpout:数据源,从一组预置的英文句子中随机选取一条并发出。
- SplitSentenceBolt:负责将单行文本记录(句子)切分成单词
- WordCountBolt:负责对单词的频率进行累加
2、Storm程序驱动类
/**
 * Driver class: wires the word-count topology together and submits it.
 *
 * <p>Data flow: {@code RandomSentenceSpout} -> {@code SplitSentenceBolt}
 * -> {@code WordCountBolt}.
 */
public class WordCountTopology {
    /**
     * Builds the topology and submits it under the name {@code "word-count"}.
     *
     * @param args command-line arguments (not used)
     * @throws Exception propagated from {@code StormSubmitter.submitTopology}
     */
    public static void main(String[] args) throws Exception {
        System.out.println("开始构造topology");
        // Assemble the topology.
        TopologyBuilder builder = new TopologyBuilder();
        String spoutId = "s_words";
        String splitBoltId = "b_split";
        String countBoltId = "b_count";
        // Spout: component id, spout instance, number of executors.
        builder.setSpout(spoutId, new RandomSentenceSpout(), 2);
        // Split bolt: component id, bolt instance, number of executors;
        // subscribes to the spout with shuffle grouping.
        builder.setBolt(splitBoltId, new SplitSentenceBolt(), 4).shuffleGrouping(spoutId);
        // Count bolt: component id, bolt instance, number of executors;
        // subscribes to the split bolt (not the spout) with a fields grouping on "word".
        // Fix: the counting bolt class is WordCountBolt; the original referenced a
        // non-existent CountWordBolt.
        builder.setBolt(countBoltId, new WordCountBolt(), 4).fieldsGrouping(splitBoltId, new Fields("word"));
        // Runtime configuration.
        Config conf = new Config();
        conf.setDebug(true);
        // Number of worker processes.
        conf.setNumWorkers(3);
        // Submit the topology.
        String topologyName = "word-count";
        StormSubmitter.submitTopology(topologyName, conf, builder.createTopology());
        System.out.println("提交topology");
    }
}
3、RandomSentenceSpout的实现及生命周期
/**
 * Spout that, on every {@link #nextTuple()} call, picks one sentence at random
 * from a fixed list of five English sentences and emits it as a one-field tuple.
 *
 * <p>Tuples are emitted without a message id and {@link #ack(Object)} /
 * {@link #fail(Object)} are empty, so this spout does nothing on ack or fail.
 */
public class RandomSentenceSpout extends BaseRichSpout {
    private static final long serialVersionUID = -305466827631750450L;

    SpoutOutputCollector _collector;
    Random _rand;

    /**
     * Stores the collector and creates the random generator used to pick sentences.
     */
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this._rand = new Random();
        this._collector = collector;
    }

    /**
     * Sleeps 100 ms, then emits one randomly chosen sentence.
     */
    @Override
    public void nextTuple() {
        Utils.sleep(100);
        String[] rawTexts = {
            "the cow jumped over the moon",
            "an apple a day keeps the doctor away",
            "four score and seven years ago",
            "snow white and the seven dwarfs",
            "i am at two with nature"
        };
        // Every candidate is passed through the sentence() hook, in list order.
        String[] candidates = new String[rawTexts.length];
        for (int i = 0; i < rawTexts.length; i++) {
            candidates[i] = sentence(rawTexts[i]);
        }
        int pick = _rand.nextInt(candidates.length);
        String chosen = candidates[pick];
        System.out.println("发出 tuple: " + chosen);
        _collector.emit(new Values(chosen));
    }

    /**
     * Hook applied to each candidate sentence; this implementation returns the input unchanged.
     *
     * @param input the candidate sentence
     * @return the sentence to use
     */
    protected String sentence(String input) {
        return input;
    }

    /** Intentionally empty: no ack handling is performed. */
    @Override
    public void ack(Object id) {
    }

    /** Intentionally empty: no fail handling is performed. */
    @Override
    public void fail(Object id) {
    }

    /**
     * Declares a single output field.
     * NOTE(review): the field is named "word" although the emitted value is a whole sentence.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        Fields outputFields = new Fields("word");
        declarer.declare(outputFields);
    }
}
4、SplitSentenceBolt的实现及生命周期
/**
 * Bolt that splits each incoming sentence on single spaces and emits one tuple
 * per word, anchored to the input tuple.
 */
public class SplitSentenceBolt extends BaseRichBolt {
    private static final long serialVersionUID = -4052979219004386147L;

    private OutputCollector _collector;

    /**
     * Stores the collector used later to emit and ack tuples.
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        _collector = collector;
    }

    /**
     * Splits field 0 of the input on {@code " "}, emits each non-empty token as a
     * one-field tuple anchored to {@code input}, then acks {@code input}.
     *
     * @param input tuple whose first value is the sentence to split
     */
    @Override
    public void execute(Tuple input) {
        System.out.println("收到 tuple: " + input);
        String[] words = input.getValue(0).toString().split(" ");
        for (String word : words) {
            // An empty sentence or consecutive spaces yield empty tokens; they are not words.
            if (word.isEmpty()) {
                continue;
            }
            List<Object> outputTuple = new Values(word);
            System.out.println("发出 tuple: " + outputTuple);
            _collector.emit(input, outputTuple);
        }
        // BaseRichBolt does not ack automatically; the original never acked, so a
        // tracked tuple tree could never complete.
        _collector.ack(input);
    }

    /**
     * Declares the output field; it must match the tuples emitted in {@link #execute(Tuple)}.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }
}
5、WordCountBolt的实现及生命周期
/**
 * Terminal bolt that keeps an in-memory count per word and prints the running
 * count for every tuple it receives. It emits nothing downstream.
 */
public class WordCountBolt extends BaseBasicBolt {
    private static final long serialVersionUID = 5406465393295822560L;

    Map<String, Integer> counts = new HashMap<String, Integer>();

    /**
     * Increments the count for the word in field 0 of the tuple and prints the new count.
     *
     * @param tuple     input tuple; field 0 is read as the word
     * @param collector not used, since this bolt emits no tuples
     */
    @Override
    public void execute(Tuple tuple, BasicOutputCollector collector) {
        String word = tuple.getString(0);
        Integer previous = counts.get(word);
        // A word seen for the first time starts at 1.
        int updated = (previous == null) ? 1 : previous + 1;
        counts.put(word, updated);
        System.out.println("word count: " + word + ":" + updated);
    }

    /**
     * Declares no output fields, because this bolt does not emit tuples.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    }

    /**
     * Prints all counts sorted by word, between a start and an end banner.
     * NOTE(review): intended to run when the topology is killed; whether Storm
     * guarantees this call on a real cluster should be confirmed.
     */
    @Override
    public void cleanup() {
        System.out.println("-----------------最终结果开始-----------------------");
        List<String> sortedWords = new ArrayList<String>(this.counts.keySet());
        Collections.sort(sortedWords);
        for (int i = 0; i < sortedWords.size(); i++) {
            String key = sortedWords.get(i);
            System.out.println(key + " : " + this.counts.get(key));
        }
        System.out.println("-----------------最终结果结束-----------------------");
    }
}