准备
前面一篇已经介绍了最基础的 word_count程序。
现在有一个新的需求,希望每一次spout发送的数据流在发送失败后能够重新发送
编码
SentenceSpout
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
/**
* Created by chenhong on 16/1/25.
*/
/**
 * Spout that repeatedly emits one sentence per tuple and supports reliable
 * (at-least-once) delivery: each emitted tuple is tagged with a random UUID
 * message id and remembered in {@code pending} until Storm acks it.
 */
public class SentenceSpout extends BaseRichSpout {
    private SpoutOutputCollector collector;
    // In-flight tuples keyed by message id; a failed id is looked up here and re-emitted.
    private ConcurrentHashMap<UUID, Values> pending;
    private String[] sentences = {
            "my dog has fleas",
            "i like cold beverages",
            "the dog ate my homework",
            "don't hava a cow man ",
            "i don't think i like fleas"
    };
    // Shared counter handing out a distinct id to each spout instance (executor).
    private static AtomicLong count = new AtomicLong();
    private long ID = 0;
    private int index = 0;

    /**
     * Declares that this spout emits a single stream whose tuples contain
     * one field named "sentence".
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("sentence"));
    }

    /**
     * Called once when the spout is initialized.
     *
     * @param config    the Storm configuration map
     * @param context   provides information about components in the topology
     * @param collector used to emit tuples
     */
    @Override
    public void open(Map config, TopologyContext context, SpoutOutputCollector collector) {
        this.ID = count.addAndGet(1);
        this.collector = collector;
        this.pending = new ConcurrentHashMap<>();
    }

    /**
     * Called by Storm to request the next tuple. Emits the current sentence
     * with a fresh UUID message id (enabling ack/fail tracking) and records it
     * in {@code pending} before emitting, so a fast fail cannot race the put.
     */
    @Override
    public void nextTuple() {
        Values values = new Values(sentences[index]);
        UUID msgId = UUID.randomUUID();
        this.pending.put(msgId, values);
        this.collector.emit(values, msgId);
        index++;
        if (index >= sentences.length) {
            index = 0;
        }
        Utils.waitForMillis(1);
    }

    /** The tuple tree completed successfully; stop tracking it. */
    @Override
    public void ack(Object msgId) {
        this.pending.remove(msgId);
    }

    /**
     * The tuple tree failed (or timed out); re-emit the original values with
     * the same message id so it is retried. Guard against an unknown/already
     * acked id so we never emit a null value list.
     */
    @Override
    public void fail(Object msgId) {
        Values values = this.pending.get(msgId);
        if (values != null) {
            this.collector.emit(values, msgId);
        }
    }
}
该SentenceSpout维护了一个ConcurrentHashMap,它保存了所有spout已发送但尚未确认的数据流。当spout确认某一个UUID的数据流被下游成功接收之后,会调用
public void ack(Object msgId) {
this.pending.remove(msgId);
}
msgId表示数据流的UUID,该方法会将发送成功的数据流从ConcurrentHashMap中移除。反之,如果spout认为该UUID发送失败,会调用
public void fail(Object msgId) {
this.collector.emit(this.pending.get(msgId),msgId);
}
该方法会重新发送该数据流。
SplitSentenceBolt
该SplitSentenceBolt用来接收SentenceSpout发送的数据流,如果该Bolt 主动调用
this.collector.ack(tuple);
表示成功接收了该数据流,此时SentenceSpout的public void ack(Object msgId)方法会被调用。
还需要注意一点,如果在SplitSentence通过OutputCollector.emit(oldTuple, newTuple)这样调用来发射tuple(在storm中称之为anchoring),那么在SplitsentenceBolt中ack还不够,还需要在SplitsentenceBolt后面的bolt也ack。
OutputCollector.emit(oldTuple, newTuple)表示newTuple与oldTuple建立了关系(anchored),那么当newTuple发送失败后,spout会重发oldTuple。还不了解可以看文档
http://storm.apache.org/releases/current/Guaranteeing-message-processing.html
完整代码:
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import java.util.Map;
/**
* Created by chenhong on 16/1/25.
*/
public class SplitSentenceBolt extends BaseRichBolt{
private OutputCollector collector;
/*
bolt初始化时调用
*/
public void prepare(Map config ,TopologyContext context,OutputCollector collector){
this.collector = collector;
}
/*
声明每个 tuple包含一个字段 word
*/
public void declareOutputFields(OutputFieldsDeclarer declarer){
declarer.declare(new Fields("word"));
}
/*
每当从订阅的数据流中接收一个tuple,都会调用这个方法
*/
public void execute(Tuple tuple){
String sentence = tuple.getStringByField("sentence");
String[] words = sentence.split(" ");
for(String word : words){
//如果OutputCollector.emit(oldTuple, newTuple)这样调用来发射tuple(在storm中称之为anchoring),
// 那么后面的bolt的ack/fail会影响spout的ack/fail, 如果collector.emit(newTuple)这样来发射tuple(在storm称之为unanchoring),
// 则相当于断开了后面bolt的ack/fail对spout的影响.spout将立即根据当前bolt前面的ack/fail的情况来决定调用spout的ack/fail.
// 所以某个bolt后面的bolt的成功失败对你来说不关心, 你可以直接通过这种方式来忽略
this.collector.emit(tuple,new Values(word));
//this.collector.emit(new Values(word));
}
this.collector.ack(tuple);
}
}
WordCountBolt
wordCountBolt没有变化,如果SplitSentenceBolt也要确保每一条消息都发送成功,可以仿效spout维护一个map。
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import java.util.HashMap;
import java.util.Map;
/**
* Created by chenhong on 16/1/25.
*/
public class WordCountBolt extends BaseRichBolt{
private OutputCollector collector;
private HashMap<String,Long> counts = null;
public void prepare(Map config , TopologyContext context,OutputCollector collector){
this.collector = collector;
this.counts = new HashMap<String, Long>();
}
public void execute(Tuple tuple){
String word = tuple.getStringByField("word");
Long count = this.counts.get(word);
if(count ==null){
count =0L;
}
count++;
this.counts.put(word,count);
this.collector.emit(new Values(word,count));
}
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("word","count"));
}
}
ReportBolt
reportBolt是最后一个bolt,因此不需要声明 field
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;
import java.util.*;
/**
* Created by chenhong on 16/1/25.
*/
public class ReportBolt extends BaseRichBolt {
private HashMap<String,Long> counts =null;
public void prepare(Map config, TopologyContext context, OutputCollector collector){
this.counts = new HashMap<String, Long>();
}
public void execute(Tuple tuple){
String word = tuple.getStringByField("word");
Long count = tuple.getLongByField("count");
this.counts.put(word,count);
}
/*
该bolt位于末端,所以declareOutputFields为空
**/
public void declareOutputFields(OutputFieldsDeclarer declarer){
}
/*
cleanup方法用来释放bolt占用的资源
*/
public void cleanup(){
System.out.println("--- FINAL COUNTS ---");
List<String> keys = new ArrayList<String>();
keys.addAll(this.counts.keySet());
Collections.sort(keys);
for(String key: keys){
System.out.println(key+" : "+this.counts.get(key));
}
}
}
WordCountTopology
topology的用法与之前相比多了一些特性,比如单个spout发送的数据量太少时,想要设置2个executor去执行该spout,可以通过以下方式;默认一个executor执行一个task任务
builder.setSpout(SENTENCE_SPOUT_ID, spout, 2);
可以设置一个executor执行多个task任务,比如设置bolt运行4个task在2个executor中(每个executor运行2个task)
builder.setBolt(SPILL_BOLT_ID,splitBolt,2)
.setNumTasks(4)
.shuffleGrouping(SENTENCE_SPOUT_ID);
完整代码如下:
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
/**
* Created by chenhong on 16/1/25.
*/
/**
 * Wires the reliable word-count topology together: spout -> split -> count -> report.
 * Runs on a LocalCluster when no command-line argument is given, otherwise
 * submits to a remote cluster under the name passed as the first argument.
 */
public class WordCountTopology {
    private static final String SENTENCE_SPOUT_ID = "sentence-spout";
    private static final String SPILL_BOLT_ID = "split-bolt";
    private static final String COUNT_BOLT_ID = "count-bolt";
    private static final String REPORT_BOLT_ID = "report-bolt";
    private static final String TOPOLOGY_NAME = "word-count-topology";

    public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException {
        SentenceSpout sentenceSpout = new SentenceSpout();
        SplitSentenceBolt sentenceSplitter = new SplitSentenceBolt();
        WordCountBolt wordCounter = new WordCountBolt();
        ReportBolt reporter = new ReportBolt();

        TopologyBuilder topologyBuilder = new TopologyBuilder();

        // Register the sentence spout under its unique id; the 2 asks for two
        // executors (dropping it to 1 roughly halves the final counts).
        topologyBuilder.setSpout(SENTENCE_SPOUT_ID, sentenceSpout, 2);

        // The split bolt subscribes to the spout; shuffleGrouping spreads the
        // spout's tuples evenly and randomly across its instances. Two
        // executors run setNumTasks(4) tasks, i.e. two tasks per executor.
        topologyBuilder.setBolt(SPILL_BOLT_ID, sentenceSplitter, 2)
                .setNumTasks(4)
                .shuffleGrouping(SENTENCE_SPOUT_ID);

        // fieldsGrouping routes every tuple with the same "word" value to the
        // same WordCountBolt instance.
        topologyBuilder.setBolt(COUNT_BOLT_ID, wordCounter, 4)
                .fieldsGrouping(SPILL_BOLT_ID, new Fields("word"));

        // globalGrouping funnels all count tuples into the single report task.
        topologyBuilder.setBolt(REPORT_BOLT_ID, reporter).globalGrouping(COUNT_BOLT_ID);

        // Topology-wide configuration, handed to every spout's open() and
        // every bolt's prepare().
        Config config = new Config();
        config.setNumWorkers(2);

        // Command-line argument decides between local simulation and remote submission.
        if (args.length == 0) {
            // LocalCluster simulates a full Storm cluster in-process for development.
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology(TOPOLOGY_NAME, config, topologyBuilder.createTopology());
            Utils.waitForSeconds(10);
            cluster.killTopology(TOPOLOGY_NAME);
            cluster.shutdown();
        } else {
            StormSubmitter.submitTopology(args[0], config, topologyBuilder.createTopology());
        }
    }
}
运行结果
--- FINAL COUNTS ---
a : 2688
ate : 2688
beverages : 2689
cold : 2689
cow : 2688
dog : 5377
don't : 5375
fleas : 5377
has : 2689
hava : 2688
homework : 2688
i : 8063
like : 5376
man : 2688
my : 5377
the : 2688
think : 2687