单词拓扑的结构:
一. spout和bolt部分
准备阶段:生成语句
public class SentenceSpout extends BaseRichSpout {
    // Emits one sentence per nextTuple() call, cycling endlessly
    // through the fixed list below.
    private SpoutOutputCollector collector;
    private String[] sentences = {
            "my dog has fleas",
            "i like cold beverages",
            "the dog ate my homework",
            "don't have a cow man",
            "i don't think i like fleas"
    };
    private int index = 0;

    // Every Storm component implements this method; it tells Storm
    // which fields this component's tuples will carry.
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("sentence"));
    }

    // Called once when the spout is initialized on a worker.
    public void open(Map config,                      // Storm configuration
                     TopologyContext context,         // component/topology info
                     SpoutOutputCollector collector   // handle used to emit tuples
    ) {
        this.collector = collector;
    }

    // The core of every spout: Storm calls this repeatedly to pull tuples
    // into the topology.
    public void nextTuple() {
        String sentence = sentences[index];
        index = (index + 1) % sentences.length;   // wrap around the list
        this.collector.emit(new Values(sentence));
        Utils.waitForMillis(1);   // throttle so the example stays readable
    }
}
第一阶段:语句分割:
public class SplitSentenceBolt extends BaseRichBolt {
    private OutputCollector collector;

    // Called when the bolt is initialized; the usual place to prepare
    // resources the bolt needs, e.g. a database connection.
    public void prepare(Map config, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    // Core logic: invoked once for every tuple received from the subscribed
    // stream. Splits the sentence on spaces and emits each word separately.
    public void execute(Tuple tuple) {
        String sentence = tuple.getStringByField("sentence");
        for (String token : sentence.split(" ")) {
            this.collector.emit(new Values(token));
        }
    }

    // Declares one output stream whose tuples carry a single "word" field;
    // other bolts in the topology can subscribe to it for further processing.
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }
}
第二阶段:单词计数
public class WordCountBolt extends BaseRichBolt {
    private OutputCollector collector;
    private HashMap<String, Long> counts = null;

    // The map is created here rather than in a constructor because Storm
    // serializes components before shipping them to the cluster.
    public void prepare(Map config, TopologyContext context,
                        OutputCollector collector) {
        this.collector = collector;
        this.counts = new HashMap<String, Long>();
    }

    // Bumps the running count for the received word and emits the updated
    // (word, count) pair downstream.
    public void execute(Tuple tuple) {
        String word = tuple.getStringByField("word");
        Long previous = this.counts.get(word);
        long updated = (previous == null) ? 1L : previous + 1L;
        this.counts.put(word, updated);
        this.collector.emit(new Values(word, updated));
    }

    // Output tuples carry the word plus its current count.
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word", "count"));
    }
}
大部分实例变量通常是在prepare()方法中进行实例化,这个设计模式是由topology的部署方式决定的。当topology发布时,所有的bolt和spout组件首先会进行序列化,然后通过网络发送到集群中。如果spout或者bolt在序列化之前(比如说在构造函数中生成)实例化了任何无法序列化的实例变量,在进行序列化时会抛出NotSerializableException异常,topology就会部署失败。
最后一阶段:上报bolt
public class ReportBolt extends BaseRichBolt {
    private HashMap<String, Long> counts = null;

    // Instance state is built here (not in a constructor) because components
    // are serialized before deployment.
    public void prepare(Map config, TopologyContext context, OutputCollector collector) {
        this.counts = new HashMap<String, Long>();
    }

    // Records the latest count reported for each word.
    public void execute(Tuple tuple) {
        this.counts.put(tuple.getStringByField("word"), tuple.getLongByField("count"));
    }

    // This bolt is a sink: it emits nothing, so nothing to declare.
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // this bolt does not emit anything
    }

    // Called before the bolt is terminated; normally used to release the
    // bolt's resources. NOTE: when the topology runs on a real Storm cluster
    // cleanup() is NOT guaranteed to be invoked — it is only reliable in
    // local mode.
    @Override
    public void cleanup() {
        System.out.println("--- FINAL COUNTS ---");
        List<String> keys = new ArrayList<String>(this.counts.keySet());
        Collections.sort(keys);
        for (String key : keys) {
            System.out.println(key + " : " + this.counts.get(key));
        }
        System.out.println("--------------");
    }
}
二. topology部分
public class WordCountTopology {
private static final String SENTENCE_SPOUT_ID = "sentence-spout";
private static final String SPLIT_BOLT_ID = "split-bolt";
private static final String COUNT_BOLT_ID = "count-bolt";
private static final String REPORT_BOLT_ID = "report-bolt";
private static final String TOPOLOGY_NAME = "word-count-topology";
public static void main(String[] args) throws Exception {
SentenceSpout spout = new SentenceSpout();
SplitSentenceBolt splitBolt = new SplitSentenceBolt();
WordCountBolt countBolt = new WordCountBolt();
ReportBolt reportBolt = new ReportBolt();
TopologyBuilder builder = new TopologyBuilder();
//注册一个sentence spout并赋值给其唯一的id
builder.setSpout(SENTENCE_SPOUT_ID, spout);
// SentenceSpout --> SplitSentenceBolt
builder.setBolt(SPLIT_BOLT_ID, splitBolt)
.shuffleGrouping(SENTENCE_SPOUT_ID);
// SplitSentenceBolt --> WordCountBolt
builder.setBolt(COUNT_BOLT_ID, countBolt)
.fieldsGrouping(SPLIT_BOLT_ID, new Fields("word"));
// WordCountBolt --> ReportBolt
builder.setBolt(REPORT_BOLT_ID, reportBolt)
.globalGrouping(COUNT_BOLT_ID);
Config config = new Config();
LocalCluster cluster = new LocalCluster();
cluster.submitTopology(TOPOLOGY_NAME, config,
builder.createTopology());
waitForSeconds(10);
cluster.killTopology(TOPOLOGY_NAME);
cluster.shutdown();
}
三. Storm的并发机制
Storm 计算支持在多台机器上水平扩容,通过将计算切分 为多个独立的 tasks 在集群上并发执行来实现。在 Storm 中,一个 task 可以简单地理解为 在集群某节点上运行的一个 spout 或者 bolt 实例。
假设我们有一台服务器(node),为topology分配了一个worker,并且每个executor执行一个task。