- 流程图如下:
2. 编写SentenceSpout
package com.ibeifeng.bigdata.storm.topo;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
import java.util.Random;
/**
* Spout开发
* Created by ad on 2016/12/11.
*/
//public class SentenceSpout implements IRichSpout{
public class SentenceSpout extends BaseRichSpout {

    private static final Logger logger = LoggerFactory.getLogger(SentenceSpout.class);

    /**
     * Tuple emitter handed to us by the framework in open().
     */
    private SpoutOutputCollector collector;

    /**
     * Reused random source. Created once in open() instead of allocating a
     * new Random on every nextTuple() call (re-seeding per call is wasteful
     * and can hurt randomness when calls are close together in time).
     */
    private Random random;

    private static final String[] SENTENCES = {
            "hadoop yarn mapreduce spark",
            "flume hadoop hive spark",
            "oozie yarn spark storm",
            "storm yarn mapreduce error",
            "error flume storm spark"
    };

    /**
     * Declares the field names (tuple keys) this component emits, in order.
     * Downstream bolts read the value back via getStringByField("sentence").
     *
     * @param declarer output fields declarer
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("sentence"));
    }

    /**
     * Component-specific configuration overrides; none needed here.
     *
     * @return null (no special configuration)
     */
    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }

    /**
     * Spout initialization hook. Called once when the SentenceSpout task
     * instance is created.
     *
     * @param conf      topology configuration
     * @param context   task/topology context
     * @param collector emitter used to send tuples downstream
     */
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        // Keep the framework-provided emitter in an instance field.
        this.collector = collector;
        this.random = new Random();
    }

    /**
     * Called before this spout is shut down, but execution is NOT guaranteed:
     * in cluster mode the supervisor may kill -9 the worker process, in which
     * case close() never runs. In local mode, a graceful stop (not kill -9)
     * does run close().
     */
    @Override
    public void close() {
        // Cleanup work would go here.
    }

    /**
     * Called when the spout is (re)activated.
     */
    @Override
    public void activate() {
    }

    /**
     * Called when the spout is temporarily deactivated.
     */
    @Override
    public void deactivate() {
    }

    /**
     * Core spout method, invoked in a loop by the framework:
     * 1) fetch data from the source (simulated here by a random pick),
     * 2) optionally pre-process it,
     * 3) wrap it in a tuple (values only; field names come from
     *    declareOutputFields) and emit it to downstream bolts.
     */
    @Override
    public void nextTuple() {
        // Randomly pick one sentence, simulating a read from a data source.
        String sentence = SENTENCES[random.nextInt(SENTENCES.length)];
        if (sentence.contains("error")) {
            logger.error("记录有问题:" + sentence);
        } else {
            // Wrap the sentence in a tuple and emit it.
            this.collector.emit(new Values(sentence));
        }
        try {
            Thread.sleep(10000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the worker can shut down promptly;
            // swallowing the interrupt would hide the shutdown request.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Called when the tuple identified by msgId has been fully processed.
     *
     * @param msgId unique id of the acknowledged tuple
     */
    @Override
    public void ack(Object msgId) {
    }

    /**
     * Counterpart of ack(): called when processing of the tuple failed.
     *
     * @param msgId unique id of the failed tuple
     */
    @Override
    public void fail(Object msgId) {
    }
}
- 编写SplitBolt
package com.ibeifeng.bigdata.storm.topo;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichBolt;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import java.util.Map;
/**
* Bolt开发
* Created by ad on 2016/12/11.
*/
public class SplitBolt implements IRichBolt {

    /**
     * Tuple emitter for this bolt, also used to ack processed tuples.
     */
    private OutputCollector collector;

    /**
     * Bolt initialization hook, called once when the task is created.
     *
     * @param stormConf topology configuration
     * @param context   task/topology context
     * @param collector emitter used to send tuples downstream and to ack
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    /**
     * Invoked once per tuple received from the upstream component.
     *
     * Splits the incoming sentence on spaces and emits one tuple per word.
     *
     * @param input tuple carrying a "sentence" field
     */
    @Override
    public void execute(Tuple input) {
        String sentence = input.getStringByField("sentence");
        if (sentence != null && !"".equals(sentence)) {
            String[] words = sentence.split(" ");
            for (String word : words) {
                this.collector.emit(new Values(word));
            }
        }
        // BUG FIX: a bolt implementing IRichBolt directly (unlike BaseBasicBolt)
        // does NOT ack automatically. Without this explicit ack, every input
        // tuple would eventually time out and be replayed when acking is enabled.
        this.collector.ack(input);
    }

    /**
     * Called when the bolt is shut down; should release open resources.
     * Execution is not guaranteed by the cluster (e.g. if the task's machine
     * goes down the method is never invoked). It is mainly intended for
     * local mode, to avoid resource leaks when a topology is killed.
     */
    @Override
    public void cleanup() {
    }

    /**
     * Declares the single output field "word". The number of values passed
     * to emit() in execute() must match the number of fields declared here,
     * otherwise Storm fails with: Tuple created with wrong number of fields.
     *
     * @param declarer output fields declarer
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
        return null;
    }
}
- 编写CountBolt
package com.ibeifeng.bigdata.storm.topo;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import java.util.HashMap;
import java.util.Map;
/**
* 单词计数
* Created by ad on 2016/12/11.
*/
public class CountBolt extends BaseRichBolt {

    /**
     * Running total per word. Task-local state, rebuilt on every restart.
     */
    private Map<String, Integer> counts;

    /**
     * Tuple emitter for this bolt.
     */
    private OutputCollector collector;

    /**
     * Bolt initialization hook, called once when the task is created.
     *
     * @param stormConf topology configuration
     * @param context   task/topology context
     * @param collector emitter used to send tuples downstream
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
        this.counts = new HashMap<>();
    }

    /**
     * Increments the counter for the incoming word and emits the updated
     * (word, count) pair downstream.
     *
     * @param input tuple carrying a "word" field
     */
    @Override
    public void execute(Tuple input) {
        String word = input.getStringByField("word");
        // Single lookup instead of containsKey + get: null means first sighting.
        Integer previous = counts.get(word);
        int updated = (previous == null) ? 1 : previous + 1;
        counts.put(word, updated);
        collector.emit(new Values(word, updated));
    }

    /**
     * Declares the two output fields carried by each emitted tuple.
     *
     * @param declarer output fields declarer
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word", "count"));
    }
}
- 编写PrintBolt
package com.ibeifeng.bigdata.storm.topo;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;
import java.util.Map;
/**
* Created by ad on 2016/12/11.
*/
public class PrintBolt extends BaseRichBolt {

    /**
     * Terminal bolt: no emitter is kept because nothing is sent downstream.
     *
     * @param stormConf topology configuration
     * @param context   task/topology context
     * @param collector unused — this bolt is a sink
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
    }

    /**
     * Prints each received (word, count) pair to stderr.
     *
     * @param input tuple carrying "word" and "count" fields
     */
    @Override
    public void execute(Tuple input) {
        final String word = input.getStringByField("word");
        final Integer count = input.getIntegerByField("count");
        System.err.println("单词:" + word + ", ----> 累计出现次数:" + count);
    }

    /**
     * Declares no fields: this bolt never emits tuples.
     *
     * @param declarer output fields declarer (unused)
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    }
}
6.编写测试程序WordCountTopology
package com.ibeifeng.bigdata.storm.topo;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
/**
* wordcountTopology
* Created by ad on 2016/12/11.
*/
public class WordCountTopology {
private static final String SPOUT_ID = "sentenceSpout";
private static final String SPLIT_BOLT = "splitBolt";
private static final String COUNT_BOLT = "countBolt";
private static final String PRINT_BOLT = "printBolt";
public static void main(String[] args) {
// 构造Topology
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout(SPOUT_ID,new SentenceSpout()); // 指定 Spout
// 指定 SentenceSpout 向SplitBolt发射tuple 随机分组
builder.setBolt(SPLIT_BOLT, new SplitBolt()) //.localOrShuffleGrouping(SPOUT_ID)
.shuffleGrouping(SPOUT_ID);
builder.setBolt(COUNT_BOLT, new CountBolt()).fieldsGrouping(SPLIT_BOLT, new Fields("word"));
builder.setBolt(PRINT_BOLT, new PrintBolt())
.globalGrouping(COUNT_BOLT); // 全局分组
Config conf = new Config();
if(args == null || args.length == 0){
// 本地执行
LocalCluster localCluster = new LocalCluster();
localCluster.submitTopology("wordcount", conf ,builder.createTopology());
}else{
// 提交到集群上执行
conf.setNumWorkers(1); // 指定使用多少个进程来执行该Topology
try {
StormSubmitter.submitTopology(args[0],conf, builder.createTopology());
} catch (AlreadyAliveException e) {
e.printStackTrace();
} catch (InvalidTopologyException e) {
e.printStackTrace();
}
}
}
}