I. API Overview
Storm components are usually written in one of two ways: by extending the BaseRichSpout / BaseRichBolt abstract classes, or by implementing the IRichSpout / IRichBolt interfaces.
Extending the abstract classes requires overriding fewer methods and keeps the code concise; implement the interfaces if you need to override more methods.
Spout methods (a minimal skeleton follows this list)
(1) open()
The initialization method, called when the spout task starts.
(2) close()
Called before the spout shuts down, but not guaranteed to run: it is skipped on kill -9 and executed on storm kill {topoName}.
(3) activate()
Called when the spout is re-activated after having been deactivated. nextTuple() will be called again shortly afterwards.
(4) deactivate()
Called when the spout is deactivated. While deactivated, nextTuple() is not called. The spout may or may not be re-activated later.
(5) nextTuple()
When nextTuple() is called, Storm asks the spout to emit tuples to the output collector (SpoutOutputCollector). nextTuple(), ack() and fail() are all called in a tight loop on the spout task's single thread.
(6) ack()
Callback invoked when a tuple has been processed successfully.
(7) fail()
Callback invoked when a tuple fails to be processed.
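To see how these spout methods fit together, here is a minimal BaseRichSpout sketch; the class name, field name and emitted value are illustrative and not part of the cases below:
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import java.util.Map;
public class SkeletonSpout extends BaseRichSpout {
    private SpoutOutputCollector collector;
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;           // keep the collector so nextTuple() can emit
    }
    public void nextTuple() {
        collector.emit(new Values("hello"));  // called repeatedly on the spout task's single thread
    }
    @Override
    public void ack(Object msgId) { }         // the tuple was fully processed downstream
    @Override
    public void fail(Object msgId) { }        // the tuple failed or timed out
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("line")); // field name that downstream bolts can reference
    }
}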
Bolt methods (a minimal skeleton follows this list)
(1) prepare()
Called when the bolt is initialized inside a worker process on the cluster; it provides the environment the bolt needs to execute.
(2) execute()
Receives a tuple, processes it, and may emit data to the next component.
(3) cleanup()
Called when an IBolt is about to shut down.
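A matching minimal BaseRichBolt sketch, again purely illustrative and not one of the bolts used in the cases below:
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.Map;
public class SkeletonBolt extends BaseRichBolt {
    private OutputCollector collector;
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;                // runs once when the task is initialized in a worker
    }
    public void execute(Tuple tuple) {
        String line = tuple.getString(0);          // read the incoming tuple
        collector.emit(new Values(line.length())); // optionally emit to the next component
    }
    @Override
    public void cleanup() { }                      // called when the bolt is about to shut down
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("length"));
    }
}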
二、依赖
<!-- Storm dependencies -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.1.2</version>
<!-- <scope>provided</scope> not needed when running from the IDE; uncomment when submitting to the cluster on Linux -->
</dependency>
III. Case Studies
1. Case 1: counting session_id occurrences in a log file (without deduplication)
(1) A class that generates random log data
package com.atg;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Random;
/**
* Generates simulated website log data.
*/
public class GenerateData {
public static void main(String[] args) throws Exception{
// 1. The target log file
File logFile = new File("E:/hadoop/hdpdata/logData/website.log");
// 2. Define the log content
// 2.1 Site names
String[] hosts = { "www.baidu.com" };
// 2.2 Session ids
String[] session_id = { "ABYH6Y4V4SCVXTG6DPB4VH9U123", "XXYH6YCGFJYERTT834R52FDXV9U34",
"BBYH61456FGHHJ7JL89RG5VV9UYU7", "CYYH6Y2345GHI899OFG4V9U567", "VVVYH6Y4V4SFXZ56JIPDPB4V678" };
// 2.3 Visit timestamps
String[] time = { "2017-08-07 08:40:50", "2017-08-07 08:40:51", "2017-08-07 08:40:52", "2017-08-07 08:40:53",
"2017-08-07 09:40:49", "2017-08-07 10:40:49", "2017-08-07 11:40:49", "2017-08-07 12:40:49" };
// 2.4 IP address prefix (a random last segment is appended below)
String[] ip ={"192.168.1.1"};
// 3. Assemble tab-separated log lines
Random random = new Random();
StringBuffer sb = new StringBuffer();
for(int i=0;i<500;i++){
sb.append(hosts[0]+"\t"+session_id[random.nextInt(5)]+"\t"
+time[random.nextInt(8)]+"\t"+ip[0]+random.nextInt(30)+"\n");
}
// 4. Create the file if it does not exist
if(!logFile.exists()){
try {
logFile.createNewFile();
} catch (IOException e) {
e.printStackTrace();
}
}
// 5. Write the log content
FileOutputStream fos=null;
try {
fos = new FileOutputStream(logFile);
fos.write(sb.toString().getBytes());
} catch (Exception e) {
e.printStackTrace();
}finally {
// Close resources
try {
if (fos != null) {
fos.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
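Each generated line of website.log is tab-separated: host, session_id, timestamp, IP. For example (the session_id, timestamp and last IP segment are random):
www.baidu.com	ABYH6Y4V4SCVXTG6DPB4VH9U123	2017-08-07 08:40:50	192.168.1.112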
(2) Spout class: reads the file and emits it to the bolt line by line
package com.atg;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import java.io.*;
import java.util.Map;
public class WebLogSpout extends BaseRichSpout {
private BufferedReader reader;
// SpoutOutputCollector emits data to the bolts; kept as a field because nextTuple() uses it
private SpoutOutputCollector collector;
private String str;
// The core method of the spout
public void nextTuple() {
try {
while ((str = reader.readLine() )!= null){
collector.emit(new Values(str)); // emit one line at a time
Thread.sleep(500);
}
}catch (Exception e){
e.printStackTrace();
}
}
// Initialization: open resources
public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector collector) {
this.collector=collector;
// Open the log file
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(
"E:/hadoop/hdpdata/logData/website.log")));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// Declare the output field; downstream bolts can subscribe to data by this field name
declarer.declare(new Fields("log"));
}
}
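Note: this nextTuple() reads the whole file in a while loop inside a single call, which blocks the spout task's single thread. Storm generally expects nextTuple() to return quickly and emit at most a few tuples per call (the PvSpout in case 2 below follows that pattern). A non-blocking variant might look like this sketch, assuming org.apache.storm.utils.Utils is also imported:
public void nextTuple() {
    try {
        if ((str = reader.readLine()) != null) {
            collector.emit(new Values(str)); // emit at most one line per call; Storm keeps calling nextTuple()
        } else {
            Utils.sleep(500);                // file exhausted: back off instead of busy-spinning
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}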
(3) Bolt class: receives the data, splits it, and counts the number of session_ids
package com.atg;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;
import java.util.Map;
public class WebLogBolt extends BaseRichBolt {
private int num;
// Process each incoming tuple
public void execute(Tuple tuple) {
// 1. Get the data
// String log = tuple.getStringByField("log"); // fetch by field name, matching the field declared in the spout
String log = tuple.getString(0); // Values(str0, str1) --> str0 is at offset 0, str1 at offset 1
// 2. Split the line and extract the session_id
String[] split = log.split("\t");
String session_id =split[1];
// Count the session_ids
if (session_id != null && !session_id.isEmpty()) {
num++;
System.out.println("Thread: " + Thread.currentThread().getId()
+ " ---- session_id: " + session_id + " total: " + num);
}
}
// Preparation (nothing to do here)
public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
}
// Declare output fields (none here)
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
}
}
(4) Topology class
package com.atg;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.topology.TopologyBuilder;
public class WebLogMain {
public static void main(String[] args) {
// 1. Build the topology
TopologyBuilder builder = new TopologyBuilder();
// Source of the stream; the id is user-defined. The last argument is the parallelism hint (number of executor threads); here it is 2, so two spout tasks each read the file and roughly twice as many tuples are emitted
builder.setSpout("WebLogSpout", new WebLogSpout(), 2);
// shuffleGrouping: the argument is the upstream component id; tuples are distributed round-robin, roughly evenly across the bolt's tasks (with a parallelism of 2 they would be split about evenly, with some variance)
builder.setBolt("WebLogBlot", new WebLogBolt(), 1)
.shuffleGrouping("WebLogSpout");
// 2. Create the config object and set the number of workers
Config config = new Config();
config.setNumWorkers(2);
// 3. Submit the topology
if (args.length > 0) { // cluster submission; the topology name comes from args[0]
try {
StormSubmitter.submitTopology(args[0], config, builder.createTopology());
} catch (Exception e) {
e.printStackTrace();
}
} else { // local submission
LocalCluster localCluster = new LocalCluster();
localCluster.submitTopology("webtopology", config, builder.createTopology());
}
}
}
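Running WebLogMain with no arguments starts a LocalCluster. With the spout parallelism set to 2, both spout tasks read website.log, so every line is emitted twice and the bolt's count roughly doubles, as the comments above note. The bolt's console output then looks roughly like this (thread ids and ordering will differ):
Thread: 97 ---- session_id: CYYH6Y2345GHI899OFG4V9U567 total: 1
Thread: 97 ---- session_id: VVVYH6Y4V4SFXZ56JIPDPB4V678 total: 2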
2. Case 2: computing website PV (page views) in real time
package com.pv;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import java.io.*;
import java.util.Map;
public class PvSpout extends BaseRichSpout {
SpoutOutputCollector _collector;
BufferedReader reader;
String line;
public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector collector) {
_collector=collector;
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream
("E:/hadoop/hdpdata/logData/website.log"),"UTF-8"));
} catch (Exception e) {
e.printStackTrace();
}
}
public void nextTuple() {
try{
if((line = reader.readLine()) !=null ){
_collector.emit(new Values(line));
Thread.sleep(1000);
}
}catch (Exception e){
e.printStackTrace();
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("line"));
}
public void close() {
try {
if (reader != null) {
reader.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
package com.pv;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.Map;
public class PvSplitBolt extends BaseRichBolt {
OutputCollector _collector;
long pv=0;
public void prepare(Map map, TopologyContext topologyContext, OutputCollector collector) {
_collector=collector;
}
public void execute(Tuple tuple) {
String data = tuple.getString(0);
String session_id = data.split("\t")[1];
if(session_id != null){
pv++;
}
_collector.emit(new Values(Thread.currentThread().getId(),pv));
System.err.println("thread id:" + Thread.currentThread().getId() + " pv:" + pv);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("threadID","pv"));
}
}
package com.pv;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
public class PVSumBolt extends BaseRichBolt {
Map<Long,Long> map = new HashMap<Long,Long>();
public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
}
public void execute(Tuple tuple) {
long threadID = tuple.getLong(0);
long PV = tuple.getLong(1);
map.put(threadID,PV);
long word_sum=0;
Iterator<Long> iterator = map.values().iterator();
while(iterator.hasNext()){
word_sum += iterator.next();
}
System.err.println("pv_all:" + word_sum);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
}
package com.pv;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.topology.TopologyBuilder;
public class PvMain {
public static void main(String[] args) {
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("PVSpout", new PvSpout(), 1);
builder.setBolt("PvSplitBolt", new PvSplitBolt(), 4).shuffleGrouping("PVSpout");
builder.setBolt("PVSumBolt", new PVSumBolt(), 1).shuffleGrouping("PvSplitBolt");
Config conf = new Config();
conf.setNumWorkers(2);
if (args.length > 0) {
try {
StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
} catch (Exception e) {
e.printStackTrace();
}
}else {
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("pvtopology", conf, builder.createTopology());
}
}
}
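How the aggregation works: each of the four PvSplitBolt tasks keeps its own running pv counter and emits its latest value tagged with its thread id; PVSumBolt overwrites the map entry for that thread id and re-sums all entries. For example, if the four split tasks have most recently reported pv values of 3, 5, 2 and 4, the map holds those four values and pv_all is printed as 14.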
3. Case 3: word count
Spout class
package com.wc;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
import java.util.Random;
public class SentenceSpout extends BaseRichSpout {
private static final Logger logger = LoggerFactory.getLogger(SentenceSpout.class);
SpoutOutputCollector _collector;
Random _rand;
public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
_collector = collector;
_rand = new Random();
}
public void nextTuple() {
Utils.sleep(1000);
String[] sentences = new String[]{
"1,2019092410010000,100.00",
"And nodding by the fire,take down this book",
"And slowly read,and dream of the soft look",
"Your eyes had once,and of their shadows deep",
"How many loved your moments of glad grace",
"And loved your beauty with love false or true"
};
final String sentence = sentences[_rand.nextInt(sentences.length)];
logger.info("Emitting tuple: "+ sentence);
_collector.emit(new Values(sentence));
}
@Override
public void ack(Object id) {
logger.info("ack id:" + id);
}
@Override
public void fail(Object id) {
logger.info("fail id:" + id);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("word"));
}
}
SplitSentenceBolt class
package com.wc;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
public class SplitSentenceBolt implements IRichBolt {
private static final Logger logger = LoggerFactory.getLogger(SplitSentenceBolt.class);
private OutputCollector _collector;
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("word"));
}
public Map<String, Object> getComponentConfiguration() {
return null;
}
public void prepare(Map stormConf, TopologyContext context,
OutputCollector collector) {
_collector = collector;
}
public void execute(Tuple tuple) {
logger.info(tuple.toString());
String sentence = tuple.getStringByField("word");
logger.info("sentence:" + sentence);
String[] words = sentence.split(" ");
for (String word : words) {
this._collector.emit(new Values(word));
}
}
public void cleanup() {
logger.info("cleanup");
}
}
WordCountBolt class
package com.wc;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.Map;
public class WordCountBolt extends BaseBasicBolt {
private static final Logger logger = LoggerFactory.getLogger(WordCountBolt.class);
Map<String, Integer> wordCountMap = new HashMap<String, Integer>();
public void execute(Tuple tuple, BasicOutputCollector collector) {
logger.info(tuple.toString());
String word = tuple.getString(0);
Integer count = wordCountMap.get(word);
count = (count == null) ? 0 : count;
count++;
wordCountMap.put(word, count);
logger.info("word:" + word + " count:" + count);
collector.emit(new Values(word, count));
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("word", "count"));
}
}
WordSumBolt class
package com.wc;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
public class WordSumBolt extends BaseBasicBolt {
private static final Logger logger = LoggerFactory.getLogger(WordSumBolt.class);
Map<String, Integer> counts = new HashMap<String, Integer>();
public void execute(Tuple tuple, BasicOutputCollector collector) {
String word = tuple.getStringByField("word");
Integer count = tuple.getIntegerByField("count");
logger.info("word:" + word + " count:" + count);
this.counts.put(word, count);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
@Override
public void cleanup() {
logger.info("-----------------汇总结果-----------------------");
List<String> keys = new ArrayList<String>();
keys.addAll(this.counts.keySet());
Collections.sort(keys);
for (String key : keys) {
logger.info(key + " : " + this.counts.get(key));
}
}
}
WordCountTopology class
package com.wc;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
public class WordCountTopology {
public static void main(String[] args) throws Exception {
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("spout", new SentenceSpout(), 5);
// shuffleGrouping: each tuple is sent to a randomly chosen task
builder.setBolt("split", new SplitSentenceBolt(), 8).shuffleGrouping("spout");
// fieldsGrouping: consistent-hashes on the given tuple fields; tuples with the same hash go to the same task
builder.setBolt("count", new WordCountBolt(), 12).fieldsGrouping("split", new Fields("word"));
// globalGrouping: all tuples are sent to the task with the lowest id in the bolt
builder.setBolt("sum", new WordSumBolt(), 6).globalGrouping("count");
Config conf = new Config();
conf.setDebug(true);
if (args != null && args.length > 0) {
conf.setNumWorkers(3);
StormSubmitter.submitTopologyWithProgressBar(args[0], conf, builder.createTopology());
}
else {
conf.setMaxTaskParallelism(3);
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("word-count", conf, builder.createTopology());
Thread.sleep(10*60*1000);
cluster.shutdown();
}
}
}
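In this topology, fieldsGrouping on "word" guarantees that every occurrence of the same word reaches the same WordCountBolt task, so each task's local HashMap holds the complete count for the words routed to it. The single WordSumBolt task, reached via globalGrouping, then keeps the latest count per word and prints the sorted totals in cleanup() when the local cluster shuts down after the 10-minute sleep.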