目录
1.4.3 提交任务后nimbus和zookeeper的目录树
Storm基本概念
以下资料整理自互联网
1.1 Storm 计算模型
1.2 WC经典例子
首先是输入日志文件:
package worldCount;
import java.util.Map;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
/**
 * Demo spout: emits the same simulated sentence every 2 seconds
 * on a single output field named "love".
 */
public class worldCountSpout extends BaseRichSpout{
	private static final long serialVersionUID = 1L;
	// Collector handed to us by the framework in open(); used to emit tuples.
	private SpoutOutputCollector collector;

	@Override
	public void nextTuple() {
		// Emit simulated data. Values is effectively a dynamic array,
		// so the whole sentence becomes the single "love" field.
		collector.emit(new Values("i am ximen love jinlian"));
		// Throttle emission to one tuple every 2 seconds.
		try {
			Thread.sleep(2000);
		} catch (InterruptedException e) {
			// Restore the interrupt flag so the worker can shut down cleanly.
			Thread.currentThread().interrupt();
		}
	}

	@Override
	public void open(Map arg0, TopologyContext arg1, SpoutOutputCollector arg2) {
		// BUG FIX: the original wrote "this.collector = collector", a
		// self-assignment that left the field null and caused an NPE in
		// nextTuple(). Store the collector supplied by the framework.
		this.collector = arg2;
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer arg0) {
		// BUG FIX: the original called declare() on an uninitialized
		// "declarer" field (NPE). Use the declarer passed in by Storm.
		arg0.declare(new Fields("love"));
	}
}
其次是划分句子:
package worldCount;
import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
/**
 * Splits each incoming sentence on spaces and emits one ("word", 1)
 * pair per word on the fields ("word", "num").
 */
public class wordCountSplitBolt extends BaseRichBolt {
	private static final long serialVersionUID=1L;
	// Collector handed to us by the framework in prepare(); used to emit tuples.
	private OutputCollector collector;

	@Override
	public void execute(Tuple input) {
		// Data arrives one line at a time. The upstream spout declared a
		// single field, so the sentence is at position 0.
		String line = input.getString(0);
		String[] arrWords = line.split(" "); // split on spaces
		// Emit each word with an initial count of 1.
		for(String word:arrWords) {
			collector.emit(new Values(word,1));
		}
	}

	@Override
	public void prepare(Map arg0, TopologyContext arg1, OutputCollector arg2) {
		// BUG FIX: the original wrote "this.collector = collector", a
		// self-assignment that left the field null and caused an NPE in
		// execute(). Store the collector supplied by the framework.
		this.collector = arg2;
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer arg0) {
		// BUG FIX: the original called declare() on an uninitialized
		// "declarer" field (NPE). Use the declarer passed in by Storm.
		// Downstream sees ("word", "num") key-value pairs.
		arg0.declare(new Fields("word","num"));
	}
}
切完日志后要统计:
package worldCount;
import java.util.HashMap;
import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;
/**
 * Terminal bolt: keeps a per-task running total for every word it
 * receives and prints the updated count. Declares no output fields
 * because nothing is emitted downstream.
 */
public class wordCountSumBolt extends BaseRichBolt {
	private static final long serialVersionUID = 1L;
	// Running word -> count totals, local to this bolt task.
	private Map<String,Integer> map = new HashMap<String,Integer>();

	@Override
	public void execute(Tuple input) {
		// Unpack the ("word", "num") pair produced by the split bolt.
		String word = input.getString(0);
		Integer num = input.getInteger(1);
		// Accumulate: start a new total if absent, otherwise add to it.
		Integer previous = map.get(word);
		map.put(word, previous == null ? num : previous + num);
		System.err.println(Thread.currentThread().getId()+" word:"+word+
				" num:"+map.get(word));
	}

	@Override
	public void prepare(Map arg0, TopologyContext arg1, OutputCollector arg2) {
		// Nothing to set up: this bolt never emits, so the collector is unused.
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer arg0) {
		// No downstream consumers, so no fields are declared.
	}
}
最后主程序:
package worldCount;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
/**
 * Wires up the word-count topology: spout -> split bolt -> sum bolt.
 * With arguments it submits to the cluster (args[0] is the topology
 * name); without arguments it runs in an in-process local cluster.
 */
public class wordCountMain {
	public static void main(String[] args) {
		TopologyBuilder builder = new TopologyBuilder();
		builder.setSpout("wordCountSpout", new worldCountSpout(), 1);
		// shuffleGrouping: sentences from the spout are distributed randomly
		// across the 2 split-bolt executors.
		builder.setBolt("wordCountSplit", new wordCountSplitBolt(), 2)
				.shuffleGrouping("wordCountSpout");
		// fieldsGrouping on "word" is essential here: it guarantees every
		// occurrence of a given word reaches the same one of the 4 sum-bolt
		// executors; without it, the same word would scatter across bolts and
		// an extra aggregation bolt would be needed.
		builder.setBolt("wordCountSum", new wordCountSumBolt(), 4)
				.fieldsGrouping("wordCountSplit", new Fields("word"));

		Config conf = new Config();
		conf.setNumWorkers(2);

		if (args.length > 0) {
			// Cluster submission: topology name comes from the command line.
			try {
				StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
			} catch (Exception e) {
				e.printStackTrace();
			}
		} else {
			// Local mode for debugging inside the IDE.
			LocalCluster localCluster = new LocalCluster();
			localCluster.submitTopology("wordCountTopology", conf, builder.createTopology());
		}
	}
}
补充统计日志文件中ID次数(使用IO管道来打开文件,记得关闭):
package com.sxt.storm.grouping;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Map;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
/**
 * File-reading spout: opens "track.log" in open(), then emits one tuple
 * per line on the fields ("log", "session_id"), where session_id is the
 * second tab-separated column of the line.
 */
public class MySpout implements IRichSpout {
	private static final long serialVersionUID = 1L; // serialization version
	FileInputStream fis;
	InputStreamReader isr; // I/O stream chain opened in open()
	BufferedReader br;
	SpoutOutputCollector collector = null;
	String str = null;

	@Override
	public void nextTuple() {
		try {
			while ((str = this.br.readLine()) != null) {
				String[] cols = str.split("\t");
				// BUG FIX: the original indexed cols[1] unconditionally, so a
				// line without a tab threw AIOOBE and aborted the read loop.
				// Skip malformed lines instead.
				if (cols.length > 1) {
					collector.emit(new Values(str, cols[1]));
				}
			}
		} catch (Exception e) {
			// BUG FIX: the original swallowed every exception silently,
			// hiding read failures. At least surface them.
			e.printStackTrace();
		}
	}

	@Override
	public void close() {
		try {
			// Close the stream chain once the spout is done with it.
			br.close();
			isr.close();
			fis.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	@Override
	public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
		try {
			this.collector = collector;
			// Chain the streams for fast buffered line-by-line reading.
			this.fis = new FileInputStream("track.log");
			this.isr = new InputStreamReader(fis, "UTF-8");
			this.br = new BufferedReader(isr);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("log", "session_id"));
	}

	@Override
	public Map<String, Object> getComponentConfiguration() {
		return null;
	}

	@Override
	public void ack(Object msgId) {
		System.out.println("spout ack:" + msgId.toString());
	}

	@Override
	public void activate() {
	}

	@Override
	public void deactivate() {
	}

	@Override
	public void fail(Object msgId) {
		System.out.println("spout fail:" + msgId.toString());
	}
}
package com.sxt.storm.grouping;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Map;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
/**
 * File-reading spout (verbatim duplicate of the listing above): reads
 * "track.log" and emits one tuple per line on ("log", "session_id"),
 * where session_id is the second tab-separated column.
 */
public class MySpout implements IRichSpout {
	private static final long serialVersionUID = 1L; // serialization version
	FileInputStream fis;
	InputStreamReader isr; // I/O stream chain opened in open()
	BufferedReader br;
	SpoutOutputCollector collector = null;
	String str = null;

	@Override
	public void nextTuple() {
		try {
			while ((str = this.br.readLine()) != null) {
				String[] cols = str.split("\t");
				// BUG FIX: the original indexed cols[1] unconditionally, so a
				// line without a tab threw AIOOBE and aborted the read loop.
				// Skip malformed lines instead.
				if (cols.length > 1) {
					collector.emit(new Values(str, cols[1]));
				}
			}
		} catch (Exception e) {
			// BUG FIX: the original swallowed every exception silently,
			// hiding read failures. At least surface them.
			e.printStackTrace();
		}
	}

	@Override
	public void close() {
		try {
			// Close the stream chain once the spout is done with it.
			br.close();
			isr.close();
			fis.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	@Override
	public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
		try {
			this.collector = collector;
			// Chain the streams for fast buffered line-by-line reading.
			this.fis = new FileInputStream("track.log");
			this.isr = new InputStreamReader(fis, "UTF-8");
			this.br = new BufferedReader(isr);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("log", "session_id"));
	}

	@Override
	public Map<String, Object> getComponentConfiguration() {
		return null;
	}

	@Override
	public void ack(Object msgId) {
		System.out.println("spout ack:" + msgId.toString());
	}

	@Override
	public void activate() {
	}

	@Override
	public void deactivate() {
	}

	@Override
	public void fail(Object msgId) {
		System.out.println("spout fail:" + msgId.toString());
	}
}
package com.sxt.storm.grouping;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
public class Main {
/**
 * Demo topology illustrating Storm's stream grouping strategies.
 * The commented-out setBolt lines are intentional alternatives; exactly
 * one grouping is active at a time (allGrouping here).
 *
 * @param args optional; args[0] is the topology name for cluster submission
 */
public static void main(String[] args) {
TopologyBuilder builder = new TopologyBuilder();
builder.setSpout("spout", new MySpout(), 1);
// shuffleGrouping sends tuples randomly downstream, giving natural load
// balancing: with 80 records and 2 bolt threads, each gets ~40, so the
// same log id may end up in different threads.
// builder.setBolt("bolt", new MyBolt(), 2).shuffleGrouping("spout");
// fieldsGrouping works like the MapReduce shuffle: hash the named field
// and take the modulo, so each bolt thread always sees the same
// session_id values.
// builder.setBolt("bolt", new MyBolt(), 2).fieldsGrouping("spout", new Fields("session_id"));
// globalGrouping sends everything to a single task — the one with the
// smallest taskId.
// builder.setBolt("bolt", new MyBolt(), 2).globalGrouping("spout");
// noneGrouping currently behaves the same as shuffleGrouping.
// builder.setBolt("bolt", new MyBolt(), 2).noneGrouping("spout");
// allGrouping broadcasts every tuple to all bolt tasks.
builder.setBolt("bolt", new MyBolt(), 2).allGrouping("spout");
// Map conf = new HashMap();
// conf.put(Config.TOPOLOGY_WORKERS, 4);
Config conf = new Config();
conf.setDebug(false);
conf.setMessageTimeoutSecs(30);
if (args.length > 0) {
try {
// With arguments: submit to the real cluster under the given name.
StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
} catch (AlreadyAliveException e) {
e.printStackTrace();
} catch (InvalidTopologyException e) {
e.printStackTrace();
}
} else {
// No arguments: run in an in-process local cluster for debugging.
LocalCluster localCluster = new LocalCluster();
localCluster.submitTopology("mytopology", conf, builder.createTopology());
}
}
}
补充:1.在写实际业务时不知道要实现哪些函数,就去看源码。2.在windows上配置环境时记得添加jar包。
注意上面的WC例子是原来编写的,统计日志文件中ID次数才是下面配置方法编写代码的配置,这样不用手动添加import 对应的包,比较方便,且添加后自动出现对应的抽象方法。
1.3 window上调试storm代码环境配置
通过上面的方法能直接得到对应的抽象方法,后面直接实例化即可实现业务逻辑。同理:bolt是BaseRichBolt.
如果不知道每个方法的参数是什么,可以查看源码。 如下:
1.4 Storm 架构
通过zookeeper作用来弱化nimbus的作用。Worker里面执行spout和bolt。
1.4.1 Hadoop与Storm的对比
| Hadoop | Storm |
主节点 | ResourceManager | Nimbus |
从节点 | NodeManager | Supervisor |
应用程序 | Job | Topology |
工作进程 | Child | Worker |
计算模型 | Map/Reduce(split,map,shuffle,reduce) | Spout/Bolt |
1.4.2 Storm任务提交流程
其中,nimbus将任务所需的全部上传zookeeper就完成了自身在本任务上的使命。nimbus相当于马云,zookeeper相当于逍遥子。
1.4.3 提交任务后nimbus和zookeeper的目录树
1.5 Storm的部署
注意伪分布式(单机)安装时自带zookeeper,而完全分布式则需要安装zookeeper。
一、环境要求
JDK 1.6+
java -version
Python 2.6.6+
python -V
ZooKeeper3.4.5+
storm 0.9.4+
--------------------------------------------------------------------
二、单机模式
##单机模式即为伪分布式,不需要单独安装zookeeper,是自带的。
##上传解压
$ tar xf apache-storm-0.9.4.tar.gz
$ cd apache-storm-0.9.4
## storm安装目录下创建log,用来追踪错误信息,排查原因。2>&1中的2表示错误信息,1表示指定的输出;&表示
## 后台运行。
$ mkdir logs
$ ./bin/storm --help
##下面分别启动ZooKeeper、Nimbus、UI、supervisor、logviewer
### 不会指令时, storm 就会出现各种指令,使用storm help 指令 就能查看使用指南
$ ./bin/storm dev-zookeeper >> ./logs/zk.out 2>&1 &
$ ./bin/storm nimbus >> ./logs/nimbus.out 2>&1 &
$ ./bin/storm ui >> ./logs/ui.out 2>&1 &
$ ./bin/storm supervisor >> ./logs/supervisor.out 2>&1 &
$ ./bin/storm logviewer >> ./logs/logviewer.out 2>&1 &
##需要等一会儿
$ jps
6966 Jps
6684 logviewer
6680 dev_zookeeper
6681 nimbus
6682 core
6683 supervisor
## node01为对应的主机IP映射,先看是否通信端口已经准备就绪:ss -nal
## 启动后浏览器: http://node01:8080
## 提交任务到Storm集群当中运行:
$ ./bin/storm jar examples/storm-starter/storm-starter-topologies-0.9.4.jar storm.starter.WordCountTopology wordcount
$ ./bin/storm jar examples/storm-starter/storm-starter-topologies-0.9.4.jar storm.starter.WordCountTopology test
-------------------------------------------------------------------------------------------
三、完全分布式安装部署
##各节点分配:
Nimbus Supervisor Zookeeper
node1 1 1
node2 1 1
node3 1 1
##开始配置(下面的目录是Strom目录下的对应目录)
$ vim conf/storm.yaml
storm.zookeeper.servers:
- "node1"
- "node2"
- "node3"
storm.local.dir: "/tmp/storm" ##指定存放任务信息的临时性目录,好处是关机就清理了
nimbus.host: "node1" ####node1作为nimbus,主节点编号
##四个worker的通信端口 很严格,多个空格都会错
supervisor.slots.ports:
- 6700
- 6701
- 6702
- 6703
##上面的配置信息一定要小心,所有都是紧靠左边编写。
##在storm目录中创建logs目录,用来存储错误信息
$ mkdir logs
##配置环境变量,记得改对应版本号
#STORM_HOME
export STORM_HOME=/home/hadoop/app/apache-storm-1.1.0
export PATH=$PATH:$STORM_HOME/bin
##(分发)集群其他服务器
$ scp -r storm/ centos71:/home/hzq/software/
$ scp -r /etc/profile 主机名(IP):/etc/
##启动ZooKeeper集群
$ bin/zkServer.sh start
## jps查看: QuorumPeerMain 有这个进程表示成功。
$ bin/zkServer.sh status ##这个查看节点对应状态即是否为主节点。
##node1上启动Nimbus
$ storm nimbus >> ./logs/nimbus.out 2>&1 &
##后台启动,记得在storm主目录
$ tail -f logs/nimbus.log
$ storm ui >> ./logs/ui.out 2>&1 &
$ tail -f logs/ui.log
节点node2和node3启动supervisor,按照配置,每启动一个supervisor就有了4个slots
$ storm supervisor >> ./logs/supervisor.out 2>&1 &
$ tail -f logs/supervisor.log
(当然,node1也可以启动supervisor)
http://node1:8080/
##提交任务到Storm集群当中运行:
## storm jar jar包 类的包名.类 任务别名
$ storm jar examples/storm-starter/storm-starter-topologies-0.9.4.jar storm.starter.WordCountTopology test
1.6 Storm并发机制和容错机制
1.6.1 基本概念
1.6.2 进程线程和任务的设置
2个进程(worker),10个线程(executor,每个线程可以执行多个任务,如这里每个线程执行2个task),12个task(一般说的task不包含确认机制中task)。
再如:
总共2个进程,8个线程executor,10个任务(不算ACK的任务),总的线程和任务均分到每个进程上,如下图所示。其中这里tasks为12是因为还有确认机制的2个task。
1.6.3 进程和线程的动态调整(任务数无法调整)
补充:快速关掉zookeeper和storm的命令: killall java
1.6.4 Storm的容错机制
ack的容错机制,可以看源代码的说明:
1.6.5 事务