参考:http://shiyanjun.cn/archives/934.html
1 zookeeper安装
zookeeper的安装很简单,只需要解压后,修改下zoo.cfg,配置dataDir和server.1=节点1:2888:3888
server.2=节点2:2888:3888
server.3=节点3:2888:3888
然后在每个节点指定的datadir下建立myid文件并写入编号
同时注意修改每个节点的hosts文件
启动命令,需要三个节点分别启动:./zkServer.sh start
查看状态 ./zkServer.sh status
执行创建节点的命令 ./zkCli.sh
比如创建kafka用的目录 /kafka
2 storm安装, 可以看单独介绍此安装的那篇
3 kafka安装
vim server.properties
broker.id=三个节点分别是0,1,2
zookeeper.connect=zk节点1:2181,zk节点2:2181,zk节点3:2181/kafka
/kafka 是因为默认会建立在zk根目录下,为了跟其它共用zk集群的区分开,放在kafka目录下
/kafka需要去zk中创建 ./zkCli.sh 进入命令行 执行 create /kafka ''即可
启动命令,需要三个节点分别启动:./kafka-server-start.sh server.properties路径 &
自带了生成消息和消费消息的脚本,可以做测试用
kafka-console-producer.sh kafka-console-consumer.sh
我们创建一个名称为my-test-topic5的Topic,5个分区,并且复制因子为3,执行如下命令:
bin/kafka-topics.sh --create --zookeeper zk节点1:2181,zk节点2:2181,zk节点3:2181/kafka --replication-factor 3 --partitions 5 --topic my-test-topic5
查看创建的Topic,执行如下命令:
bin/kafka-topics.sh --describe --zookeeper zk节点1:2181,zk节点2:2181,zk节点3:2181/kafka --topic my-test-topic5
我们可以通过Kafka自带的bin/kafka-console-producer.sh和bin/kafka-console-consumer.sh脚本,来验证演示如何发布消息、消费消息。
在一个终端,启动Producer,并向我们上面创建的名称为my-test-topic5的Topic中生产消息,执行如下脚本:
bin/kafka-console-producer.sh --broker-list kafka节点1:9092,kafka节点2:9092,kafka节点3:9092 --topic my-test-topic5
在另一个终端,启动Consumer,并订阅我们上面创建的名称为my-test-topic5的Topic中生产的消息,执行如下脚本
bin/kafka-console-consumer.sh --zookeeper zk节点1:2181,zk节点2:2181,zk节点3:2181/kafka --from-beginning --topic my-test-topic5
可以在Producer终端上输入字符串消息行,然后回车,就可以在Consumer终端上看到消费者消费的消息内容。
4 kafka与storm集成
建一个maven工程,pom.xml如下:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.test</groupId>
<artifactId>stormnew</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Storm runtime API; scope 'provided' because the Storm cluster supplies
     these classes at runtime — they must NOT be packaged into the topology jar. -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>0.9.5</version>
<scope>provided</scope>
</dependency>
<!-- Kafka spout integration (storm.kafka.*); version must match storm-core. -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-kafka</artifactId>
<version>0.9.5</version>
</dependency>
<!-- Kafka client (Scala 2.10 build). ZooKeeper and log4j are excluded to avoid
     clashing with the versions already bundled by the Storm cluster. -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
<version>0.8.2.1</version>
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>
建一个拓扑类,内容参考简单之美的文章:
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
/**
 * Word-count topology reading lines from a Kafka topic.
 *
 * <p>Pipeline: KafkaSpout (reads topic "my-test-topic5") -> KafkaWordSplitter
 * (splits lines into words) -> WordCounter (aggregates per-word counts in memory).
 *
 * <p>Run with a nimbus host argument to submit to a cluster; with no arguments it
 * runs in a LocalCluster for 60 seconds and then shuts down.
 */
public class MyKafkaTopology {

    /**
     * Bolt that splits each incoming line on whitespace and emits one
     * ("word", 1) tuple per word, anchored to the input tuple.
     */
    public static class KafkaWordSplitter extends BaseRichBolt {

        private static final Log LOG = LogFactory.getLog(KafkaWordSplitter.class);
        private static final long serialVersionUID = 886149197481637894L;
        private OutputCollector collector;

        @Override
        public void prepare(Map stormConf, TopologyContext context,
                OutputCollector collector) {
            this.collector = collector;
        }

        @Override
        public void execute(Tuple input) {
            String line = input.getString(0);
            // Guard: a Kafka message payload may be null or blank. Without this
            // check, split() throws an NPE, the tuple fails, and Storm replays
            // it forever. Ack and drop such messages instead.
            if (line == null || line.trim().isEmpty()) {
                collector.ack(input);
                return;
            }
            LOG.info("RECV[kafka -> splitter] " + line);
            String[] words = line.split("\\s+");
            for (String word : words) {
                LOG.info("EMIT[splitter -> counter] " + word);
                // Anchor to the input tuple so a downstream failure replays it.
                collector.emit(input, new Values(word, 1));
            }
            collector.ack(input);
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word", "count"));
        }
    }

    /**
     * Bolt that keeps an in-memory running count per word. Counts are only
     * logged (on each tuple and in cleanup()); nothing is emitted downstream.
     * Note: counts live in task-local memory and are lost on worker restart.
     */
    public static class WordCounter extends BaseRichBolt {

        private static final Log LOG = LogFactory.getLog(WordCounter.class);
        private static final long serialVersionUID = 886149197481637894L;
        private OutputCollector collector;
        private Map<String, AtomicInteger> counterMap;

        @Override
        public void prepare(Map stormConf, TopologyContext context,
                OutputCollector collector) {
            this.collector = collector;
            this.counterMap = new HashMap<String, AtomicInteger>();
        }

        @Override
        public void execute(Tuple input) {
            String word = input.getString(0);
            int count = input.getInteger(1);
            LOG.info("RECV[splitter -> counter] " + word + " : " + count);
            AtomicInteger ai = this.counterMap.get(word);
            if (ai == null) {
                ai = new AtomicInteger();
                this.counterMap.put(word, ai);
            }
            ai.addAndGet(count);
            collector.ack(input);
            LOG.info("CHECK statistics map: " + this.counterMap);
        }

        @Override
        public void cleanup() {
            // Called on (graceful) shutdown; dumps the final per-word totals.
            LOG.info("The final result:");
            Iterator<Entry<String, AtomicInteger>> iter = this.counterMap.entrySet().iterator();
            while (iter.hasNext()) {
                Entry<String, AtomicInteger> entry = iter.next();
                LOG.info(entry.getKey() + "\t:\t" + entry.getValue().get());
            }
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word", "count"));
        }
    }

    public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException, InterruptedException {
        // ZooKeeper ensemble used by the Kafka cluster; the /kafka chroot matches
        // the zookeeper.connect setting in Kafka's server.properties.
        String zks = "zk主节点1:2181,zk主节点2:2181,zk主节点3:2181/kafka";
        String topic = "my-test-topic5";
        String zkRoot = "/storm"; // default zookeeper root configuration for storm
        String id = "word";       // consumer id: spout offsets stored under zkRoot/id
        // "/brokers" is where Kafka registers its brokers under the chroot above.
        BrokerHosts brokerHosts = new ZkHosts(zks, "/brokers");
        // The ZooKeeper address for Storm itself comes from the cluster config,
        // so it is not set here.
        SpoutConfig spoutConf = new SpoutConfig(brokerHosts, topic, zkRoot, id);
        spoutConf.scheme = new SchemeAsMultiScheme(new StringScheme());
        // false = resume from the last committed offset rather than the beginning.
        spoutConf.forceFromStart = false;
        // ZooKeeper servers used by the spout to persist its consumed offsets.
        spoutConf.zkServers = Arrays.asList(new String[] {"slave170", "slave171", "slave172"});
        spoutConf.zkPort = 2181;

        TopologyBuilder builder = new TopologyBuilder();
        // Spout parallelism 5 matches the topic's 5 partitions (one task each).
        builder.setSpout("kafka-reader", new KafkaSpout(spoutConf), 5);
        builder.setBolt("word-splitter", new KafkaWordSplitter(), 2).shuffleGrouping("kafka-reader");
        // Fields grouping on "word" guarantees all tuples for the same word go
        // to the same counter task, so the per-word totals are correct.
        builder.setBolt("word-counter", new WordCounter()).fieldsGrouping("word-splitter", new Fields("word"));

        Config conf = new Config();
        String name = MyKafkaTopology.class.getSimpleName();
        if (args != null && args.length > 0) {
            // Nimbus host name passed from command line
            conf.put(Config.NIMBUS_HOST, args[0]);
            conf.setNumWorkers(3);
            StormSubmitter.submitTopologyWithProgressBar(name, conf, builder.createTopology());
        } else {
            // No arguments: run in-process for local testing, then shut down.
            conf.setMaxTaskParallelism(3);
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology(name, conf, builder.createTopology());
            Thread.sleep(60000);
            cluster.shutdown();
        }
    }
}
使用idea的maven操作界面打包,并上传到storm集群所在主节点
(1)任务提交
执行storm jar stormnew-1.0-SNAPSHOT.jar MyKafkaTopology storm主节点名称--主节点名称是必需的
在开始遇到一系列报错,依次通过往storm的lib中加jar解决,具体报错及对应jar包会单独整理一篇。
最后需要重启下storm集群,成功的话会提示成功提交topology
在8080web界面可以看到对应的topology
(2)接收kafka消息
启动kafka自带的producer脚本,bin/kafka-console-producer.sh --broker-list kafka节点1:9092,kafka节点2:9092,kafka节点3:9092 --topic my-test-topic5
输入消息,回车,看下storm中的worker日志,会有对应的输出,对应拓扑代码中的log.info
我的storm集群共12个worker可以用,在测试的过程中发现一直是使用了3个worker,另外9个空闲,且这3个worker可能会在1个节点上启动,也可能会分散在三个节点启动。
通过jps及8080页面都能看到在哪个节点上启动了
根据storm配置文件的设置,4个worker对应端口分别是6700,6701,6702,6703
对应的日志文件将是work-670X.log