介绍
今天来实现一个Storm数据流处理综合案例的第一部分,Storm集群向Kafka集群源源不断写入数据,并部署为远程模式
准备工作
搭建三台Kafka集群服务器
搭建三台Storm集群服务器
启动Kafka集群
启动Zookeeper
Zookeeper启动需要一定时间,建议等一分钟再操作其他命令
cd /usr/local/kafka/zookeeper
./bin/zkServer.sh start
启动Kafka集群
cd /usr/local/kafka/kafka
./bin/kafka-server-start.sh -daemon ./config/server.properties
查看启动的进程
注:jps命令需要安装JDK环境
jps
查看Topic
我们使用kafkatopic主题接收消息
cd /usr/local/kafka/kafka
./bin/kafka-topics.sh --list --bootstrap-server 192.168.217.151:9092
启动Storm集群
启动Zookeeper
zkServer.sh start
Zookeeper启动需要一定时间,建议等一分钟再操作其他命令
zkServer.sh status
启动Storm
主节点Nimbus
nohup storm nimbus > nimbus.out 2>&1 &
nohup storm ui > ui.out 2>&1 &
（注意:两条命令要分别重定向到不同的日志文件,否则第二条会覆盖/混写第一条的输出）
从节点Supervisor
nohup storm supervisor > supervisor.out 2>&1 &
nohup storm logviewer > logviewer.out 2>&1 &
（同样分别重定向到不同的日志文件,避免输出互相覆盖）
查看进程
注:jps命令需要安装JDK环境
jps
代码编写
思路:Storm集群向Kafka集群的kafkatopic发送消息,Kafka集群启动两个消费者查看是否收到消息
项目结构
项目文件
POM
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Storm -> Kafka integration demo. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.heibaiying</groupId>
<artifactId>storm-kafka-integration</artifactId>
<version>1.0</version>
<properties>
<!-- Storm 1.2.x with its matching storm-kafka-client; Kafka client 2.2.0. -->
<storm.version>1.2.2</storm.version>
<kafka.version>2.2.0</kafka.version>
</properties>
<dependencies>
<!-- Storm runtime; excluded from the shaded jar via the artifactSet below
     because the cluster already provides it. -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>${storm.version}</version>
</dependency>
<!-- KafkaBolt / Kafka spout integration for Storm. -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-kafka-client</artifactId>
<version>${storm.version}</version>
</dependency>
<!-- Kafka producer/consumer client used by KafkaBolt at runtime. -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>${kafka.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compile for Java 8 (source and bytecode target). -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
<!-- Package with maven-shade to build a single runnable fat jar. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
<!-- Strip dependency signature files; stale signatures would make the
     JVM reject the merged jar as tampered. -->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.sf</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.dsa</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/*.rsa</exclude>
<exclude>META-INF/*.EC</exclude>
<exclude>META-INF/*.ec</exclude>
<exclude>META-INF/MSFTSIG.SF</exclude>
<exclude>META-INF/MSFTSIG.RSA</exclude>
</excludes>
</filter>
</filters>
<!-- Do not bundle storm-core: the Storm cluster supplies it on the
     worker classpath, and bundling it causes classloader conflicts. -->
<artifactSet>
<excludes>
<exclude>org.apache.storm:storm-core</exclude>
</excludes>
</artifactSet>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<!-- Merge META-INF/services entries and manifests from all
     dependencies instead of letting them overwrite each other. -->
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
Spout
package write;
import org.apache.storm.shade.org.apache.commons.lang.StringUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
import java.util.*;
/**
* 产生词频样本的数据源
*/
/**
 * Spout that produces sample word-frequency data.
 *
 * <p>Once per second it emits a tuple of the form {@code (key, message)},
 * where the message is a tab-separated list of 1..N technology names drawn
 * from a fixed word list. The declared field names {@code "key"} and
 * {@code "message"} match the defaults used by
 * {@code FieldNameBasedTupleToKafkaMapper} downstream.
 */
public class DataSourceSpout extends BaseRichSpout {

    /** Fixed vocabulary the simulated lines are built from. */
    private final List<String> list = Arrays.asList("Spark", "Hadoop", "HBase", "Storm", "Flink", "Hive");

    /**
     * Random source for line lengths. A single field instance replaces the
     * original per-call {@code new Random()}, which wasted allocations and
     * can seed poorly when called in quick succession.
     */
    private final Random random = new Random();

    private SpoutOutputCollector spoutOutputCollector;

    @Override
    public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
        this.spoutOutputCollector = spoutOutputCollector;
    }

    @Override
    public void nextTuple() {
        // Generate one simulated line and emit it as (key, message).
        String lineData = productData();
        spoutOutputCollector.emit(new Values("key", lineData));
        System.err.println(lineData);
        // Throttle to one tuple per second so the demo output stays readable.
        Utils.sleep(1000);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("key", "message"));
    }

    /**
     * Builds one simulated data line: shuffles the word list and joins a
     * random prefix of 1..size words with tab characters.
     */
    private String productData() {
        Collections.shuffle(list);
        // nextInt(size) already yields [0, size-1]; the original extra
        // "% size" was redundant. +1 guarantees at least one word.
        int endIndex = random.nextInt(list.size()) + 1;
        // String.join over a subList replaces the join from Storm's shaded
        // commons-lang StringUtils — same output, no shaded-internal dependency.
        return String.join("\t", list.subList(0, endIndex));
    }
}
Topology
package write;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.kafka.bolt.KafkaBolt;
import org.apache.storm.kafka.bolt.mapper.FieldNameBasedTupleToKafkaMapper;
import org.apache.storm.kafka.bolt.selector.DefaultTopicSelector;
import org.apache.storm.topology.TopologyBuilder;
import java.util.Properties;
/**
* 写入数据到Kafka中
*/
/**
 * Topology that writes generated data into Kafka.
 *
 * <p>{@link DataSourceSpout} produces {@code (key, message)} tuples, which a
 * {@link KafkaBolt} forwards to the {@code kafkatopic} topic. Run with the
 * single argument {@code cluster} to submit to a remote Storm cluster;
 * with no arguments it runs in an in-process local cluster.
 */
public class WritingToKafkaApp {

    /**
     * Broker address list. It need not contain every broker — the producer
     * discovers the rest from those given — but at least two entries are
     * recommended for fault tolerance.
     */
    private static final String BOOTSTRAP_SERVERS = "192.168.217.151:9092,192.168.217.152:9092";

    /** Target Kafka topic. */
    private static final String TOPIC_NAME = "kafkatopic";

    public static void main(String[] args) {
        TopologyBuilder builder = new TopologyBuilder();

        // Kafka producer configuration.
        Properties props = new Properties();
        props.put("bootstrap.servers", BOOTSTRAP_SERVERS);
        /*
         * acks controls how many replicas must acknowledge a write before the
         * producer considers it successful:
         *   acks=0   - do not wait for any broker response
         *   acks=1   - wait for the partition leader only
         *   acks=all - wait for all in-sync replicas (strongest guarantee)
         */
        props.put("acks", "all");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        // Parameterized KafkaBolt (the original used the raw type) so the
        // compiler checks the key/value types end to end.
        KafkaBolt<String, String> bolt = new KafkaBolt<String, String>()
                .withProducerProperties(props)
                .withTopicSelector(new DefaultTopicSelector(TOPIC_NAME))
                // Maps the tuple fields "key"/"message" onto the Kafka record.
                .withTupleToKafkaMapper(new FieldNameBasedTupleToKafkaMapper<>());

        builder.setSpout("sourceSpout", new DataSourceSpout(), 2).setNumTasks(2);
        builder.setBolt("kafkaBolt", bolt, 2).shuffleGrouping("sourceSpout").setNumTasks(2);

        Config config = new Config();
        config.setNumWorkers(4);
        config.setDebug(true);

        // Constant-first equals avoids any question of args[0] handling and
        // matches the original "cluster" trigger exactly.
        if (args.length > 0 && "cluster".equals(args[0])) {
            // Remote mode: submit the topology to the cluster's Nimbus.
            try {
                StormSubmitter.submitTopology("StormClusterWritingToKafkaClusterApp", config, builder.createTopology());
            } catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
                e.printStackTrace();
            }
        } else {
            // Local mode: run in an in-process cluster for development.
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology("LocalWritingToKafkaApp",
                    config, builder.createTopology());
        }
    }
}
远程部署
打包
使用shade打包
上传文件
文件只需上传到Nimbus节点即可
使用Nimbus节点启动Topology
storm jar storm-kafka-integration-1.0.jar write.WritingToKafkaApp cluster
UI查看
验证测试
思路:在服务器的Kafka集群创建Kafka消费者,观察消费者能否收到Storm集群发送的消息
分别在两台Supervisor节点创建消费者,等待Storm生产者发送消息
Supervisor 1
cd /usr/local/kafka/kafka
./bin/kafka-console-consumer.sh --bootstrap-server 192.168.217.152:9092 --topic kafkatopic
Supervisor 2
cd /usr/local/kafka/kafka
./bin/kafka-console-consumer.sh --bootstrap-server 192.168.217.153:9092 --topic kafkatopic
可以看到,启动消费者后,两台Kafka消费者都收到了Storm集群发来的消息。至此,该功能已完成