重要备注:1、整个java代码编写使用到的工具是 IntelliJ IDEA
2、前提是kafka、zookeeper、storm集群环境已经能够使用
3、特别注意本地pom文件中版本号一定要与集群环境的版本号对应
代码编写:
首先是pom文件:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.daqsoft</groupId>
<artifactId>kafka-demo</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>kafka-demo</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- storm-core is provided by the cluster at runtime; keep "provided" when
     packaging for deployment, comment it out when running inside the IDE. -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.1.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.44</version>
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-kafka</artifactId>
<version>1.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.10.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.11</artifactId>
<version>0.10.1.0</version>
<!-- Exclude zookeeper/zkclient/slf4j bindings that clash with the
     versions storm-core already brings in (duplicate logger bindings
     would otherwise break the topology at runtime). -->
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>com.101tec</groupId>
<artifactId>zkclient</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>log4j-over-slf4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
</dependencies>
<!-- Packaging configuration -->
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<!-- Bind assembly to the package phase so "mvn package" actually
     produces the jar-with-dependencies; without this execution the
     plugin configuration is never run. -->
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<!-- Storm 1.1.0 requires at least Java 7 and its artifacts target
     Java 8; 1.6 would fail to link against the cluster libraries. -->
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
上面需要注意两点:
a. artifactId 值为 storm-core 的依赖中的 <scope>provided</scope> 配置:
打包部署到集群时需要保留这行配置(集群运行时已提供 storm-core);在本地 IDE 中直接运行时,需要将这行注释掉,否则会因缺少 storm-core 依赖而无法启动。
b.注意 配置中的 exclusions排除依赖的配置
创建4个类:
WordCountBoltCount
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import java.util.HashMap;
import java.util.Map;
/*
* 第二个bolt组件,用于接收前一个 bolt组件分词后的数据,然后进行单词的计数
*/
/*
 * Second bolt: receives (word, count) tuples from the splitting bolt and
 * keeps a running total per word in an in-memory map.
 *
 * NOTE(review): the map is per-task state; this is only correct because the
 * topology routes tuples with fieldsGrouping on "word", so every occurrence
 * of a given word reaches the same task.
 */
public class WordCountBoltCount extends BaseRichBolt {
    // Running word -> total count, accumulated across all tuples seen by this task.
    private Map<String, Integer> result = new HashMap<String, Integer>();
    private OutputCollector collector;

    @Override
    public void execute(Tuple tuple) {
        // Pull the word and its count out of the incoming tuple
        // (field names declared by WordCountSplitBolt).
        String word = tuple.getStringByField("word");
        int count = tuple.getIntegerByField("count");
        // Accumulate into the running total for this word.
        Integer total = result.get(word);
        if (total != null) {
            result.put(word, total + count);
        } else {
            result.put(word, count);
        }
        // Print the current totals (tutorial-style output).
        System.out.println("最后结果输出============" + result);
        // To forward the updated total downstream (e.g. for DB insertion), emit here:
        // this.collector.emit(tuple, new Values(word, result.get(word)));
        // Ack the tuple so the spout does not replay it after the message timeout;
        // without this, KafkaSpout would re-deliver tuples and inflate the counts.
        this.collector.ack(tuple);
    }

    @Override
    public void prepare(Map arg0, TopologyContext arg1, OutputCollector collector) {
        // Cache the collector for use in execute().
        this.collector = collector;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Declare the output schema, e.g. ("Beijing", 2).
        // Only used if the emit in execute() is re-enabled.
        declarer.declare(new Fields("word", "total"));
    }
}
WordCountSplitBolt
import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
/*
* 第一个Bolt组件:用于接收spout组件发送的数据,然后进行单词的拆分
*/
/*
 * First bolt: receives raw sentences from the spout and splits them into
 * individual words, emitting one (word, 1) tuple per word.
 */
public class WordCountSplitBolt extends BaseRichBolt {
    private OutputCollector collector;

    @Override
    public void execute(Tuple tuple) {
        // "str" is the field name used by KafkaSpout's StringScheme,
        // e.g. "I love Beijing".
        String sentence = tuple.getStringByField("str");
        System.out.println("获取到的数据为:================================" + sentence);
        if (sentence != null) {
            // Split on single spaces and emit one tuple per word.
            String[] words = sentence.split(" ");
            for (String word : words) {
                // Anchor each emitted tuple to the input tuple so downstream
                // failures are tracked back to the spout for replay.
                this.collector.emit(tuple, new Values(word, 1));
            }
        }
        // Ack the input so the spout does not replay it after the message timeout.
        this.collector.ack(tuple);
    }

    @Override
    public void prepare(Map arg0, TopologyContext arg1, OutputCollector collector) {
        // Initialize this bolt: cache the collector for use in execute().
        this.collector = collector;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Output schema for this component's tuples:
        //   I 1
        //   love 1
        //   Beijing 1
        declarer.declare(new Fields("word", "count"));
    }
}
WordCountSpout
import java.util.Map;
import java.util.Random;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
/*
* Spout组件是整个Topology的数据源,负责从外部接收数据
*/
/*
 * Spout: the data source of the topology. This test spout emits a random
 * sentence every 3 seconds, simulating external input.
 *
 * (In the topology it is currently replaced by KafkaSpout; this class is
 * kept for local testing without Kafka.)
 */
public class WordCountSpout extends BaseRichSpout {
    // Collector used to emit tuples to the downstream bolt.
    private SpoutOutputCollector collector;
    // Canned sentences used to simulate an external data source.
    private static String[] data = {"I love Beijing", "I love China", "Beijing is the capital of China"};

    @Override
    public void nextTuple() {
        // Throttle emission; Storm calls nextTuple() in a tight loop.
        Utils.sleep(3000);
        // Pick one of the three canned sentences at random.
        int random = (new Random()).nextInt(3);
        String sentence = data[random];
        // Send the sentence to the next component (the splitting bolt).
        this.collector.emit(new Values(sentence));
        System.out.println("Spout采集的数据是:" + sentence);
    }

    @Override
    public void open(Map arg0, TopologyContext arg1, SpoutOutputCollector collector) {
        // Initialize the spout: cache the collector for use in nextTuple().
        this.collector = collector;
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Declare the output field as "str" so this spout is interchangeable
        // with KafkaSpout/StringScheme: WordCountSplitBolt reads the sentence
        // via getStringByField("str"). The previous name "sentence" would
        // throw at runtime if this spout were wired into the topology.
        declarer.declare(new Fields("str"));
    }
}
WordCountTopology
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.kafka.BrokerHosts;
import org.apache.storm.kafka.KafkaSpout;
import org.apache.storm.kafka.SpoutConfig;
import org.apache.storm.kafka.StringScheme;
import org.apache.storm.kafka.ZkHosts;
import org.apache.storm.spout.SchemeAsMultiScheme;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
import java.util.Arrays;
/*
* 单词计数的topology的入口,主程序
*/
/*
 * Entry point of the word-count topology: wires KafkaSpout -> split bolt
 * -> count bolt and submits either to a real cluster (when args are given)
 * or to an in-process LocalCluster (for IDE testing).
 */
public class WordCountTopology {
    public static void main(String[] args) throws Exception {
        TopologyBuilder builder = new TopologyBuilder();
        // ZooKeeper ensemble used by Kafka (host:port).
        BrokerHosts zkHosts = new ZkHosts("192.168.2.100:2181");
        // Spout config: topic "kafkaTest", ZK root "/kafkaTest",
        // consumer id "kafka_test_storm1" (where offsets are stored in ZK).
        SpoutConfig spoutConfig = new SpoutConfig(zkHosts,
                "kafkaTest",
                "/kafkaTest",
                "kafka_test_storm1");
        // ZK server list (comma-separated hosts) and port for offset storage.
        spoutConfig.zkServers = Arrays.asList("192.168.2.100".split(","));
        spoutConfig.zkPort = 2181;
        // Deserialize Kafka messages as strings; downstream bolts can read the
        // value via getStringByField("str") or (String) getValue(0).
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        // Start from the earliest available offset when no offset is stored yet.
        spoutConfig.startOffsetTime = kafka.api.OffsetRequest.EarliestTime();
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
        // Alternative local data source (no Kafka needed):
        // builder.setSpout("wordcount_spout", new WordCountSpout());
        // Spout parallelism 3: should match the topic's partition count.
        builder.setSpout("wordcount_spout", kafkaSpout, 3);
        // First bolt: split sentences into words; shuffleGrouping balances load.
        builder.setBolt("wordcount_splitbolt", new WordCountSplitBolt())
                .shuffleGrouping("wordcount_spout");
        // Second bolt: count words; fieldsGrouping on "word" guarantees every
        // occurrence of a given word is routed to the same task.
        builder.setBolt("wordcount_countbolt", new WordCountBoltCount())
                .fieldsGrouping("wordcount_splitbolt", new Fields("word"));
        StormTopology wc = builder.createTopology();
        Config conf = new Config();
        if (args != null && args.length > 0) {
            // Cluster mode: args[0] is the topology name.
            conf.setNumWorkers(3);
            try {
                StormSubmitter.submitTopology(args[0], conf, wc);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else {
            // Local mode (IDE): worker count set to the topic partition count.
            conf.setNumWorkers(3);
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology("otaTopology", conf, wc);
        }
    }
}
接下来就是最坑的地方:打包
以下以截图方式说明:
对项目的Project Structure进行配置:
确定后,如下图
确认后,进行下一步打包操作:
点击后进入下图:
打包完成后,打完的包需要到项目配置的打包输出路径下查看:
jar 包通过 ftp 上传到服务器的目录下
然后进行服务器storm命令执行topology任务:
在 xshell 上,进入到 jar 包所在目录
执行命令:
storm-1.1.0/bin/storm jar kafkademo.jar com.daqsoft.kafka.WordCountTopology tests
对上面命令说明:
storm-1.1.0/bin/storm jar storm执行jar命令,注意 不是 -jar
kafkademo.jar 指的是上传的 jar 包名称
com.daqsoft.kafka.WordCountTopology 指定的是 需要执行main的类的路径
tests 指的是提交的 topology任务的名称