Runtime environment:
Three Ubuntu 16.04 machines
Storm 1.2.2
HBase 1.4.9
Kafka 2.1.1 (Scala 2.11 build)
Required Maven dependencies:
<dependencies>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.1.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.11.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>0.96.1-hadoop2</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<!-- Specify the class containing the main method -->
<mainClass>com.jinxLbj.MyTopology</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<!-- bind to the packaging phase -->
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
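With the assembly plugin bound to the package phase, a plain Maven build should produce the fat jar that is later submitted to Storm (the exact file name depends on your artifactId and version):
mvn clean package
The jar with all dependencies bundled ends up under target/ with a -jar-with-dependencies suffix.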
To do stream processing we first need a data source. Here we use Kafka as the message queue, simulating a continuous stream of incoming data.
1. Kafka
Producer (publishes data to the message queue indefinitely):
public class MyProducer {
public static void main(String[] args) throws InterruptedException {
Properties properties = new Properties();
//Kafka broker (bootstrap.servers) address and port
properties.put("bootstrap.servers", "10.96.129.36:9092");
properties.put("acks", "all");
properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
Producer<String, String> producer = new KafkaProducer<String, String>(properties);
Random random = new Random();
while (true) {
//Send a string of eleven random digits separated by spaces
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 11; i++) {
sb.append(random.nextInt(10)).append(" ");
}
producer.send(new ProducerRecord<String, String>("IntCount", sb.toString()));
Thread.sleep(1000);
}
}
}
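Before running the producer, the IntCount topic has to exist (unless automatic topic creation is enabled on the broker). A minimal sketch of creating it with the Kafka CLI, assuming a single-broker setup with ZooKeeper on localhost:2181:
bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic IntCount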
Consumer (reads data from the message queue and hands it to Storm):
public class MyConsumer {
private static KafkaConsumer<String, String> kafkaConsumer;
private static Thread myThread;
public static void init(final String topic) {
Properties properties = new Properties();
//Kafka broker (bootstrap.servers) address and port
properties.put("bootstrap.servers", "10.96.129.36:9092");
properties.put("group.id", "group-1");
properties.put("enable.auto.commit", "true");
properties.put("auto.commit.interval.ms", "1000");
//If a partition has a committed offset, consume from it; otherwise start consuming from the beginning
properties.put("auto.offset.reset", "earliest");
//Session timeout: 30 seconds
properties.put("session.timeout.ms", "30000");
//Key and value deserializers
properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
kafkaConsumer = new KafkaConsumer<String, String>(properties);
//Start a new thread that keeps pulling records from the queue in a loop
myThread = new Thread(new Runnable() {
public void run() {
//topic is the subscribed topic name
kafkaConsumer.subscribe(Arrays.asList(topic));
while (true) {
ConsumerRecords<String, String> records = kafkaConsumer.poll(100);
for (ConsumerRecord<String, String> record : records) {
MySpout.messages.push(record.value());
}
}
}
});
myThread.start();
}
public static void close() {
kafkaConsumer.close();
}
}
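To confirm that messages are actually flowing before wiring in Storm, the built-in console consumer can read the same topic; this is just a quick check, not part of the demo:
bin/kafka-console-consumer.sh --bootstrap-server 10.96.129.36:9092 --topic IntCount --from-beginning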
Next we declare where the processed data will be stored.
2. HBase
public class MyHBase {
private static HBaseAdmin admin = null;
private static Configuration configuration = null;
//Initialize the HBase connection
public static void init() throws IOException {
configuration = HBaseConfiguration.create();
//HDFS path where HBase stores its data
configuration.set("hbase.rootdir", "hdfs://master:9000/opt/hbase/hbase_db");
//ZooKeeper quorum for the HBase cluster
configuration.set("hbase.zookeeper.quorum", "10.96.129.36, 10.96.129.41, 10.96.129.43");
admin = new HBaseAdmin(configuration);
}
//Create a table
public static void createTable(String tableName, String[] strs) throws IOException {
if (!admin.tableExists(tableName)) {
HColumnDescriptor[] hcDes = new HColumnDescriptor[strs.length];
for (int i = 0; i < strs.length; i++) {
hcDes[i] = new HColumnDescriptor(strs[i]);
}
HTableDescriptor tblDes = new HTableDescriptor(TableName.valueOf(tableName));
for (HColumnDescriptor hc : hcDes) {
tblDes.addFamily(hc);
}
admin.createTable(tblDes);
System.out.println(tableName + " create successfully!");
} else {
System.out.println(tableName + " already existed!");
}
}
//Insert or update data
public static boolean put(String tableName, String rowkey, String columnFamily, String qualifier, String value) {
try {
HTable table = new HTable(configuration, tableName);
Put put = new Put(rowkey.getBytes());
put.add(columnFamily.getBytes(), qualifier.getBytes(), value.getBytes());
table.put(put);
System.out.println("put successfully! " + rowkey + "," + columnFamily + "," + qualifier + "," + value);
} catch (IOException e) {
e.printStackTrace();
return false;
}
return true;
}
// Query a single row
public static Result getResult(String tableName, String rowkey) {
System.out.println("get result. table=" + tableName + " rowkey=" + rowkey);
try {
HTable table = new HTable(configuration, tableName);
Get get = new Get(rowkey.getBytes());
return table.get(get);
} catch (IOException e) {
e.printStackTrace();
return null;
}
}
// Convert a Result into a Map for easier printing
private static Map<String, Object> result2Map(Result result) {
Map<String, Object> ret = new HashMap<String, Object>();
if (result != null && result.listCells() != null) {
for (Cell cell : result.listCells()) {
String key = Bytes.toString(CellUtil.cloneQualifier(cell));
String value = Bytes.toString(CellUtil.cloneValue(cell));
System.out.println(key + " => " + value);
ret.put(key, value);
}
}
return ret;
}
public static void scan(String tableName) {
System.out.println("scan table " + tableName);
try {
HTable table = new HTable(configuration, tableName);
Scan scan = new Scan();
ResultScanner rs = table.getScanner(scan);
Integer total = 0;
for (Result r : rs) {
String num = Bytes.toString(r.getValue(Bytes.toBytes("num"), Bytes.toBytes("num1")));
System.out.println(Bytes.toString(r.getRow()) + " -> " +
num);
total += Integer.valueOf(num);
}
System.out.println("total = " + total);
} catch (IOException e) {
e.printStackTrace();
}
}
// Drop a table
public static boolean delete(String tableName) {
System.out.println("delete table " + tableName);
try {
HBaseAdmin admin = new HBaseAdmin(configuration);
if (admin.tableExists(tableName)) {
admin.disableTable(tableName);
admin.deleteTable(tableName);
}
return true;
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
public static void main(String[] args) throws IOException {
init();
scan("IntCount");
}
}
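Note that the hbase-server 0.96.1 client in the pom predates the HBase 1.4.9 cluster listed at the top; the HBaseAdmin and HTable constructors used above still work but are deprecated in 1.x. If the dependency is bumped to a 1.x client, the same put can be written against the Connection API. A minimal sketch, assuming a 1.x client on the classpath (the method name and this usage are my addition, not part of the original demo):
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

//Hypothetical alternative to MyHBase.put using the non-deprecated HBase 1.x API;
//configuration is the same object initialized in MyHBase.init()
public static void putWithConnectionApi(String tableName, String rowkey,
        String columnFamily, String qualifier, String value) throws IOException {
    try (Connection connection = ConnectionFactory.createConnection(configuration);
         Table table = connection.getTable(TableName.valueOf(tableName))) {
        Put put = new Put(Bytes.toBytes(rowkey));
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(qualifier), Bytes.toBytes(value));
        table.put(put);
    }
}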
With the data source in place and the destination of the processed data declared, we can start working on the data itself. The Storm code follows.
3. Storm
MySpout (the source of the topology; it receives the data pulled from Kafka and emits it downstream for processing)
public class MySpout extends BaseRichSpout {
private static final long serialVersionUID = 225243592780939490L;
private SpoutOutputCollector collector;
private static final String field = "word";
private int count = 0;
//Message stack; the Kafka consumer pushes every record it receives onto this stack
public static Stack<String> messages = new Stack<String>();
//open() initializes the Kafka consumer and HBase so that processed results can be stored later
public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
this.collector = spoutOutputCollector;
//Initialize the consumer; IntCount is the topic name
MyConsumer.init("IntCount");
try {
//Initialize HBase (see the MyHBase class above)
MyHBase.init();
//Create a table named IntCount in HBase with the column family "num"
MyHBase.createTable("IntCount", new String[]{"num"});
} catch (IOException e) {
e.printStackTrace();
}
}
//Emit the data to the downstream bolt
public void nextTuple() {
if (!messages.empty()) {
String msg = messages.pop();
this.collector.emit(new Values(msg));
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
System.out.println("定义格式...");
//定义输出数据的格式
declarer.declare(new Fields(field));
}
@Override
public void fail(Object obj) {
System.out.println("失败:" + obj);
}
@Override
public void close() {
System.out.println("关闭...");
}
}
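One detail worth calling out: java.util.Stack is LIFO, so the spout emits the newest message first while older ones sink to the bottom, and the pushes come from the consumer thread while nextTuple() runs on the spout executor. Stack's individual operations are synchronized, but a concurrent FIFO queue is the more natural fit. A minimal sketch of the swap (an alternative I'm suggesting, not the original code):
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

//Thread-safe FIFO queue shared between the Kafka consumer thread and the spout
public static Queue<String> messages = new ConcurrentLinkedQueue<String>();

//In MyConsumer.run(): MySpout.messages.offer(record.value());

public void nextTuple() {
    String msg = messages.poll();   //returns null when the queue is empty
    if (msg != null) {
        this.collector.emit(new Values(msg));
    }
}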
MyBolt (receives data from MySpout and processes it)
public class MyBolt extends BaseRichBolt {
private static final long serialVersionUID = 4743224635827696343L;
private OutputCollector collector;
//MyBolt initialization
public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
this.collector = outputCollector;
}
//Core data processing
public void execute(Tuple tuple) {
//The spout declared a single field named word, so read that field from the tuple
//Its value is a string of digits separated by spaces
String msg = tuple.getStringByField("word");
//Split the digits
String[] words = msg.toLowerCase().split(" ");
//Emit each split value to the next bolt (MyBolt2) for further processing
for (String word : words) {
System.out.println("split " + word);
this.collector.emit(new Values(word));//emit to the next bolt
}
}
//Declare the output format for any downstream bolt
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("word"));
}
@Override
public void cleanup() {
System.out.println("资源释放");
}
}
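As written, the spout emits tuples without a message ID and neither bolt acks, so Storm will not replay anything on failure. If at-least-once processing is wanted, execute() can anchor its emits to the incoming tuple and ack it; a sketch under that assumption (for replay to actually happen, the spout would also need to emit with a message ID and handle ack/fail):
public void execute(Tuple tuple) {
    String msg = tuple.getStringByField("word");
    for (String word : msg.toLowerCase().split(" ")) {
        //anchor the emitted tuple to the input so a downstream failure triggers a replay
        this.collector.emit(tuple, new Values(word));
    }
    //acknowledge the input tuple once all child tuples have been emitted
    this.collector.ack(tuple);
}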
MyBolt2 (after the digits have been split, this bolt counts how many times each digit has appeared)
public class MyBolt2 extends BaseRichBolt {
private static final long serialVersionUID = 4743224635827696343L;
private OutputCollector collector;
//Initialization
public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
this.collector = outputCollector;
}
public void execute(Tuple tuple) {
//msg is a single digit already split out by MyBolt
String msg = tuple.getStringByField("word");
//Query the IntCount table in HBase using this digit as the row key
Result r = MyHBase.getResult("IntCount", msg);
assert r != null;
//Read how many times this digit has appeared so far
String numS = Bytes.toString(r.getValue(Bytes.toBytes("num"), Bytes.toBytes("num1")));
//If empty, treat this as the first occurrence and store 1 in HBase
if (numS == null || "".equals(numS)) {
MyHBase.put("IntCount", msg, "num", "num1", "1");
} else {
//Otherwise increment the count and write it back to HBase
Integer num = Integer.valueOf(numS);
MyHBase.put("IntCount", msg, "num", "num1", String.valueOf(num + 1));
}
}
//Declare the output format if there were further steps; this demo has none
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("word"));
}
@Override
public void cleanup() {
System.out.println("资源释放");
}
}
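The get-then-put above is not atomic; it only works here because fieldsGrouping plus a single task guarantees one writer per digit. HBase also has built-in atomic counters, which would sidestep the read entirely, with the caveat that counters are stored as 8-byte longs, so the scan method would need Bytes.toLong instead of Bytes.toString. A minimal sketch of such a helper (my addition, not part of the original MyHBase):
//Atomically adds 1 to the counter cell, creating it if it does not exist yet
public static void increment(String tableName, String rowkey) throws IOException {
    HTable table = new HTable(configuration, tableName);
    table.incrementColumnValue(Bytes.toBytes(rowkey), Bytes.toBytes("num"), Bytes.toBytes("num1"), 1L);
    table.close();
}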
MyTopology (wires the pieces above into a single processing flow)
public class MyTopology {
public static void main(String[] args) {
//Define a topology
TopologyBuilder builder = new TopologyBuilder();
//Set the spout; one executor (thread) by default
builder.setSpout("spout1", new MySpout());
//One executor (thread) and one task
//bolt1 reads from spout1
builder.setBolt("bolt1", new MyBolt(), 1).setNumTasks(1).shuffleGrouping("spout1");
//bolt2 reads from bolt1
builder.setBolt("bolt2", new MyBolt2(), 1).setNumTasks(1).fieldsGrouping("bolt1", new Fields("word"));
Config conf = new Config();
conf.put("test", "test");
try {
System.out.println("远程模式");
StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
} catch (Exception e) {
e.printStackTrace();
}
}
}
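For debugging on a single machine without a Storm cluster, the same topology can also run in-process via LocalCluster (run it from the IDE, since storm-core is marked provided). A minimal sketch, placed inside the existing try block in main in place of the StormSubmitter call:
import org.apache.storm.LocalCluster;

//Run the topology in-process for roughly one minute, then shut it down
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("IntCount", conf, builder.createTopology());
Thread.sleep(60000);
cluster.shutdown();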
4. Running
Package the project and upload it to a server where Storm is installed, then run:
storm jar <path to jar> <main class> <topology name>
Example: storm jar storm_study_1.jar com.jinxLbj.MyTopology IntCount
You should then see: Finished submitting topology: IntCount
5. Testing
Run the main method of the Kafka producer shown at the top.
Wait a few seconds.
Run the HBase scan method to check the results.
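Alternatively, the counts can be inspected directly from the HBase shell:
hbase shell
scan 'IntCount'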