Preface
0. Enough talk; straight to the code.
1. A custom Kafka producer continuously sends simulated data to Kafka;
2. Spark Streaming pulls the data from Kafka in Direct mode, processes it, and writes the results to HBase.
I. Dependencies (make sure the hbase-client version matches your HBase)
<!-- Repository locations: the aliyun and cloudera repositories -->
<repositories>
<repository>
<id>aliyun</id>
<url>http://maven.aliyun.com/nexus/content/groups/public/</url>
</repository>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.11.8</scala.version>
<scala.compat.version>2.11</scala.compat.version>
<hadoop.version>2.7.4</hadoop.version>
<spark.version>2.2.0</spark.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.4</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.0-cdh5.14.0</version>
</dependency>
</dependencies>
II. Producer
package kafka_streaming_hbase
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
/*
* @Author Yang
* @Date 13:56 2020/10/10
* Send simulated data to the specified Kafka topic
*/
object ProducerTest {
def main(args: Array[String]): Unit = {
//Kafka configuration
val topic = "MyTest"
val prop = new Properties()
prop.put("bootstrap.servers", "node01:9092")
prop.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
prop.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
//Create the Kafka producer
val producer = new KafkaProducer[String, String](prop)
val content: Array[String] = new Array[String](5)
content(0) = "apache hadoop hive"
content(1) = "sqoop flume spark"
content(2) = "hello world flink"
content(3) = "hive flume sqoop"
content(4) = "hbase spark kafka"
//Send a randomly chosen line to the topic every 2 seconds
while (true) {
val i: Int = (math.random * 5).toInt
producer.send(new ProducerRecord[String, String](topic, content(i)))
println(content(i))
Thread.sleep(2000)
}
}
}
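The loop above runs until the process is killed. Optionally, a shutdown hook can flush and close the producer so buffered records are not lost on exit; a small sketch, meant to be placed right after the producer is created:

// Optional sketch: flush and close the producer cleanly on JVM exit (e.g. Ctrl+C)
sys.addShutdownHook {
  producer.flush()
  producer.close()
}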
III. Consumer (automatic offset commit)
package kafka_streaming_hbase
import java.util.UUID
import kafka_streaming_hbase.utils.HbaseUtils
import org.apache.hadoop.hbase.client.Put
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/*
* @Author Yang
* @Date 18:22 2020/10/13
* Spark Streaming pulls data from Kafka, processes it, and writes the results to HBase
*/
object Streaming2HBase {
def main(args: Array[String]): Unit = {
//In a real project these values would be passed in as command-line arguments
/*if (args.length != 4) {
println("Please provide arguments: <1><2><3><4>")
System.exit(1)
}*/
//1. Set up the Streaming context
val conf: SparkConf = new SparkConf().setAppName("Kafka_Streaming_HBase").setMaster("local[*]")
val sc: SparkContext = new SparkContext(conf)
sc.setLogLevel("error")
val ssc = new StreamingContext(sc, Seconds(4))
//2. Kafka parameters
val kafkaParams: Map[String, Object] = Map[String, Object](
"bootstrap.servers" -> "node01:9092", //Kafka broker list
"key.deserializer" -> classOf[StringDeserializer], //key deserializer
"value.deserializer" -> classOf[StringDeserializer], //value deserializer
"group.id" -> "spark", //consumer group id
"auto.offset.reset" -> "latest", //where to start when no committed offset exists
"enable.auto.commit" -> (true: java.lang.Boolean) //auto-commit offsets to Kafka's internal __consumer_offsets topic
)
val topics: Array[String] = Array("MyTest") //topics to subscribe to
//3. Create the direct Kafka stream
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(topics, kafkaParams)
)
//4. Word count on each batch
val word_count: DStream[(String, Int)] = kafkaDStream.flatMap(_.value().split("\\s+")).map((_, 1)).reduceByKey(_ + _)
//5. Prepend a random UUID as the row key (in real projects the row key should be designed around the query pattern)
val input: DStream[(String, String, Int)] = word_count.map(word_one => (UUID.randomUUID().toString, word_one._1, word_one._2))
//6. Write the results to HBase
input.foreachRDD(rdd => {
rdd.foreach(x => {
val put = new Put(x._1.getBytes())
put.addColumn("info".getBytes(), "word".getBytes(), x._2.getBytes())
put.addColumn("info".getBytes(), "count".getBytes(), x._3.toString.getBytes())
HbaseUtils.put("MyTest", put)
})
})
//7. Start the application and wait for termination
ssc.start()
ssc.awaitTermination()
}
}
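Step 6 above issues one HBase put per record. If throughput becomes a concern, the writes can be batched per partition instead; a sketch using the same HbaseUtils helper and the `input` DStream from step 5:

// Sketch: collect the Puts of each partition and write them in one batched call
import scala.collection.JavaConverters._

input.foreachRDD(rdd => {
  rdd.foreachPartition(iter => {
    val puts = iter.map { case (rowKey, word, count) =>
      val put = new Put(rowKey.getBytes())
      put.addColumn("info".getBytes(), "word".getBytes(), word.getBytes())
      put.addColumn("info".getBytes(), "count".getBytes(), count.toString.getBytes())
      put
    }.toList
    if (puts.nonEmpty) HbaseUtils.put("MyTest", puts.asJava) //one write per partition
  })
})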
IV. Consumer (manual offset commit)
package kafka_streaming_hbase
import java.util.UUID
import kafka_streaming_hbase.utils.HbaseUtils
import org.apache.hadoop.hbase.client.Put
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/*
* @Author Yang
* @Date 18:22 2020/10/13
* Spark Streaming pulls data from Kafka, processes it, and writes the results to HBase
*/
object Streaming2HBase2 {
def main(args: Array[String]): Unit = {
//In a real project these values would be passed in as command-line arguments
/*if (args.length != 4) {
println("Please provide arguments: <1><2><3><4>")
System.exit(1)
}*/
//1. Set up the Streaming context
val conf: SparkConf = new SparkConf().setAppName("Kafka_Streaming_HBase").setMaster("local[*]")
val sc: SparkContext = new SparkContext(conf)
sc.setLogLevel("error")
val ssc = new StreamingContext(sc, Seconds(4))
//2. Kafka parameters
val kafkaParams: Map[String, Object] = Map[String, Object](
"bootstrap.servers" -> "node01:9092", //Kafka broker list
"key.deserializer" -> classOf[StringDeserializer], //key deserializer
"value.deserializer" -> classOf[StringDeserializer], //value deserializer
"group.id" -> "spark", //consumer group id
"auto.offset.reset" -> "latest", //where to start when no committed offset exists
"enable.auto.commit" -> (false: java.lang.Boolean) //disable auto-commit; offsets are committed manually after each batch
//"auto.commit.interval.ms" -> "1000" //auto-commit interval (only relevant when auto-commit is enabled)
)
val topics: Array[String] = Array("MyTest") //topics to subscribe to
//3. Create the direct Kafka stream
val kafkaDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe(topics, kafkaParams)
)
//4. Word count on each batch and write the results to HBase
kafkaDStream.foreachRDD(rdd => {
if (rdd.count() > 0) {
val word_count: RDD[(String, Int)] = rdd.flatMap(_.value().split("\\s+"))
.map((_, 1))
.reduceByKey(_ + _)
val result: RDD[(String, String, Int)] = word_count.map(x => (UUID.randomUUID().toString, x._1, x._2))
result.foreach(x => {
val put = new Put(x._1.getBytes())
put.addColumn("info".getBytes(), "word".getBytes(), x._2.getBytes())
put.addColumn("info".getBytes(), "word".getBytes(), x._3.toString.getBytes())
HbaseUtils.put("MyTest", put)
})
//4.1 Read the offset ranges of this batch
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//4.2 Commit the offsets of this batch back to Kafka
kafkaDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
})
//5. Start the application and wait for termination
ssc.start()
ssc.awaitTermination()
}
}
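Note that commitAsync only queues the offsets; they are actually sent to Kafka during a later batch, and failures are silent. If that needs to be visible, step 4.2 can use the overload that takes a callback; a sketch assuming the `kafkaDStream` and `offsetRanges` variables from the listing above:

// Sketch: commit the batch offsets and log whether the commit succeeded
import java.util.{Map => JMap}
import org.apache.kafka.clients.consumer.{OffsetAndMetadata, OffsetCommitCallback}
import org.apache.kafka.common.TopicPartition

kafkaDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges, new OffsetCommitCallback {
  override def onComplete(offsets: JMap[TopicPartition, OffsetAndMetadata], e: Exception): Unit = {
    if (e != null) println(s"offset commit failed: ${e.getMessage}")
    else offsetRanges.foreach(o => println(s"committed ${o.topic}-${o.partition} up to offset ${o.untilOffset}"))
  }
})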
V. Appendix: HBase utility class
package kafka_streaming_hbase.utils;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* HBase utility class
*
* @author yang
* @version 1.0.0
* @createDate 2019-05-03
*
*/
public class HbaseUtils {
// ===============Common=====================================
// An HBase Connection is heavyweight and thread-safe; share one per JVM instead of creating a new one for every call
private static Connection connection;
private static synchronized Connection getConnection() throws IOException {
if (connection == null) {
connection = ConnectionFactory.createConnection();
}
return connection;
}
/**
* Get a Table object by table name
*
* @param name table name, with an optional namespace, e.g. "default:user"
* @return HBase Table object
* @throws IOException propagated to the caller
*/
public static Table getTable(String name) throws IOException {
TableName tableName = TableName.valueOf(name);
return getConnection().getTable(tableName);
}
// =============== Put =====================================
/**
* Create a Put object for the given row key
*
* @param rowKey row key
* @return Put object
*/
public static Put createPut(String rowKey) {
return new Put(Bytes.toBytes(rowKey));
}
/**
* Add a Cell to a Put object
*
* @param put Put object
* @param cell Cell object
* @throws IOException propagated to the caller
*/
public static void addCellOnPut(Put put, Cell cell) throws IOException {
put.add(cell);
}
/**
* Add a value to a Put object
*
* @param put Put object
* @param family column family
* @param qualifier column qualifier
* @param value value as a String
*/
public static void addValueOnPut(Put put, String family, String qualifier, String value) {
addValueOnPut(put, family, qualifier, Bytes.toBytes(value));
}
/**
* Add a value to a Put object
*
* @param put Put object
* @param family column family
* @param qualifier column qualifier
* @param value value as a byte array (any serialized object)
*/
public static void addValueOnPut(Put put, String family, String qualifier, byte[] value) {
put.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier), value);
}
/**
* Add a value to a Put object
*
* @param put Put object
* @param family column family
* @param qualifier column qualifier
* @param ts timestamp
* @param value value as a String
*/
public static void addValueOnPut(Put put, String family, String qualifier, long ts, String value) {
addValueOnPut(put, family, qualifier, ts, Bytes.toBytes(value));
}
/**
* Add a value to a Put object
*
* @param put Put object
* @param family column family
* @param qualifier column qualifier
* @param ts timestamp
* @param value value as a byte array (any serialized object)
*/
public static void addValueOnPut(Put put, String family, String qualifier, long ts, byte[] value) {
put.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier), ts, value);
}
/**
* Insert the data of one Put object into the named table
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param put Put object to insert
* @throws IOException propagated to the caller
*/
public static void put(String tableName, Put put) throws IOException {
try (
Table table = getTable(tableName);
) {
table.put(put);
}
}
/**
* Batch-insert a list of Put objects into the named table
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param puts Put objects to insert
* @throws IOException propagated to the caller
*/
public static void put(String tableName, List<Put> puts) throws IOException {
try (
Table table = getTable(tableName);
) {
table.put(puts);
}
}
// =============== Get =====================================
/**
* Create a Get object for the given row key
*
* @param rowKey row key
* @return Get object
*/
public static Get createGet(String rowKey) {
return new Get(Bytes.toBytes(rowKey));
}
/**
* Restrict a Get object to the given column family
*
* @param get Get object
* @param family column family
*/
public static void addFamilyOnGet(Get get, String family) {
get.addFamily(Bytes.toBytes(family));
}
/**
* Restrict a Get object to the given column family and qualifier
*
* @param get Get object
* @param family column family
* @param qualifier column qualifier
*/
public static void addColumnOnGet(Get get, String family, String qualifier) {
get.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
}
/**
* Query a row by table name and row key (all families and columns)
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param rowKey row key to query
* @return query result
* @throws IOException propagated to the caller
*/
public static Result get(String tableName, String rowKey) throws IOException {
Get get = createGet(rowKey);
return get(tableName, get);
}
/**
* Batch-query rows by table name and an array of row keys (all families and columns)
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param rowKeys row keys to query
* @return array of query results
* @throws IOException propagated to the caller
*/
public static Result[] get(String tableName, String[] rowKeys) throws IOException {
List<Get> gets = new ArrayList<Get>();
for (String rowKey : rowKeys) {
gets.add(createGet(rowKey));
}
return get(tableName, gets);
}
/**
* Query a row by table name and Get object
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param get HBase Get object
* @return query result
* @throws IOException propagated to the caller
*/
public static Result get(String tableName, Get get) throws IOException {
try (
Table table = getTable(tableName);
) {
return table.get(get);
}
}
/**
* Batch-query rows by table name and a list of Get objects
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param gets HBase Get objects
* @return array of query results
* @throws IOException propagated to the caller
*/
public static Result[] get(String tableName, List<Get> gets) throws IOException {
try (
Table table = getTable(tableName);
) {
return table.get(gets);
}
}
// =============== Scan =====================================
/**
* Create a Scan object from a start row and a stop row
*
* @param startRow first row of the scan (inclusive)
* @param stopRow stop row of the scan (exclusive)
* @return Scan object
*/
/*public static Scan createScan(String startRow, String stopRow) {
Scan scan = new Scan();
scan.withStartRow(Bytes.toBytes(startRow));
scan.withStopRow(Bytes.toBytes(stopRow));
return scan;
}*/
/**
* Restrict a Scan object to the given column family
*
* @param scan Scan object
* @param family column family
*/
public static void addFamilyOnScan(Scan scan, String family) {
scan.addFamily(Bytes.toBytes(family));
}
/**
* Restrict a Scan object to the given column family and qualifier
*
* @param scan Scan object
* @param family column family
* @param qualifier column qualifier within the family
*/
public static void addColumnOnScan(Scan scan, String family, String qualifier) {
scan.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
}
/**
* Scan data by table name and Scan object
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param scan Scan object
* @return ResultScanner over the scan results
* @throws IOException propagated to the caller
*/
public static ResultScanner scan(String tableName, Scan scan) throws IOException {
try (
Table table = getTable(tableName);
) {
return table.getScanner(scan);
}
}
/**
* Scan data by table name, start row and stop row (half-open interval [startRow, stopRow))
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param startRow first row of the scan
* @param stopRow stop row of the scan
* @return ResultScanner over the scan results
* @throws IOException propagated to the caller
*/
/*public static ResultScanner scan(String tableName, String startRow, String stopRow) throws IOException {
return scan(tableName, createScan(startRow, stopRow));
}*/
// =============== Delete =====================================
/**
* Create a Delete object for the given row key
*
* @param rowKey row key
* @return Delete object
*/
public static Delete createDelete(String rowKey) {
return new Delete(Bytes.toBytes(rowKey));
}
/**
* Add a Cell to a Delete object
*
* @param delete Delete object
* @param cell Cell object
* @throws IOException propagated to the caller
*/
/*public static void addCellOnDelete(Delete delete, Cell cell) throws IOException {
delete.add(cell);
}*/
/**
* Restrict a Delete object to the given column family
*
* @param delete Delete object
* @param family column family
*/
public static void addFamilyOnDelete(Delete delete, String family) {
delete.addFamily(Bytes.toBytes(family));
}
/**
* Restrict a Delete object to the given column family and qualifier
*
* @param delete Delete object
* @param family column family
* @param qualifier column qualifier
*/
public static void addColumnOnDelete(Delete delete, String family, String qualifier) {
delete.addColumn(Bytes.toBytes(family), Bytes.toBytes(qualifier));
}
/**
* Delete the data specified by one Delete object from the named table
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param delete Delete object
* @throws IOException propagated to the caller
*/
public static void delete(String tableName, Delete delete) throws IOException {
try (
Table table = getTable(tableName);
) {
table.delete(delete);
}
}
/**
* Batch-delete the data specified by a list of Delete objects from the named table
*
* @param tableName table name, with an optional namespace, e.g. "default:user"
* @param deletes Delete objects
* @throws IOException propagated to the caller
*/
public static void delete(String tableName, List<Delete> deletes) throws IOException {
try (
Table table = getTable(tableName);
) {
table.delete(deletes);
}
}
}
Notes:
1. Create the HBase table first:
create 'MyTest','info'
2. Make sure the hbase-client dependency matches your HBase version; a mismatch causes deserialization errors.
3. Start the services in order: ZooKeeper -> HDFS -> Kafka -> HBase; use the HBase shell to manage and inspect HBase.
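Besides the HBase shell, the written data can also be checked programmatically. A minimal verification sketch, assuming hbase-site.xml is on the classpath (ConnectionFactory.createConnection(), used by HbaseUtils, reads the ZooKeeper quorum from it):

// Sketch: scan the MyTest table and print every word together with its count
import kafka_streaming_hbase.utils.HbaseUtils
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object ScanMyTest {
  def main(args: Array[String]): Unit = {
    val table = HbaseUtils.getTable("MyTest")
    val scanner = table.getScanner(new Scan())
    try {
      scanner.asScala.foreach { result =>
        val word = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("word")))
        val count = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("count")))
        println(s"$word -> $count")
      }
    } finally {
      scanner.close()
      table.close()
    }
  }
}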