1. ZooKeeper

import cn.qphone.spark.`trait`.LoggerTrait
import cn.qphone.spark.utils.{CommonUtils, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}

import scala.collection.{JavaConversions, mutable}

object Demo6_SparkStreaming_Kafka_Zookeeper extends LoggerTrait {

    // ZooKeeper client (Curator), rooted at the offsets namespace
    val client = {
        val client = CuratorFrameworkFactory.builder()
            .connectString("hbase1,hbase2,hbase3")
            .retryPolicy(new ExponentialBackoffRetry(1000, 3))
            .namespace("kafka/consumers/offsets")
            .build()
        client.start()
        client
    }

    def main(args: Array[String]): Unit = {
        //1. Entry point
        val ssc = SparkUtils.getLocalStreamingContext("Demo6_SparkStreaming_Kafka_Zookeeper", 2)
        val kafkaParams = CommonUtils.toMap("demo6.properties")
        val topics = "bjbigdata1909-1".split(",").toSet

        //2. Load the data
        val messages: InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)

        //3. Process each batch of messages
        messages.foreachRDD((rdd, btime) => {
            if (!rdd.isEmpty()) {
                //3.1 Print some batch information
                println("-" * 100)
                println(s"bTime = ${btime}")
                println("#" * 50 + " " + rdd.count())
                //3.2 Save the latest offsets back to ZooKeeper
                storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
            }
        })

        ssc.start()
        ssc.awaitTermination()
    }

    /**
     * Read the manually saved offsets from ZooKeeper, then start consuming from those positions in Kafka.
     * If no offsets are found, consume from the beginning.
     */
    def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
        //1. Read the offsets from ZooKeeper
        val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
        //2. Load the external data
        var messages: InputDStream[(String, String)] = null
        //2.1 Branch on whether offsets were found
        if (fromOffsets.isEmpty) { // no saved offsets: nothing was ever stored, so start from the beginning
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
        } else { // offsets found: start from the saved positions
            //2.2 Create the messageHandler
            val messageHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
            //2.3 Read starting from the saved offsets
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
        }
        messages
    }

    /**
     * Look up the offsets for the given topics and consumer group.
     * First, the layout under the ZooKeeper namespace is: /topic/group/partition.
     * Second, the data stored in each partition node is the offset itself.
     */
    def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
        //1. A mutable map to collect the offsets
        val offsets = mutable.Map[TopicAndPartition, Long]()
        //2. Iterate over the topics
        for (topic <- topics) {
            //2.1 The node holding this topic/group's partitions (ZooKeeper paths must start with "/")
            val path = s"/${topic}/${group}"
            //2.2 Make sure this path exists in ZooKeeper
            isExists(path)
            //2.3 Iterate over the partitions; the Java list must be converted to a Scala buffer
            for (partition <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) {
                //2.3.1 The node that stores this partition's offset
                val fullPath = s"${path}/${partition}"
                //2.3.2 Read the offset
                val offset = new String(client.getData.forPath(fullPath)).toLong
                //2.3.3 Collect it into offsets
                offsets.put(TopicAndPartition(topic, partition.toInt), offset)
            }
        }
        offsets.toMap
    }

    /**
     * Check whether a node exists; if not, create it (with any missing parents).
     */
    def isExists(path: String): Unit = {
        if (client.checkExists().forPath(path) == null) { // path does not exist
            client.create().creatingParentsIfNeeded().forPath(path)
        }
    }

    /**
     * Save the offsets back to ZooKeeper.
     */
    def storeOffsets(offsetRanges: Array[OffsetRange], group: String) = {
        //1. Iterate over the array of offset ranges
        for (offsetRange <- offsetRanges) {
            //2. Extract topic, partition, and end offset
            val topic = offsetRange.topic
            val partition = offsetRange.partition
            val untilOffset = offsetRange.untilOffset
            //3. The ZooKeeper node for this partition's offset
            val path = s"/${topic}/${group}/${partition}"
            isExists(path)
            //4. Write the offset to the partition node
            client.setData().forPath(path, untilOffset.toString.getBytes())
        }
    }
}
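To sanity-check what the job wrote, here is a minimal read-only sketch (not part of the original code) that lists the stored offsets with the same Curator setup. The ensemble, namespace, and topic are taken from the object above; the group id "mygroup" is a placeholder that must match your job's group.id:

import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import scala.collection.JavaConversions

object InspectZkOffsets {
    def main(args: Array[String]): Unit = {
        val client = CuratorFrameworkFactory.builder()
            .connectString("hbase1,hbase2,hbase3")
            .retryPolicy(new ExponentialBackoffRetry(1000, 3))
            .namespace("kafka/consumers/offsets")
            .build()
        client.start()
        // layout: /<topic>/<group>/<partition>, node data = offset
        val path = "/bjbigdata1909-1/mygroup" // "mygroup" is a placeholder group.id
        for (partition <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) {
            val offset = new String(client.getData.forPath(s"${path}/${partition}"))
            println(s"partition ${partition} -> offset ${offset}")
        }
        client.close()
    }
}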
2. Saving offsets to HBase
1. Utility class
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.BinaryComparator;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.RowFilter;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Client utility class for HBase connections
 */
public class HBaseUtils {

    // pool is the connection pool that holds the HBase Connection objects
    private static LinkedList<Connection> pool = new LinkedList<>();

    // Initialize the pool with 5 HBase connections
    static {
        try {
            // HBaseConfiguration.create() loads hbase-site.xml defaults before our overrides
            Configuration conf = HBaseConfiguration.create();
            conf.set("hbase.rootdir", "hdfs://hbase1:9000");
            conf.set("hbase.cluster.distributed", "true");
            conf.set("hbase.zookeeper.quorum", "hbase1,hbase2,hbase3");
            conf.set("hbase.regionserver.wal.codec", "org.apache.hadoop.hbase.regionserver.wal.IndexedWALEditCodec");
            for (int i = 0; i < 5; i++) {
                pool.push(ConnectionFactory.createConnection(conf));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Borrow a connection from the pool
    public static Connection getConnection() {
        while (pool.isEmpty()) {
            try {
                System.out.println("connection pool is empty, please wait for a moment~~~");
                Thread.sleep(1000);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return pool.poll();
    }

    // Release a connection, i.e. return it to the pool
    public static void release(Connection connection) {
        pool.push(connection);
    }

    // Read every column/value pair of the row rk: column qualifier = partition, cell value = offset
    public static Map<Integer, Long> getColValue(Connection connection, TableName tableName, byte[] rk, byte[] cf) {
        //1. Declare the map that holds the final result
        Map<Integer, Long> partition2Offset = new HashMap<>();
        try {
            //2. Get the table object
            Table table = connection.getTable(tableName);
            Scan scan = new Scan();
            //3. Filter: only the row whose key equals rk
            Filter filter = new RowFilter(CompareFilter.CompareOp.EQUAL, new BinaryComparator(rk));
            scan.setFilter(filter);
            //4. Create the scanner
            ResultScanner scanner = table.getScanner(scan);
            //5. Iterate
            for (Result result : scanner) {
                List<Cell> cells = result.listCells(); // every cell (k,v) of the row
                for (Cell cell : cells) {
                    // column qualifier = partition
                    byte[] column = CellUtil.cloneQualifier(cell);
                    // cell value = offset
                    byte[] values = CellUtil.cloneValue(cell);

                    int partition = Integer.valueOf(new String(column));
                    long offset = Long.valueOf(new String(values));

                    partition2Offset.put(partition, offset);
                }
            }
            return partition2Offset;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // Write one column/value pair (partition/offset) to HBase
    public static void set(Connection connection, TableName tableName, byte[] rk, byte[] cf, byte[] col, byte[] value) {
        try {
            Table table = connection.getTable(tableName);
            Put put = new Put(rk);
            put.addColumn(cf, col, value);
            table.put(put);
            table.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
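Before wiring this into the streaming job, a minimal round-trip sketch of the pool may help; it is not from the original post and assumes the table used in the next listing already exists (create 'spark-topic-offset', 'cf') and that the rowkey "bjbigdata1909-1-mygroup" (topic-group) is just an illustrative example:

import cn.qphone.hbase.utils.HBaseUtils
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.util.Bytes

object HBaseUtilsRoundTrip {
    def main(args: Array[String]): Unit = {
        val connection = HBaseUtils.getConnection
        val tableName = TableName.valueOf("spark-topic-offset")
        val rk = "bjbigdata1909-1-mygroup".getBytes() // rowkey: topic-group ("mygroup" is made up)
        val cf = Bytes.toBytes("cf")
        // write: partition 0 -> offset 42
        HBaseUtils.set(connection, tableName, rk, cf, "0".getBytes(), "42".getBytes())
        // read back: should print {0=42}
        println(HBaseUtils.getColValue(connection, tableName, rk, cf))
        HBaseUtils.release(connection)
    }
}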
2. Main code:
import cn.qphone.hbase.utils.HBaseUtils
import cn.qphone.spark.`trait`.LoggerTrait
import cn.qphone.spark.utils.{CommonUtils, SparkUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}

import scala.collection.{JavaConversions, mutable}

/**
 * 1. Use HBase to manage offsets manually, so the data is consumed in order:
 *    1. Offsets found: consume from the saved positions
 *    2. No offsets: consume from offset 0
 *
 * 2. Pull data from Kafka starting at those offsets
 * 3. Run the business logic on the pulled data
 * 4. Write the new offsets back to HBase
 *
 * create 'spark-topic-offset', 'cf'
 *
 * rowkey: topic-group
 * column: partition:offset
 * */
object Demo7_SparkStreaming_Kafka_HBase extends LoggerTrait {

    def main(args: Array[String]): Unit = {
        //1. Entry point
        val ssc = SparkUtils.getLocalStreamingContext("Demo7_SparkStreaming_Kafka_HBase", 2)
        val kafkaParams = CommonUtils.toMap("demo6.properties")
        val topics = "bjbigdata1909-1".split(",").toSet

        //2. Load the data
        val messages: InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)

        //3. Process each batch of messages
        messages.foreachRDD((rdd, btime) => {
            if (!rdd.isEmpty()) {
                //3.1 Print some batch information
                println("-" * 100)
                println(s"bTime = ${btime}")
                println("#" * 50 + " " + rdd.count())
                //3.2 Save the latest offsets back to HBase
                storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
            }
        })

        ssc.start()
        ssc.awaitTermination()
    }

    /**
     * Read the manually saved offsets from HBase, then start consuming from those positions in Kafka.
     * If no offsets are found, consume from the beginning.
     */
    def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
        //1. Read the offsets from HBase
        val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
        //2. Load the external data
        var messages: InputDStream[(String, String)] = null
        //2.1 Branch on whether offsets were found
        if (fromOffsets.isEmpty) { // no saved offsets: nothing was ever stored, so start from the beginning
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
        } else { // offsets found: start from the saved positions
            //2.2 Create the messageHandler
            val messageHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
            //2.3 Read starting from the saved offsets
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
        }
        messages
    }

    /**
     * Look up the offsets for the given topics and consumer group.
     * First, the offsets live in HBase under the rowkey topic-group;
     * second, each column qualifier is a partition and the cell value is its offset.
     */
    def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
        //1. A mutable map to collect the offsets
        val offsets = mutable.Map[TopicAndPartition, Long]()
        //1.1 Get an HBase connection
        val connection = HBaseUtils.getConnection
        val tableName = TableName.valueOf("spark-topic-offset")
        val cf = Bytes.toBytes("cf")

        //2. Iterate over the topics
        for (topic <- topics) {
            //2.1 Build the rowkey
            val rk = s"${topic}-${group}".getBytes()
            //2.2 Fetch this row's partitions and offsets (getColValue returns null on error)
            val partition2Offsets = HBaseUtils.getColValue(connection, tableName, rk, cf)
            if (partition2Offsets != null) {
                //2.3 Collect them; the Java map must be converted to a Scala map
                val partition2Offsets2 = JavaConversions.mapAsScalaMap(partition2Offsets)
                for ((k, v) <- partition2Offsets2) {
                    offsets.put(TopicAndPartition(topic, (k + "").toInt), v)
                }
            }
        }
        HBaseUtils.release(connection)
        offsets.toMap
    }

    /**
     * Save the offsets back to HBase.
     */
    def storeOffsets(offsetRanges: Array[OffsetRange], group: String) = {
        //0. Get an HBase connection plus the table coordinates
        val connection = HBaseUtils.getConnection
        val tableName = TableName.valueOf("spark-topic-offset")
        val cf = Bytes.toBytes("cf")
        //1. Iterate over the array of offset ranges
        for (offsetRange <- offsetRanges) {
            //2. Build the rowkey and extract partition and end offset
            val rk = s"${offsetRange.topic}-${group}".getBytes()
            val partition = offsetRange.partition
            val untilOffset = offsetRange.untilOffset

            //3. Write the result to HBase
            HBaseUtils.set(connection, tableName, rk, cf, (partition + "").getBytes(), (untilOffset + "").getBytes())
        }
        //4. Return the connection, otherwise the pool drains after 5 batches
        HBaseUtils.release(connection)
    }
}
3. Saving offsets to Redis
package sparkStrreaming.day3

import java.util

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import redis.clients.jedis.Jedis
import utils.CommonUtils

import scala.collection.{JavaConversions, mutable}

object Demo8_SparkStreaming_Kafka_Redis {

    val jedis = new Jedis("mini1", 6379)

    def main(args: Array[String]): Unit = {

        val ssc = new StreamingContext(new SparkConf().setMaster("local[*]").setAppName("redis"), Seconds(2))
        val kafkaParams: Map[String, String] = CommonUtils.toMap("demo6.properties")
        val topics = "test1".split("\\s+").toSet

        val messages: InputDStream[(String, String)] = createMsg(ssc, kafkaParams, topics)

        messages.foreachRDD((rdd, btime) => {
            if (!rdd.isEmpty()) {
                println("-" * 100)
                println(s"btime = ${btime}")
                println("#" * 50)
                println(rdd.count())
                updateOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"))
            }
        })

        ssc.start()
        ssc.awaitTermination()
    }

    def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
        val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))
        var messages: InputDStream[(String, String)] = null
        if (fromOffsets.isEmpty) {
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
        } else {
            val msgHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
            messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, msgHandler)
        }
        messages
    }

    def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
        val offsets = mutable.Map[TopicAndPartition, Long]()
        for (topic <- topics) {
            // one Redis hash per topic/group: field = partition, value = offset
            val key = s"${topic}_${group}"
            val fields: util.Map[String, String] = jedis.hgetAll(key)
            for (partition <- JavaConversions.mapAsScalaMap(fields)) {
                offsets.put(TopicAndPartition(topic, partition._1.toInt), partition._2.toLong)
            }
        }
        offsets.toMap
    }

    def updateOffsets(offsetRanges: Array[OffsetRange], group: String) = {
        for (offsetRange <- offsetRanges) {
            val topic = offsetRange.topic
            val partition = offsetRange.partition
            val offset = offsetRange.untilOffset

            jedis.hset(s"${topic}_${group}", partition.toString, offset.toString)
        }
    }
}
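Given this layout (one hash keyed "<topic>_<group>", fields = partitions, values = offsets), here is a minimal Jedis sketch, not from the original code, for inspecting what got saved; the host "mini1" matches the object above, while the group id "mygroup" is a placeholder:

import redis.clients.jedis.Jedis
import scala.collection.JavaConversions

object InspectRedisOffsets {
    def main(args: Array[String]): Unit = {
        val jedis = new Jedis("mini1", 6379)
        // hgetAll returns the whole partition -> offset mapping for this topic/group
        for ((partition, offset) <- JavaConversions.mapAsScalaMap(jedis.hgetAll("test1_mygroup"))) {
            println(s"partition ${partition} -> offset ${offset}")
        }
        jedis.close()
    }
}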