![](https://i-blog.csdnimg.cn/blog_migrate/b9be61de25d622240fa85d5a6e83d4cf.png)
![](https://i-blog.csdnimg.cn/blog_migrate/a44c2238aaf4ba36d05f2bf260303025.png)
![](https://i-blog.csdnimg.cn/blog_migrate/7fecb36c516bae205be5e7c1705c5668.png)
![](https://i-blog.csdnimg.cn/blog_migrate/b161549c2c383a01c48680ac5f9b15e9.png)
![](https://i-blog.csdnimg.cn/blog_migrate/a656280f4f1afc948fae53d6de81f35c.png)
![](https://i-blog.csdnimg.cn/blog_migrate/17f0062fa4e4b7a374cd313b0a68bd36.png)
![](https://i-blog.csdnimg.cn/blog_migrate/47e50adfa43a5c40abad8212d06ac0f6.png)
![](https://i-blog.csdnimg.cn/blog_migrate/b014cbc0b8cf424cdcb27c78dd92ecba.png)
![](https://i-blog.csdnimg.cn/blog_migrate/570a647a2e480c4decc95e364aa256f1.png)
![](https://i-blog.csdnimg.cn/blog_migrate/23fbda3488ed497b0bbc12c253dcbc7d.png)
![](https://i-blog.csdnimg.cn/blog_migrate/26ce6a474031dc598d7332ba2db21ca1.png)
![](https://i-blog.csdnimg.cn/blog_migrate/b19aececa753ce9b037b1bb81516ef00.png)
![](https://i-blog.csdnimg.cn/blog_migrate/e409b2b4d7616cf4b7c86fb8de055ea4.png)
![](https://i-blog.csdnimg.cn/blog_migrate/785916e8896f2a6f7a417e8efb2bdf4b.png)
![](https://i-blog.csdnimg.cn/blog_migrate/46adc8bc40ee80b830a25fea163d5bec.png)
![](https://i-blog.csdnimg.cn/blog_migrate/71dc4438132bb3b7ba7f1497eabbf57b.png)
![](https://i-blog.csdnimg.cn/blog_migrate/decc850973a9f3f9b5d7384a8dc4ac54.png)
![](https://i-blog.csdnimg.cn/blog_migrate/2b6de69a5099a08be5dc1e66b3b0e06a.png)
![](https://i-blog.csdnimg.cn/blog_migrate/7ab3cf28466866a0f06de47893efd2ad.png)
![](https://i-blog.csdnimg.cn/blog_migrate/171c60b78af293fc83ea52f7c4ff6b45.png)
![](https://i-blog.csdnimg.cn/blog_migrate/567ed401b78c92ece235f747627f1bdb.png)
![](https://i-blog.csdnimg.cn/blog_migrate/60009c5bdfb572ec5305b859a90a9313.png)
![](https://i-blog.csdnimg.cn/blog_migrate/8794ba34f2d5aae38f70c4792078b6b3.png)
![](https://i-blog.csdnimg.cn/blog_migrate/2408e0b35835b1e72ce30dbc8e408b65.png)
![](https://i-blog.csdnimg.cn/blog_migrate/832ad05f4a722422df1d173a5792ebc7.png)
![](https://i-blog.csdnimg.cn/blog_migrate/c8103abf427514b7900d3c36d0d80776.png)
![](https://i-blog.csdnimg.cn/blog_migrate/6f070a47e79654481bbbf40862b35ad5.png)
![](https://i-blog.csdnimg.cn/blog_migrate/2cbd9af8e9f6653b26eb66d720b75a77.png)
![](https://i-blog.csdnimg.cn/blog_migrate/0c2d7421fac1627b44cd01f64b5dda01.png)
![](https://i-blog.csdnimg.cn/blog_migrate/1b78efc914ca14800c8f9fb09f607c6c.png)
![](https://i-blog.csdnimg.cn/blog_migrate/0f7af7a30e6ce45247ad4f49cd5d79ef.png)
![](https://i-blog.csdnimg.cn/blog_migrate/ea1ff87dd7673eb66b2d48babc2a3a4e.png)
![](https://i-blog.csdnimg.cn/blog_migrate/12c2d1bf2f5131fa6376dc464d9f7d2f.png)
![](https://i-blog.csdnimg.cn/blog_migrate/e379de3f34dcdfc64fc3e5b01c00bca9.png)
![](https://i-blog.csdnimg.cn/blog_migrate/a7caa4918215208299a93224938b4322.png)
![](https://i-blog.csdnimg.cn/blog_migrate/96b488a29a5069962ad82752c6583f32.png)
![](https://i-blog.csdnimg.cn/blog_migrate/19b52a3a93c7fb86c7fbdaf2e8900e8a.png)
![](https://i-blog.csdnimg.cn/blog_migrate/7975f25bdfff8df54c022ca409fbcf5f.png)
Sample code:
package com.aura.bigdata.spark.streaming.p2
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Receiver-based integration of Spark Streaming with Kafka.
* Topic used in this example:
* [bigdata@bigdata01 kafka]$ bin/kafka-topics.sh --create --topic t-1808-1 --zookeeper bigdata01:2181/kafka --partitions 3 --replication-factor 3
* Created topic "t-1808-1".
* Once WAL and checkpointing are enabled the received data can be persisted, but the program
* still starts consuming from the very beginning on every restart, so the same data is consumed
* more than once. To resume from the offset of the previous run we cannot simply build a brand
* new StreamingContext each time (a fresh context holds none of the previous metadata, so it
* always starts from 0); instead, on every startup we must recover the StreamingContext of the
* previous (failed) run, which still carries the metadata of the last job.
* This way of structuring the code is called Spark Streaming Driver HA.
* The coding style is slightly different from before.
* It is rarely used in real work because it cannot manage offsets.
*/
object _01SparkStreamingIntegerationWithKafkaReceiverOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
if(args == null || args.length < 2) {
println(
"""Parameter Errors! Usage: <batchInterval> <checkpoint>
|batchInterval: batchInterval
|checkpoint : checkpoint
""".stripMargin)
System.exit(-1)
}
val Array(batchInterval, checkpoint) = args
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("IntegerationWithKafkaReceiver")
.set("spark.streaming.receiver.writeAheadLog.enable",
"true")//开启高可用之后,必须存储数据
val sc = new SparkContext(conf)
// val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
// ssc.checkpoint("file:///E:/data/spark/streaming/chk-1")
// val message = createMessage(ssc)
// val retDS = message.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_+_)
// retDS.print
/**
* All of the former business logic now has to be completed inside this function.
* @return
*/
def createFunc():StreamingContext = {
val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
ssc.checkpoint(checkpoint)
val message = createMessage(ssc)
message.foreachRDD((rdd, bTime) => {
if(!rdd.isEmpty()) {
println("-------------------------------------------")
println(s"Time: $bTime")
println("###############rdd's count: " + rdd.count())
println("-------------------------------------------")
}
})
ssc
}
val ssc = StreamingContext.getOrCreate(checkpoint, createFunc _)
ssc.start()
ssc.awaitTermination()
}
private def createMessage(ssc: StreamingContext):DStream[String] = {
/**
* Create an input stream that pulls messages from Kafka Brokers.
*
* ssc           StreamingContext object
* kafkaParams   Map of Kafka configuration parameters (see
*               http://kafka.apache.org/08/configuration.html ---> consumer),
*               e.g. "auto.offset.reset" -> "largest" | "smallest"
* topics        Map of (topic_name -> numPartitions) to consume. Each partition is consumed
*               in its own thread.
* storageLevel  persistence level
* Type parameters (generics):
* K  type of Kafka message key
* V  type of Kafka message value
* U  type of Kafka message key decoder
* T  type of Kafka message value decoder
* @return DStream of (Kafka message key, Kafka message value)
*/
val kafkaParams = Map[String, String](
"group.id" -> "nb-1808-group",
"zookeeper.connect" ->
"bigdata01:2181,bigdata03:2181,bigdata02:2181/kafka",
"auto.offset.reset" -> "smallest"
)
val topics = Map(
"t-1808-1" -> 3
)
val kafkaStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream[String, String, StringDecoder, StringDecoder](ssc,
kafkaParams,
topics,
StorageLevel.MEMORY_AND_DISK_SER
)
kafkaStream.map(_._2)
}
}
The previous approach can lose data, so the following one is the one commonly used:
package com.aura.bigdata.spark.streaming.p2
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* Direct-based integration of Spark Streaming with Kafka.
* The Direct + checkpoint approach guarantees that data is processed exactly once (not one record
* more, not one record less), but it is rarely used in production: it produces a large number of
* small files on HDFS, and writing straight to disk makes it relatively inefficient.
* In practice the consumed offsets are usually managed in ZooKeeper or HBase instead.
* "auto.offset.reset" -> "smallest" must be used in production;
* if it is set to largest, historical data cannot be read.
*/
object _02SparkStreamingIntegerationWithKafkaDirectOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
if(args == null || args.length < 2) {
println(
"""Parameter Errors! Usage: <batchInterval> <checkpoint>
|batchInterval: batchInterval
|checkpoint: checkpoint
""".stripMargin)
System.exit(-1)
}
val Array(batchInterval, checkpoint) = args
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("IntegerationWithKafkaDirect")
val sc = new SparkContext(conf)
def createFunc():StreamingContext = {
val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
ssc.checkpoint(checkpoint)
val message = createMessage(ssc)
message.foreachRDD((rdd, bTime) => {
if(!rdd.isEmpty()) {
println("-------------------------------")
println(s"Time: $bTime")
println("#########rdd's count: " + rdd.count())
println("-------------------------------")
}
})
ssc
}
val ssc = StreamingContext.getActiveOrCreate(checkpoint, createFunc _)
ssc.start()
ssc.awaitTermination()
}
private def createMessage(ssc: StreamingContext): InputDStream[(String, String)] = {
val topics = "t-1808-1".split(",").toSet
val kafkaParams = Map[String, String](
"bootstrap.servers" -> "bigdata01:9092,bigdata02:9092,bigdata03:9092",
"auto.offset.reset" -> "smallest" // must be smallest; with largest, historical data would be lost
)
val kafkaStream: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc,
kafkaParams,
topics
)
kafkaStream
}
}
![](https://i-blog.csdnimg.cn/blog_migrate/af2d5c4e6f2f92eda1caafedda79b7ee.png)
![](https://i-blog.csdnimg.cn/blog_migrate/b0a98c676632bf970e43ddb3d7bc7c40.png)
![](https://i-blog.csdnimg.cn/blog_migrate/2c2e34c436033cfe63256c4e640b317f.png)
![](https://i-blog.csdnimg.cn/blog_migrate/8cbef0a9531050d8b170df555348d1a6.png)
![](https://i-blog.csdnimg.cn/blog_migrate/cf4d223765902c0d2db9760de999a927.png)
![](https://i-blog.csdnimg.cn/blog_migrate/307c5aa7a3a40c731e5a3f8e60b1eaaa.png)
![](https://i-blog.csdnimg.cn/blog_migrate/337ba910a2e38fa9d592a1e50926faa3.png)
Managing offsets in ZooKeeper:
package com.aura.bigdata.spark.streaming.p2
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.{JavaConversions, mutable}
/**
* All of these are based on the Direct API (only this approach allows offsets to be managed by hand).
* Here ZooKeeper is used to manage the Kafka consumer offsets manually, which lets us drop the checkpoint entirely.
* To operate ZooKeeper we use Apache Curator (a wrapper around the raw ZK API, much like a controller in SpringMVC wraps a Servlet).
* With the ZK API we can connect to ZooKeeper and use it to store the offset of every partition of a topic:
* when reading, fetch the offsets from the corresponding znodes and read the data after those offsets from Kafka;
* once the batch has been consumed, write the latest offsets back to ZooKeeper.
*/
object _03SparkStreamingKafkaOffsetZookeeperOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
if(args == null || args.length < 3) {
println(
"""Parameter Errors! Usage: <batchInterval> <topic> <group>
|batchInterval: batchInterval
|topic : topic
|group : group
""".stripMargin)
System.exit(-1)
}
val Array(batchInterval, topic, group) = args
val spark = SparkSession.builder()
.appName("KafkaOffsetZookeeper")
.master("local[*]")
.getOrCreate()
val kafkaParams = Map[String, String](
"bootstrap.servers" ->
"bigdata01:9092,bigdata02:9092,bigdata03:9092",
"auto.offset.reset" -> "smallest"//必须要设置成smallest,如果
设置为largest就操作数据丢失
)
val ssc = new StreamingContext(spark.sparkContext, Seconds(batchInterval.toLong))
val messages = createMsg(ssc, kafkaParams, topic, group)
messages.foreachRDD((rdd, bTime) => {
if(!rdd.isEmpty()) {
println("-------------------------------------------")
println(s"Time: $bTime")
println("#####################rdd's count: " + rdd.count())
println("-------------------------------------------")
}
store(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, group)
})
ssc.start()
ssc.awaitTermination()
}
/**
* Manage the Kafka offsets manually with the Direct approach.
* step 1: read the corresponding offsets from ZooKeeper
*   1. if no offsets are found in ZooKeeper this is the first read, so read the topic from the
*      very beginning, exactly as in the previous examples
*   2. if offsets are found in ZooKeeper, read the data after those offsets from Kafka,
*      which requires the more elaborate constructor because the offsets must be passed in
* step 2: consume the data
* step 3: update the offsets in ZooKeeper
*/
def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topic:String, group:String) = {
/*
Step 1: read the corresponding offsets from ZooKeeper
*/
val (offsets, flag) = getFromOffsets(topic, group)
var message:InputDStream[(String, String)] = null
/**
* Step 2: create the Kafka message stream depending on whether stored offsets exist.
*/
if(flag) { // flag indicates whether offsets were read from ZooKeeper: true = offsets exist, false = none
val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
message = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
ssc,
kafkaParams,
offsets,
messageHandler
)
} else { // first read: start from the beginning of the topic
message = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, topic.split(",").toSet)
}
message
}
/**
* Step 3: after the data has been consumed, store the offsets.
* 1. check whether the znode exists
* 2. update the offset
*/
def store(offsetRanges: Array[OffsetRange], group:String): Unit = {
for (offsetRange <- offsetRanges) {
val topic = offsetRange.topic
val partition = offsetRange.partition
val offset = offsetRange.untilOffset
val path = s"${zkRootPath}/${topic}/${group}/${partition}"
checkExists(path)
// update the stored offset
client.setData().forPath(path, (offset + "").getBytes())
}
}
/**
* Path in ZooKeeper under which the Kafka offsets are stored:
* /kafka/mykafka/offsets/${topic}/${group.id}/${partition}
*/
def getFromOffsets(topic:String, group:String): (Map[TopicAndPartition, Long], Boolean) = {
/**
* Step 1: work out the znode to read from
* 1. if the path does not exist, create it
* 2. read the data under the path
* 3. wrap the result and return it
*/
val path = s"${zkRootPath}/${topic}/${group}"
checkExists(path)
// at this point the path is guaranteed to exist
/*val offsets = mutable.Map[TopicAndPartition, Long]()
for(partition <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) {
val offset = new String(client.getData.forPath(s"${path}/${partition}")).toLong
val tap = TopicAndPartition(topic, partition.toInt)
offsets.put(tap, offset)
}*/
val offsets = for(partition <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) yield {
val offset = new String(client.getData.forPath(s"${path}/${partition}")).toLong
val tap = TopicAndPartition(topic, partition.toInt)
(tap, offset)
}
if(offsets.isEmpty) { // no offsets found
(offsets.toMap, false)
} else {
(offsets.toMap, true)
}
}
private def checkExists(path: String) = {
if (client.checkExists().forPath(path) == null) {
// the offset znode does not exist yet
client.create().creatingParentsIfNeeded().forPath(path)
}
}
val zkRootPath = "/offsets"
val client = {
val client = CuratorFrameworkFactory
.builder()
.connectString("bigdata01:2181,bigdata03:2181,bigdata02:2181/kafka")
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.namespace("mykafka")
.build()
client.start()
client
}
}
Advanced programming: Java
package com.aura.bigdata.spark.streaming.p2.zk;
import com.aura.bigdata.spark.util.MailUtil;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import java.util.List;
/**
* Watch a directory (znode) in ZooKeeper.
* When a child node disappears from the watched directory we can send all kinds of alerts: e-mail, SMS, WeChat, ...
* In ZK, monitoring is done through the Watcher interface.
*/
public class CuratorWatcherTest implements Watcher{
private CuratorFramework client = null;
private List<String> orginalList;
public CuratorWatcherTest() {
client = CuratorFrameworkFactory.builder()
.connectString("bigdata01:2181,bigdata03:2181,bigdata02:2181")
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.build();
client.start();
try {
orginalList = client.getChildren().usingWatcher(this).forPath("/test");
} catch (Exception e) {
e.printStackTrace();
}
}
private void start() {
// keep the JVM alive so the watcher keeps firing (a CountDownLatch or sleep would be kinder than a busy loop)
while(true) {
}
}
/**
* Called whenever the watched directory changes.
* @param event
*/
@Override
public void process(WatchedEvent event) {
System.out.println("------>目录发生变化,process目录被检测到了,调用了process方法");
try {
List<String> currentList = client.getChildren().usingWatcher(this).forPath("/test");
if(currentList.size() < orginalList.size()) {// a child node was deleted from the watched directory
for(String orginal : orginalList) {
if(!currentList.contains(orginal)) {
// this child node was deleted ---> send an alert e-mail
MailUtil.sendMail("Alert: a server node disappeared", "A child node of the watched directory was deleted, please investigate immediately!");
}
}
} else if(currentList.size() > orginalList.size()) {// a child node was added
for(String current : currentList) {
if(!orginalList.contains(current)) {
// this child node is new
System.out.println("new child node under /test: " + current);
}
}
}
orginalList = currentList;
} catch (Exception e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
CuratorWatcherTest cwt = new CuratorWatcherTest();
cwt.start();
}
}
Advanced programming: Scala
package com.aura.bigdata.spark.streaming.p2.zk
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import scala.collection.JavaConversions
/**
* Use this example to learn the Curator wrapper around the ZooKeeper API, how to operate ZK with it,
* and, along the way, how to watch data stored in ZK.
*/
object CuratorTest {
def main(args: Array[String]): Unit = {
val client = {
val client = CuratorFrameworkFactory
.builder()
.connectString("bigdata01:2181,bigdata03:2181,bigdata02:2181")
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.namespace("test")
.build()
client.start()
client
}
// create a znode; because of the "test" namespace this actually operates under /test in the ZK root, e.g. /test/heihei/haha
// client.create().creatingParentsIfNeeded().forPath("/heihei/haha", "heiha".getBytes)
// client.delete().forPath("/heihei/haha")
val list = client.getChildren.forPath("/")
for(child <- JavaConversions.asScalaBuffer(list)) {
// println(child)
// getChildren returns bare names, so build the full path of each child node
val path = s"/$child"
val data = new String(client.getData.forPath(path))
println(s"${child}--->${data}")
client.setData().forPath(path, "1".getBytes)
val newData = new String(client.getData.forPath(path))
println(s"${child}--newData->${newData}")
}
client.close()
}
}
4. Spark Streaming + Kafka write-back: creating and saving Kafka offsets, plus the projection operation
package com.aura.bigdata.spark.streaming.p3
import com.aura.bigdata.spark.streaming.p3.kk.MyProducer
import com.aura.bigdata.spark.util.DateUtils
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.log4j.{Level, Logger}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.JavaConversions
/**
* This example reads raw data from Kafka, performs online ETL, and writes the cleaned data back to Kafka.
*
* Online ETL
* source: one Kafka topic
* processing: a Spark Streaming program
* sink: another Kafka topic
* Raw record format:
* userid xxx access_time scr_ip dest_ip src_port dest_port host url
* <<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>59948<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928065<<<!>>>
* The Spark Streaming projection keeps only the core fields:
* userid
* access_time
* src_ip
* src_port
* dest_ip
* dest_port
* url
* Output format:
* userid|access_time|src_ip:src_port|dest_ip:dest_port|url
* with dates formatted as yyyy-MM-dd HH:mm:ss
*
* Preparation:
* [bigdata@bigdata01 kafka]$ bin/kafka-topics.sh --create --topic t-1808-src --partitions 3 --replication-factor 3 --zookeeper bigdata01:2181/kafka
* Created topic "t-1808-src".
* [bigdata@bigdata01 kafka]$ bin/kafka-topics.sh --create --topic t-1808-dest --partitions 3 --replication-factor 3 --zookeeper bigdata01:2181/kafka
* Created topic "t-1808-dest".
* Known issue:
* serializing the kafka producer with Kryo fails with
* NotSerializableException: com.aura.bigdata.spark.streaming.p3.kk.MyProducer
* Kryo serialization is unreliable here, so we fall back to native Java serialization.
* Because of its constructor the producer cannot simply be copied from the driver to the executors,
* so we put it into a broadcast variable, which serializes it once and ships it to every executor.
* Writing data from Spark Streaming to Kafka is therefore done through a broadcast producer, which also improves efficiency.
*/
object _01SparkStreamingFromKafka2KafkaOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
if(args == null || args.length < 4) {
println(
"""Parameter Errors! Usage: <batchInterval> <from> <group> <to>
|batchInterval: batchInterval
|from : from topic
|group : group
|to : to topic
""".stripMargin)
System.exit(-1)
}
val Array(batchInterval, fromTopic, group, toTopic) = args
val kafkaParams = Map[String, String](
"bootstrap.servers" -> "bigdata01:9092,bigdata02:9092,bigdata03:9092",
"auto.offset.reset" -> "smallest"//必须要设置成smallest,如果设置为largest就操作数据丢失
)
val conf = new SparkConf()
.setAppName("FromKafka2Kafka")
.setMaster("local[*]")
// .set("spark.serializer", classOf[KryoSerializer].getName)
// .registerKryoClasses(Array(classOf[MyProducer[String, String]]))
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
/* Three steps:
* 1. read the offsets from ZooKeeper
* 2. consume the data (projection) ---> write it back to Kafka
* 3. update the offsets in ZooKeeper
*/
// 1. read the offsets from ZooKeeper
val messages:InputDStream[(String, String)] = createMsg(ssc, kafkaParams, fromTopic, group)
// 2. consume the data (projection) ---> write it back to Kafka
messages.foreachRDD((rdd, bTime) => {
if(!rdd.isEmpty()) {
// projection over the rdd
projection(toTopic, rdd)
// 3. update the offsets in ZooKeeper
store(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, group)
}
})
ssc.start()
ssc.awaitTermination()
}
/**
* Project the raw records onto the core fields.
* Raw record format:
* userid xxx access_time scr_ip dest_ip src_port dest_port host url
* <<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>59948<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928065<<<!>>>
* The projection keeps the core fields:
* userid
* access_time
* src_ip
* src_port
* dest_ip
* dest_port
* url
* Output format:
* userid|access_time|src_ip:src_port|dest_ip:dest_port|url
* with dates formatted as yyyy-MM-dd HH:mm:ss or as a timestamp
*
*/
def projection(toTopic:String, rdd:RDD[(String, String)]): Unit = {
val projectionRDD:RDD[String] = rdd.map { case (key, line) => {
val fields = line.split("<<<!>>>,<<<!>>>")
val userId = fields(0).substring(fields(0).lastIndexOf(">") + 1)
// convert the yyyyMMddHHmmss value into a timestamp (the DateUtils call is left commented out below)
println(fields(2))
// val accessTime = DateUtils.time2TimeStamp(fields(2), DateUtils.TIME_MINUTE_FORMAT2)
val accessTime = fields(2)
val srcIp = fields(3)
val destIp = fields(4)
val srcPort = fields(5)
val destPort = fields(6)
val url = fields(13)
val msg = s"${userId}|${accessTime}|${srcIp}:${srcPort}|${destIp}:${destPort}|${url}"
msg
}}
// at this point the program acts as a Kafka producer
projection2Kafka(toTopic, projectionRDD)
}
/**
* Write the records of the RDD back to Kafka.
* @param rdd
*/
def projection2Kafka(topic:String, rdd:RDD[String]): Unit = {
val myProducer = new MyProducer[String, String]()
val productBC:Broadcast[MyProducer[String, String]] = rdd.sparkContext.broadcast(myProducer)
rdd.foreach(msg => {
val p = productBC.value
p.send(topic, msg)
})
}
/**
* Save the Kafka offsets.
* @param offsetRanges
* @param group
*/
def store(offsetRanges: Array[OffsetRange], group:String): Unit = {
for (range <- offsetRanges) {
val topic = range.topic
val partition = range.partition
val untilOffset = range.untilOffset
val path = s"${zkRootPath}/${topic}/${group}/${partition}"
checkExists(path)
curator.setData().forPath(path, (untilOffset + "").getBytes())
}
}
/**
* Create the stream from Kafka. When managing offsets by hand we first have to check whether
* offsets can be read from ZooKeeper:
* if they can, build the stream starting from those offsets;
* if not, read from the very beginning of the topic.
* @param ssc
* @param fromTopic
* @param group
* @return
*/
def createMsg(ssc:StreamingContext, kafkaParams:Map[String, String], fromTopic:String, group:String):InputDStream[(String, String)] = {
val offsets:Map[TopicAndPartition, Long] = getFromOffsets(fromTopic, group)
var messages:InputDStream[(String, String)] = null
if(offsets.isEmpty) { // nothing in ZooKeeper: read from the beginning of the topic
messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc,
kafkaParams,
fromTopic.split(",").toSet
)
} else { // start reading from the stored offsets
val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
ssc,
kafkaParams,
offsets,
messageHandler
)
}
messages
}
/**
* Read the offsets from the corresponding ZooKeeper path:
* /kafka/mykafka/offsets/${topic}/${group}/${partition}
* ---> via Curator
* @param topic
* @param group
*/
def getFromOffsets(topic:String, group:String):Map[TopicAndPartition, Long] = {
// step 1: build the ZooKeeper path that holds the offsets
val path = s"${zkRootPath}/${topic}/${group}"
checkExists(path) // after this check the path is guaranteed to exist
val offsets = for(partition <- JavaConversions.asScalaBuffer(
curator.getChildren.forPath(path))) yield {
val partitionPath = s"${path}/${partition}"
val offset = new String(curator.getData.forPath(partitionPath)).toLong
val tap = TopicAndPartition(topic, partition.toInt)
(tap, offset)
}
offsets.toMap
}
def checkExists(path:String): Unit = {
if(curator.checkExists().forPath(path) == null) {
curator.create().creatingParentsIfNeeded().forPath(path)
}
}
val zkRootPath = "offsets"
//zk操作的客户端
val curator = {
val curator = CuratorFrameworkFactory.builder()
.connectString("bigdata01:2181,bigdata02:2181,bigdata03:2181/kafka")
.namespace("mykafka")
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.build()
curator.start()
curator
}
}
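MyProducer (com.aura.bigdata.spark.streaming.p3.kk.MyProducer), the broadcastable producer wrapper used above, is not shown in the source. A minimal sketch of what such a wrapper might look like, assuming String keys/values and the same broker list as the rest of the examples:
package com.aura.bigdata.spark.streaming.p3.kk
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
// A Serializable wrapper so the producer can travel inside a broadcast variable.
// The real KafkaProducer is created lazily on each executor after deserialization.
class MyProducer[K, V] extends Serializable {
  @transient private lazy val producer: KafkaProducer[K, V] = {
    val props = new Properties()
    props.put("bootstrap.servers", "bigdata01:9092,bigdata02:9092,bigdata03:9092") // assumed broker list
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    new KafkaProducer[K, V](props)
  }
  def send(topic: String, value: V): Unit = {
    producer.send(new ProducerRecord[K, V](topic, value))
  }
}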
5. Kafka producer test class
package com.aura.bigdata
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, Producer, ProducerRecord}
import scala.util.Random
/**
* Kafka producer test class
*/
object KafkaProducerTest {
def main(args: Array[String]): Unit = {
val properties = new Properties()
properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
/*
acks
0:   the producer moves on to the next send regardless of whether the partition leader persisted the data
1:   the producer moves on only after the partition leader has persisted the data, without waiting for the followers
all: the producer moves on only after both the leader and the followers have persisted the data
*/
properties.put("acks", "1")
properties.put("bootstrap.servers",
"bigdata01:9092,bigdata02:9092,bigdata03:9092")
properties.put("linger.ms", "0")//进行下一次发送数据的延迟时间
val producer:Producer[String, String] = new KafkaProducer[String,
String](properties)
val topic = "t-1808-1"
val array = Array(
"Stay hungry Stay foolish",
"I think i am",
"Roma was not builded in one day",
"An Apple a day, keep the doctor away"
)
val random = new Random()
for (i <- 0 until 100) {
val producerRecord = new ProducerRecord[String, String](topic, i + "", array(random.nextInt(array.length)))
producer.send(producerRecord)
}
producer.close()
}
}
6. Spark Streaming + Kafka: controlling the data read rate
package com.aura.bigdata.spark.streaming.p3
import com.aura.bigdata.spark.util.KafkaManager
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.HasOffsetRanges
/**
* Control the rate at which Kafka data is consumed. The point is the mismatch between the rate at
* which data is produced and the rate at which it is consumed: if production outpaces consumption,
* a backlog builds up ("data reservoir"), which we must avoid in practice by throttling consumption
* to what the program can actually keep up with.
* The configuration parameter spark.streaming.kafka.maxRatePerPartition controls how many records
* per second the Spark program reads from each Kafka partition.
* Suppose spark.streaming.kafka.maxRatePerPartition=10 --> at most 10 records per second per partition,
* batchInterval = 2s, and the topic has 3 partitions.
* Question: how many records can a single batch receive at most?
* 10 * 3 * 2 = 60
*/
object _02SparkStreamingCostKafkaRateOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
if(args == null || args.length < 3) {
println(
"""Parameter Errors! Usage: <batchInterval> <topic> <group>
|batchInterval: batchInterval
|topic : topic
|group : group
""".stripMargin)
System.exit(-1)
}
val Array(batchInterval, topic, group) = args
val spark = SparkSession.builder()
.appName("CostKafkaRate")
.master("local[*]")
.config("spark.streaming.kafka.maxRatePerPartition", "10")
.getOrCreate()
val kafkaParams = Map[String, String](
"bootstrap.servers" -> "bigdata01:9092,bigdata02:9092,bigdata03:9092",
"auto.offset.reset" -> "smallest" // must be smallest; with largest, historical data would be lost
)
val ssc = new StreamingContext(spark.sparkContext, Seconds(batchInterval.toLong))
val messages = KafkaManager.createMsg(ssc, kafkaParams, topic, group, curator, zkRootPath)
messages.foreachRDD((rdd, bTime) => {
if(!rdd.isEmpty()) {
println("-------------------------------------------")
println(s"Time: $bTime")
println("#####################rdd's count: " + rdd.count())
println("-------------------------------------------")
}
KafkaManager.store(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, group, curator, zkRootPath)
})
ssc.start()
ssc.awaitTermination()
}
val zkRootPath = "offsets"
//zk操作的客户端
val curator = {
val curator = CuratorFrameworkFactory.builder()
.connectString("bigdata01:2181,bigdata02:2181,bigdata03:2181/kafka")
.namespace("mykafka")
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.build()
curator.start()
curator
}
}
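KafkaManager (com.aura.bigdata.spark.util.KafkaManager) is a project utility whose source is not shown here. A hedged sketch of what its createMsg/store helpers might look like, assuming they simply mirror the inline createMsg/getFromOffsets/store/checkExists logic from the ZooKeeper example above, with the Curator client and the ZK root path passed in as parameters:
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFramework
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}
import scala.collection.JavaConversions

object KafkaManager {
  // Build the stream: use stored offsets if ZooKeeper has them, otherwise read from the beginning.
  def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topic: String,
                group: String, client: CuratorFramework, zkRootPath: String): InputDStream[(String, String)] = {
    val offsets = getFromOffsets(topic, group, client, zkRootPath)
    if (offsets.isEmpty) {
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        ssc, kafkaParams, topic.split(",").toSet)
    } else {
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
        ssc, kafkaParams, offsets, messageHandler)
    }
  }

  // Read the stored offset of every partition from <zkRootPath>/<topic>/<group>/<partition>.
  def getFromOffsets(topic: String, group: String, client: CuratorFramework,
                     zkRootPath: String): Map[TopicAndPartition, Long] = {
    val path = s"$zkRootPath/$topic/$group"
    checkExists(path, client)
    val offsets = for (partition <- JavaConversions.asScalaBuffer(client.getChildren.forPath(path))) yield {
      val offset = new String(client.getData.forPath(s"$path/$partition")).toLong
      (TopicAndPartition(topic, partition.toInt), offset)
    }
    offsets.toMap
  }

  // Write the untilOffset of every processed partition back to ZooKeeper.
  def store(offsetRanges: Array[OffsetRange], group: String,
            client: CuratorFramework, zkRootPath: String): Unit = {
    for (range <- offsetRanges) {
      val path = s"$zkRootPath/${range.topic}/$group/${range.partition}"
      checkExists(path, client)
      client.setData().forPath(path, (range.untilOffset + "").getBytes())
    }
  }

  private def checkExists(path: String, client: CuratorFramework): Unit = {
    if (client.checkExists().forPath(path) == null) {
      client.create().creatingParentsIfNeeded().forPath(path)
    }
  }
}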
7. Idempotent writes for Spark Streaming output
package com.aura.bigdata.spark.streaming.p3.exactly
import java.sql.DriverManager
import com.aura.bigdata.spark.util.KafkaManager
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.log4j.{Level, Logger}
import org.apache.spark.streaming.kafka.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, TaskContext}
/**
* To guarantee consistent consumption semantics, make the output write idempotent.
*/
object KafkaOffsetIdempotent {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.project-spark").setLevel(Level.WARN)
val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
val processingInterval = 2
val brokers = "bigdata01:9092,bigdata02:9092,bigdata03:9092"
val topic = "mytopic1"
val group = "g-1808"
// Create direct kafka stream with brokers and topics
val kafkaParams = Map[String, String](
"metadata.broker.list" -> brokers,
"auto.offset.reset" -> "smallest"
)
/*
1. create the test MySQL database
create database test;
2. create the table
create table myorders(name varchar(100), orderid varchar(100) primary key);
3. create the topic mytopic1:
kafka-topics.sh --zookeeper bigdata01:2181/kafka --create --topic mytopic1 --partitions 3 --replication-factor 1
4. send data to mytopic1 in the format "string,number", e.g. abc,3
*/
val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))
val zkTopicOffsetPath = "/offsets"
val messages = KafkaManager.createMsg(ssc, kafkaParams, topic, group, client, zkTopicOffsetPath)
val jdbcUrl = "jdbc:mysql://localhost:3306/test"
val jdbcUser = "root"
val jdbcPassword = "sorry"
messages.foreachRDD(rdd=>{
rdd.map(x=>x._2).foreachPartition(partition =>{
val dbConn = DriverManager.getConnection(jdbcUrl, jdbcUser, jdbcPassword)
// upsert update insert
partition.foreach(msg=>{
val name = msg.split(",")(0)
val orderid = msg.split(",")(1)
val sql = s"insert into myorders(name, orderid) values ('$name',
'$orderid') ON DUPLICATE KEY UPDATE name='${name}'"
val pstmt = dbConn.prepareStatement(sql)
pstmt.execute()
})
// dbConn.commit()
dbConn.close()
})
KafkaManager.store(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, group, client, zkTopicOffsetPath)
})
ssc.start()
ssc.awaitTermination()
}
val client = { // block expression ---> ZooKeeper client
val client = CuratorFrameworkFactory.builder()
.connectString("bigdata01:2181,bigdata02:2181,bigdata03:2181/kafka")
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.namespace("mykafka")
.build()
client.start()
client
}
}
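The upsert above interpolates the values straight into the SQL string. A hedged alternative sketch (hypothetical helper, same myorders table) that binds the values through a parameterized PreparedStatement instead, which avoids quoting problems while staying idempotent thanks to the primary key on orderid:
import java.sql.Connection

// Idempotent upsert with bound parameters instead of string interpolation.
def upsertOrder(conn: Connection, name: String, orderid: String): Unit = {
  val sql = "insert into myorders(name, orderid) values (?, ?) ON DUPLICATE KEY UPDATE name = ?"
  val pstmt = conn.prepareStatement(sql)
  try {
    pstmt.setString(1, name)
    pstmt.setString(2, orderid)
    pstmt.setString(3, name)
    pstmt.executeUpdate()
  } finally {
    pstmt.close()
  }
}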
8. Spark Streaming + Kafka exactly-once via a transaction
package com.aura.bigdata.spark.streaming.p3.exactly
import java.sql.ResultSet
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, TaskContext}
import scalikejdbc.{ConnectionPool, DB, SQL}
/**
* Transaction {
*   save the data
*   save the offsets
* }
*
* 1. create the test MySQL database
*    create database test;
* 2. create topic mytopic1:
*    kafka-topics.sh --zookeeper bigdata01:2181/kafka --create --topic mytopic1 --partitions 3 --replication-factor 1
* 3. create the tables
*    create table mytopic(topic varchar(200), partid int, offset bigint);
*    create table mydata(name varchar(200), id int);
*    initialise the offset table:
*    insert into mytopic(topic, partid, offset) values('mytopic1',0,0);
*    insert into mytopic(topic, partid, offset) values('mytopic1',1,0);
*    insert into mytopic(topic, partid, offset) values('mytopic1',2,0);
* 4. send data to mytopic1 in the format "string,number", e.g. abc,3
* 5. add the dependency to the pom file:
*    <dependency>
*      <groupId>org.scalikejdbc</groupId>
*      <artifactId>scalikejdbc_2.10</artifactId>
*      <version>2.2.1</version>
*    </dependency>
*/
object KafkaOffsetTransanction {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
.setAppName("test")
.setMaster("local[2]")
.set("spark.streaming.kafka.maxRatePerPartition", "10")
val processingInterval = 2
val brokers = "bigdata01:9092,bigdata02:9092,bigdata03:9092"
val topic = "mytopic2"
// Create direct kafka stream with brokers and topics
val topicsSet = topic.split(",").toSet
val kafkaParams = Map[String, String](
"metadata.broker.list" -> brokers,
"auto.offset.reset" -> "smallest"
)
val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))
val group = "g-1807"
val driver = "com.mysql.jdbc.Driver"
val jdbcUrl = "jdbc:mysql://localhost:3306/test"
val jdbcUser = "root"
val jdbcPassword = "sorry"
// load the JDBC driver
Class.forName(driver)
// set up the connection pool
ConnectionPool.singleton(jdbcUrl, jdbcUser, jdbcPassword)
val fromOffsets:Map[TopicAndPartition, Long] = DB.readOnly {
implicit session => SQL("select topic, partid, offset from mytopic").
map {case r =>
TopicAndPartition(r.string(1), r.int(2)) -> r.long(3)
}.list.apply().toMap
}
val messageHandler = (mmd : MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
ssc,
kafkaParams,
fromOffsets,
messageHandler
)
messages.foreachRDD(rdd=> {
if(!rdd.isEmpty()) {
rdd.foreachPartition(partiton=>{
if(!partiton.isEmpty) {
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
val pOffsetRange = offsetRanges(TaskContext.get.partitionId)
// localTx: the data and the offset update are committed in a single transaction
DB.localTx { implicit session =>
// save the data
partiton.foreach(msg=>{
// alternatively, use scalikejdbc's batch insert (see the sketch after this listing)
val name = msg._2.split(",")(0)
val id = msg._2.split(",")(1)
val dataResult = SQL(s"""insert into mydata(name,id) values ('${name}',${id})""").execute().apply()
})
val i = 1 / 0 // deliberate failure: the exception makes localTx roll back both the data and the offset update
// save the offsets
val offsetResult = SQL(s"""update mytopic set offset = ${pOffsetRange.untilOffset} where topic = '${pOffsetRange.topic}' and partid = ${pOffsetRange.partition}""").update.apply()
}
}
})
}
})
ssc.start()
ssc.awaitTermination()
}
}
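The comment in the listing mentions scalikejdbc's batch insert as an alternative to inserting row by row. A hedged sketch of that variant (hypothetical helper, same mydata table), meant to be called inside the same DB.localTx block in place of the per-record inserts:
import scalikejdbc.{DBSession, SQL}

// Batch-insert all parsed (name, id) pairs of the current partition in one statement,
// inside the surrounding transaction (the implicit DBSession comes from DB.localTx).
// The id values are assumed to have been parsed with toInt beforehand.
def saveBatch(records: Seq[(String, Int)])(implicit session: DBSession): Unit = {
  val batchParams: Seq[Seq[Any]] = records.map { case (name, id) => Seq(name, id) }
  SQL("insert into mydata(name, id) values (?, ?)").batch(batchParams: _*).apply()
}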