Kafka + Spark + MySQL: Spark Streaming writes Kafka data to MySQL (offsets stored in ZooKeeper)

/**
 * @Author: 唐
 * @Date: 2020/3/25 22:32
 */
import java.sql.{DriverManager, PreparedStatement}

import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable
import scala.util.{Success, Try}

object test01 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Chapter8_4_5")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Durations.seconds(10))

    val topics = Set("spark_streaming_test")
    val kafkaParams = mutable.Map[String, String]()
    kafkaParams.put("bootstrap.servers", "min01:9092,min02:9092,min03:9092")  // Kafka broker list
    kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")    // key deserializer (Spark Streaming reads the data out of Kafka)
    kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")  // value deserializer
    kafkaParams.put("session.timeout.ms", "30000")
    kafkaParams.put("enable.auto.commit", "false")
    kafkaParams.put("max.poll.records", "100")
    kafkaParams.put("kafka.topics", "spark_streaming_test")  // Kafka topic
    kafkaParams.put("group.id", "g_spark_test")              // consumer group this consumer belongs to

    // Parameters needed to initialize the ZooKeeper client
    val zkHost = "min01:2181,min02:2181,min03:2181"
    val sessionTimeout = 120000
    val connectionTimeout = 60000
    val zkClient = ZKUtil.initZKClient(zkHost, sessionTimeout, connectionTimeout)

    val zkTopic = "spark_streaming_test"
    // ZooKeeper keeps one znode per Kafka partition: each partition's offset is stored in its own node.
    val zkConsumerGroupId = "g_spark_test"
    // These two lines build the directory where the offset information is stored
    val zkTopicDir = new ZKGroupTopicDirs(zkConsumerGroupId, zkTopic)
    val zkTopicPath = zkTopicDir.consumerOffsetDir
    // Count how many offset nodes exist under that directory
    val childrenCount = zkClient.countChildren(zkTopicPath)

    // The Kafka data is exposed as a DStream
    var kafkaStream: InputDStream[(String, String)] = null
    // Determines where this run starts consuming from
    var fromOffsets: Map[TopicAndPartition, Long] = Map()

    kafkaStream = if (childrenCount > 0) {
      // Offsets already exist under the ZooKeeper directory: resume from them.
      // Send a TopicMetadataRequest to Kafka to learn the state (leader) of each partition of the topic.
      val req = new TopicMetadataRequest(topics.toList, 0)
      val leaderConsumer = new SimpleConsumer("min01", 9092, 10000, 10000, "StreamingOffsetObserver")
      val res = leaderConsumer.send(req)
      val topicMetaOption = res.topicsMetadata.headOption
      val partitions = topicMetaOption match {
        case Some(tm) =>
          tm.partitionsMetadata.map(pm => (pm.partitionId, pm.leader.get.host)).toMap[Int, String]
        case None =>
          Map[Int, String]()
      }

      for (partition <- 0 until childrenCount) {
        val partitionOffset = zkClient.readData[String](zkTopicPath + "/" + partition)
        val tp = TopicAndPartition(kafkaParams("kafka.topics"), partition)
        // Ask the partition leader for its earliest available offset; if the saved offset has
        // already been removed by retention, fall back to the earliest one.
        val requestMin = OffsetRequest(Map(tp -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1)))
        val consumerMin = new SimpleConsumer(partitions(partition), 9092, 10000, 10000, "getMinOffset")
        val curOffsets = consumerMin.getOffsetsBefore(requestMin).partitionErrorAndOffsets(tp).offsets
        var nextOffset = partitionOffset.toLong
        if (curOffsets.nonEmpty && nextOffset < curOffsets.head) {
          nextOffset = curOffsets.head
        }
        fromOffsets += (tp -> nextOffset)
      }

      val messageHandler = (mam: MessageAndMetadata[String, String]) => (mam.key, mam.message)
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams.toMap, fromOffsets, messageHandler)
    } else {
      // No offsets stored in ZooKeeper yet: create the direct stream from the topic set.
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams.toMap, topics)
    }

    var offsetRanges: Array[OffsetRange] = null
    val kafkaInputDStream = kafkaStream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    val kafkaValues = kafkaInputDStream.map(_._2)
    val kafkaSplits = kafkaValues.map(_.split(",")).filter(_.length == 4)
    val results = kafkaSplits.map(_.mkString(","))

    results.foreachRDD(rdd => {
      // Executed on the driver
      rdd.foreachPartition(p => {
        // Executed on the workers.
        // If the results are written to a database, instantiate the connection here.
        p.foreach(result => {
          val car = result.split(",")(0)
          val longitude = result.split(",")(1)
          val latitude = result.split(",")(2)
          val timestamp = result.split(",")(3)
          val conn = ConnectionPool.getConnection()

          // The question marks are placeholders, left unspecified when the SQL is created; values are
          // supplied later through the PreparedStatement interface (setString fills each placeholder).
          // val sql = "INSERT INTO syllabus.t_car_position (plate_num, longitude, latitude, timestamp) VALUES (?, ?, ?, ?)"
          // val sql = "INSERT INTO syllabus.people (id, name, area, sex) VALUES (?, ?, ?, ?)"
          // val sql = "INSERT INTO syllabus.keshi (time01, name01, count01, sign01) VALUES (?, ?, ?, ?)"
          // val sql = "INSERT INTO syllabus.area (areaid, name, jing, wei) VALUES (?, ?, ?, ?)"
          //
          // val statement: PreparedStatement = conn.prepareStatement(sql)  // PreparedStatement extends Statement and holds a precompiled SQL statement
          // statement.setString(1, car)
          // statement.setString(2, longitude)
          // statement.setString(3, latitude)
          // statement.setString(4, timestamp)
          //
          // statement.addBatch()      // add to the batch
          // statement.executeBatch()  // run the batch; auto-commit is usually disabled so JDBC does not
          //                           // commit on its own and the caller decides what to do when a transaction fails

          conn.commit()
          ConnectionPool.returnConnection(conn)
          println(result)
        })
      })

      // ZkUtils is not serializable, so the offsets are written back on the driver
      for (o <- offsetRanges) {
        // Persist the end of the processed range so a restart resumes after this batch
        ZkUtils.updatePersistentPath(zkClient, zkTopicDir.consumerOffsetDir + "/" + o.partition, o.untilOffset.toString)
        println("Offset state after this batch was consumed: " + o)
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
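The listing calls two helpers that the post does not include: ZKUtil.initZKClient and ConnectionPool. Below is a minimal sketch of what they might look like, assuming the I0Itec ZkClient and the MySQL JDBC driver; the object and method names are inferred from the call sites above, while the connection URL, credentials, and serializer choice are assumptions, not part of the original post.

```
import java.sql.{Connection, DriverManager}

import org.I0Itec.zkclient.ZkClient
import org.I0Itec.zkclient.serialize.ZkSerializer

// Hypothetical helpers matching the call sites in the listing above.
object ZKUtil {
  // Store znode payloads as UTF-8 strings so the saved offsets stay human readable
  // and round-trip cleanly through zkClient.readData[String].
  private val stringSerializer = new ZkSerializer {
    def serialize(data: Object): Array[Byte] = data.toString.getBytes("UTF-8")
    def deserialize(bytes: Array[Byte]): Object =
      if (bytes == null) null else new String(bytes, "UTF-8")
  }

  def initZKClient(zkHost: String, sessionTimeout: Int, connectionTimeout: Int): ZkClient =
    new ZkClient(zkHost, sessionTimeout, connectionTimeout, stringSerializer)
}

object ConnectionPool {
  // A real pool (HikariCP, DBCP, ...) would be used in practice; this version simply
  // opens and closes connections so the sketch is self-contained.
  private val url = "jdbc:mysql://localhost:3306/syllabus" // assumed URL
  Class.forName("com.mysql.jdbc.Driver")

  def getConnection(): Connection = {
    val conn = DriverManager.getConnection(url, "root", "root") // assumed credentials
    conn.setAutoCommit(false) // the caller commits explicitly, as in the listing above
    conn
  }

  def returnConnection(conn: Connection): Unit = if (conn != null) conn.close()
}
```

Disabling auto-commit in getConnection matches the commented-out batch insert in the listing: the batch is executed first and the caller decides whether to commit, so a failed transaction is not half-applied by JDBC.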

Note: the required pom dependencies are sketched below.
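The original pom is not reproduced in the post. Judging from the imports (the Kafka 0.8 direct stream API, ZkClient, and JDBC), a plausible dependency set is the following; all versions are assumptions and should be aligned with the actual cluster.

```
<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>1.6.3</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.11</artifactId>
    <version>1.6.3</version>
  </dependency>
  <dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.11</artifactId>
    <version>0.8.2.2</version>
  </dependency>
  <dependency>
    <groupId>com.101tec</groupId>
    <artifactId>zkclient</artifactId>
    <version>0.3</version>
  </dependency>
  <dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
  </dependency>
</dependencies>
```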
