I. Real-Time Registration Statistics
1. Requirements
Users register on the website or in the app; the backend collects these events in real time and pushes them to Kafka. Spark Streaming consumes the stream and computes the number of registrations in real time.
Requirement 1: Spark Streaming must not lose data, must sustain a processing rate of 1,000 records per second, and must maintain Kafka offsets manually (see the note after this list on the rate cap).
Requirement 2: Count registrations in real time with 3-second batches, using the updateStateByKey operator to combine the historical total with the current batch. Only this requirement uses updateStateByKey; later requirements do not.
Requirement 3: Every 6 seconds, compute the registration count over the last 1 minute, without historical state. Hint: the reduceByKeyAndWindow operator.
Requirement 4: Observe the incoming data and try to tune the job.
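(Two notes before the code. First, spark.streaming.kafka.maxRatePerPartition is a per-partition, per-second cap, so the value 100 used below only adds up to the required 1,000 records per second if register_topic has 10 partitions (100 × 10 = 1,000); adjust the setting to your actual partition count. Second, the post never spells out the record schema; judging from the parsing code below (split("\t"), exactly 3 fields, field 2 as the platform code), a registration event presumably looks like userId \t platform \t timestamp, where platform 1 = PC, 2 = APP, and anything else = Other. The meaning of the fields other than the platform code is an assumption.)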
2. Code Implementation
import java.lang
import java.sql.ResultSet
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object RegisterStreaming {
private val groupid = "register_group"
def main(args: Array[String]): Unit = {
System.setProperty("HADOOP_USER_NAME", "aaa")
val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
.set("spark.streaming.kafka.maxRatePerPartition", "100")
// .set("spark.streaming.backpressure.enabled", "true")
.set("spark.streaming.stopGracefullyOnShutdown", "true")
.setMaster("local[*]")
val ssc = new StreamingContext(conf, Seconds(3))
val sparkContext: SparkContext = ssc.sparkContext
val topics = Array("register_topic")
val kafkaMap: Map[String, Object] = Map[String, Object](
"bootstrap.servers" -> "hadoop102:6667,hadoop103:6667,hadoop104:6667",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupid,
"auto.offset.reset" -> "earliest", //sparkstreaming第一次启动,不丢数
//如果是true,则这个消费者的偏移量会在后台自动提交,但是kafka宕机容易丢失数据
//如果是false,则需要手动维护kafka偏移量
"enable.auto.commit" -> (false: lang.Boolean)
)
sparkContext.hadoopConfiguration.set("fs.defaultFS", "hdfs://nameservice1")
sparkContext.hadoopConfiguration.set("dfs.nameservices", "nameservice1")
//stateful operations (updateStateByKey) require a checkpoint directory where Spark Streaming persists the state
ssc.checkpoint("/user/aa/sparkstreaming/checkpoint")
//check whether MySQL already holds offsets for this consumer group
val sqlProxy = new SqlProxy()
val offsetMap = new mutable.HashMap[TopicPartition, Long]()
val client = DataSourceUtil.getConnection
try {
sqlProxy.executeQuery(client, "select * from `offset_manager` where groupid=?", Array(groupid), new QueryCallback {
override def process(rs: ResultSet): Unit = {
while (rs.next()) {
val model = new TopicPartition(rs.getString(2), rs.getInt(3))
val offset = rs.getLong(4)
offsetMap.put(model, offset)
}
rs.close() //close the cursor
}
})
} catch {
case e: Exception => e.printStackTrace()
} finally {
sqlProxy.shutdown(client)
}
//create the Kafka direct stream: if MySQL holds offsets, resume from them; otherwise start consuming from scratch
val stream: InputDStream[ConsumerRecord[String, String]] = if (offsetMap.isEmpty) {
KafkaUtils.createDirectStream(
ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaMap))
} else {
KafkaUtils.createDirectStream(
ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaMap, offsetMap))
}
//the raw stream of ConsumerRecords is not serializable and cannot be used or printed directly, hence the map below
val resultDStream = stream.filter(item => item.value().split("\t").length == 3).
mapPartitions(partitions => {
partitions.map(item => {
val line = item.value()
val arr = line.split("\t")
val app_name = arr(1) match {
case "1" => "PC"
case "2" => "APP"
case _ => "Other"
}
(app_name, 1)
})
})
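//cache the parsed stream: it feeds two outputs below (the windowed count and the running total), so caching avoids recomputing it from Kafka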
resultDStream.cache()
//(PC,1),(PC,1),(APP,1),(Other,1),(APP,1),(Other,1),(PC,1),(APP,1)
//"=================每6s间隔1分钟内的注册数据================="
resultDStream.reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(60), Seconds(6)).print()
//"========================================================="
//"+++++++++++++++++++++++实时注册人数+++++++++++++++++++++++"//状态计算
val updateFunc = (values: Seq[Int], state: Option[Int]) => {
val currentCount = values.sum //sum for the current batch
val previousCount = state.getOrElse(0) //historical total
Some(currentCount + previousCount)
}
resultDStream.updateStateByKey(updateFunc).print()
//"++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
// val dsStream = stream.filter(item => item.value().split("\t").length == 3)
// .mapPartitions(partitions =>
// partitions.map(item => {
// val rand = new Random()
// val line = item.value()
// val arr = line.split("\t")
// val app_id = arr(1)
// (rand.nextInt(3) + "_" + app_id, 1)
// }))
// val result = dsStream.reduceByKey(_ + _)
// result.map(item => {
// val appid = item._1.split("_")(1)
// (appid, item._2)
// }).reduceByKey(_ + _).print()
//after the business logic, manually commit the offsets to MySQL
stream.foreachRDD(rdd => {
val sqlProxy = new SqlProxy()
val client = DataSourceUtil.getConnection
try {
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
for (or <- offsetRanges) {
sqlProxy.executeUpdate(client, "replace into `offset_manager` (groupid,topic,`partition`,untilOffset) values(?,?,?,?)",
Array(groupid, or.topic, or.partition.toString, or.untilOffset))
}
} catch {
case e: Exception => e.printStackTrace()
} finally {
sqlProxy.shutdown(client)
}
})
ssc.start()
ssc.awaitTermination()
}
}
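Both listings depend on three helpers the post never shows: SqlProxy, QueryCallback, and DataSourceUtil. Below is a minimal sketch of what they might look like, assuming plain JDBC; the driver class, JDBC URL, and credentials are placeholders, not the author's code, and a real project would use a pooled DataSource (e.g. Druid or HikariCP). Also note that replace into `offset_manager` only behaves as an upsert if the table has a primary or unique key on (groupid, topic, partition).
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
//callback used by SqlProxy.executeQuery to hand the ResultSet back to the caller
trait QueryCallback {
  def process(rs: ResultSet): Unit
}
class SqlProxy {
  private var ps: PreparedStatement = _
  //run a parameterized SELECT and pass the ResultSet to the callback
  def executeQuery(conn: Connection, sql: String, params: Array[Any], callback: QueryCallback): Unit = {
    ps = conn.prepareStatement(sql)
    if (params != null) {
      for (i <- params.indices) ps.setObject(i + 1, params(i).asInstanceOf[AnyRef])
    }
    callback.process(ps.executeQuery())
  }
  //run a parameterized INSERT / UPDATE / REPLACE
  def executeUpdate(conn: Connection, sql: String, params: Array[Any]): Int = {
    ps = conn.prepareStatement(sql)
    if (params != null) {
      for (i <- params.indices) ps.setObject(i + 1, params(i).asInstanceOf[AnyRef])
    }
    ps.executeUpdate()
  }
  //close the statement and release the connection
  def shutdown(conn: Connection): Unit = {
    if (ps != null) ps.close()
    if (conn != null) conn.close()
  }
}
object DataSourceUtil {
  Class.forName("com.mysql.jdbc.Driver") //placeholder driver; use the one matching your MySQL version
  def getConnection: Connection =
    DriverManager.getConnection("jdbc:mysql://localhost:3306/streaming?useSSL=false", "user", "password") //placeholder URL and credentials
}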
II. Real-Time Quiz Accuracy and Knowledge-Point Mastery
1. Requirements
Users answer quiz questions on the website or in the app and click Submit when done; the submission records are sent to Kafka. Downstream, Spark Streaming consumes from Kafka, computes the accuracy and the knowledge-point mastery in real time, and stores both in MySQL, so that users who refresh the page right after submitting immediately see the details of their attempt.
Requirement 1: Spark Streaming must not lose data, must sustain a processing rate of 1,000 records per second, and must maintain Kafka offsets manually.
Requirement 2: Questions answered by the same user under the same course and the same knowledge point must be deduplicated, recording both the deduplicated question ids and their count.
Requirement 3: Compute the per-knowledge-point accuracy. Formula: total correct answers / total answers, rounded to two decimal places.
Requirement 4: Compute the knowledge-point mastery: deduplicated question count / total questions under the knowledge point (known to be 30) × the knowledge point's accuracy. A small worked sketch follows this list.
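To make requirements 3 and 4 concrete (the helper names here are mine, not from the post): a user with 8 correct answers out of 10 has an accuracy of 8 / 10 = 0.80; if 9 distinct questions were answered, the mastery is 9 / 30 × 0.80 = 0.24.
//illustrative helpers for requirements 3 and 4; names and rounding style are assumptions
def accuracy(correctCount: Int, totalCount: Int): Double =
  if (totalCount == 0) 0.0
  else (correctCount.toDouble / totalCount * 100).round / 100.0 //keep two decimals
def mastery(dedupCount: Int, rate: Double, pointTotal: Int = 30): Double =
  (dedupCount.toDouble / pointTotal * rate * 100).round / 100.0
//accuracy(8, 10) == 0.8
//mastery(9, 0.8) == 0.24, i.e. 9/30 * 0.8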
2. Code Implementation
import java.lang
import java.sql.{Connection, ResultSet}
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable
/**
 * Real-time statistics of knowledge-point mastery
 */
object QzPointStreaming {
private val groupid = "qz_point_group"
val map = new mutable.HashMap[String, LearnModel]() //LearnModel is defined elsewhere in the project (not shown here)
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
.set("spark.streaming.kafka.maxRatePerPartition", "100")
.set("spark.streaming.backpressure.enabled", "true")
// .set("spark.streaming.stopGracefullyOnShutdown", "true")
.setMaster("local[*]")
val ssc = new StreamingContext(conf, Seconds(3))
val topics = Array("qz_log")
val kafkaMap: Map[String, Object] = Map[String, Object](
"bootstrap.servers" -> "hadoop102:6667,hadoop103:6667,hadoop104:6667",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupid,
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: lang.Boolean)
)
//check whether MySQL already holds offsets for this consumer group
val sqlProxy = new SqlProxy()
val offsetMap = new mutable.HashMap[TopicPartition, Long]()
val client = DataSourceUtil.getConnection
try {
sqlProxy.executeQuery(client, "select * from `offset_manager` where groupid=?", Array(groupid), new QueryCallback {
override def process(rs: ResultSet): Unit = {
while (rs.next()) {
val model = new TopicPartition(rs.getString(2), rs.getInt(3))
val offset = rs.getLong(4)
offsetMap.put(model, offset)
}
rs.close() //close the cursor
}
})
} catch {
case e: Exception => e.printStackTrace()
} finally {
sqlProxy.shutdown(client)
}
//create the Kafka direct stream: if MySQL holds offsets, resume from them; otherwise start consuming from scratch
val stream: InputDStream[ConsumerRecord[String, String]] = if (offsetMap.isEmpty) {
KafkaUtils.createDirectStream(
ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaMap))
} else {
KafkaUtils.createDirectStream(
ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaMap, offsetMap))
}