Spark Real-Time ETL: Transaction Metrics for China Mobile Recharge Campaigns
1. Recharge Campaign Transaction Metrics
1.1 ODS Real-Time Stream Splitting
1.1.1 Requirements
- Use Canal to capture the MySQL binlog and split it in real time with Spark Streaming at the ODS layer, so that each MySQL business table maps to its own Kafka topic.
1.1.2 Canal Configuration
# MySQL my.cnf — enable row-format binlog for the business database
server_id=1
log-bin=mysql-bin
binlog_format=row
binlog-do-db=gmall-2020-07

-- MySQL — create the replication user for Canal
CREATE USER canal IDENTIFIED BY 'canal';
GRANT SELECT, SHOW VIEW, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'canal'@'%';

# canal.properties — run Canal in Kafka server mode
canal.port = 11111
canal.zkServers =
canal.serverMode = kafka
canal.mq.servers = hodoop1:9092

# instance.properties — source MySQL address and target Kafka topic
canal.instance.master.address=hodoop1:3306
canal.mq.topic=ODS_DB_GMALL1122_C
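With canal.serverMode set to kafka, each binlog event arrives in the topic as a flat JSON message; the splitting job in 1.1.3 only relies on its "table" field and "data" array. A minimal sketch of parsing such a message with fastjson (the sample payload, table name and columns are illustrative, not taken from the real database):

import com.alibaba.fastjson.{JSON, JSONObject}

object CanalMessageDemo {
  def main(args: Array[String]): Unit = {
    // Illustrative Canal flat message: a single INSERT on a hypothetical order_info table.
    val sample =
      """{"database":"gmall-2020-07","table":"order_info","type":"INSERT",
        |"data":[{"id":"1001","user_id":"88","final_total_amount":"20.0"}]}""".stripMargin
    val msg: JSONObject = JSON.parseObject(sample)
    // The splitting job routes every row in "data" to "ODS_T_" + table name (upper-cased).
    val topic = "ODS_T_" + msg.getString("table").toUpperCase
    val firstRow = msg.getJSONArray("data").getJSONObject(0)
    println(topic + " -> " + firstRow.toJSONString)
  }
}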
1.1.3 Real-Time Stream Splitting with Spark Streaming
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object BaseDBCanalApp {

  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("base_db_canal_app").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(3))

    val topic = "ODS_DB_GMALL1122_C"
    val groupId = "base_db_canal_group"

    // Resume from the offsets saved after the previous batch, if any.
    val offset: Map[TopicPartition, Long] = OffsetManager.getOffset(groupId, topic)
    var inputDstream: InputDStream[ConsumerRecord[String, String]] = null
    if (offset != null && offset.nonEmpty) {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, offset, groupId)
    } else {
      inputDstream = MyKafkaUtil.getKafkaStream(topic, ssc, groupId)
    }

    // Capture each batch's offset ranges on the driver before any transformation.
    var offsetRanges: Array[OffsetRange] = null
    val inputGetOffsetDstream: DStream[ConsumerRecord[String, String]] = inputDstream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }

    // Each Kafka record is one Canal flat message in JSON.
    val dbJsonObjDstream: DStream[JSONObject] = inputGetOffsetDstream.map { record =>
      val jsonString: String = record.value()
      JSON.parseObject(jsonString)
    }

    dbJsonObjDstream.foreachRDD { rdd =>
      rdd.foreachPartition { jsonObjItr =>
        for (jsonObj <- jsonObjItr) {
          // One message may carry several rows; route each row to "ODS_T_" + table name.
          val dataArr: JSONArray = jsonObj.getJSONArray("data")
          for (i <- 0 until dataArr.size()) {
            val dataJsonObj: JSONObject = dataArr.getJSONObject(i)
            val topic = "ODS_T_" + jsonObj.getString("table").toUpperCase
            val id: String = dataJsonObj.getString("id")
            MyKafkaSink.send(topic, id, dataJsonObj.toJSONString)
          }
        }
      }
      // Commit offsets on the driver only after the whole batch has been forwarded.
      OffsetManager.saveOffset(groupId, topic, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
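MyKafkaUtil, OffsetManager and MyKafkaSink are project helper classes that are not shown above. As one possible illustration, a Redis-backed OffsetManager exposing the two methods this job calls could look roughly like the sketch below (the Redis host, port and key layout are assumptions, not the project's actual implementation):

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

object OffsetManager {

  // Offsets live in a Redis hash: key = "offset:<topic>:<groupId>", field = partition, value = offset.
  def getOffset(groupId: String, topic: String): Map[TopicPartition, Long] = {
    val jedis = new Jedis("localhost", 6379)
    val key = s"offset:$topic:$groupId"
    val stored = jedis.hgetAll(key).asScala
    jedis.close()
    stored.map { case (partition, offset) =>
      new TopicPartition(topic, partition.toInt) -> offset.toLong
    }.toMap
  }

  // Called on the driver after each batch with the ranges captured in transform().
  def saveOffset(groupId: String, topic: String, offsetRanges: Array[OffsetRange]): Unit = {
    if (offsetRanges != null && offsetRanges.nonEmpty) {
      val jedis = new Jedis("localhost", 6379)
      val key = s"offset:$topic:$groupId"
      val fields = offsetRanges.map(r => r.partition.toString -> r.untilOffset.toString).toMap
      jedis.hmset(key, fields.asJava)
      jedis.close()
    }
  }
}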
1.2 Stream-Static Join
1.2.1 Requirements
- Consume the Kafka data with Spark Streaming, join it with the campaign dimension table in MySQL to build a marketing-topic wide table, and write the result back to MySQL for the SMS marketing service.
1.2.2 Stream Topic
Field | Description
--- | ---
user_id | User ID
user_level | User level
tag_phone | Marketing target phone number
submit_time | Recharge time
money | Recharge amount
1.2.3 Static Dimension Table
Field | Description
--- | ---
id | Auto-increment ID
active_id | Campaign ID
active_name | Campaign name
user_level | Target user level
create_time | Campaign start time
deadline_time | Campaign end time
active_describe | Campaign description
1.2.4 Stream-Dimension Join with Spark Streaming
package com.bigData.spark

import com.alibaba.fastjson.{JSON, JSONException, JSONObject}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Durations, StreamingContext}

// One row of the campaign dimension table (see 1.2.3).
case class ActiveBean(active_id: String, active_name: String, user_level: String,
                      create_time: String, deadline_time: String, active_describe: String)

object StreamStaticJoin {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (true: java.lang.Boolean),
      "group.id" -> "ODS_T_charge")

    val sparkConf = new SparkConf().setAppName(this.getClass.getSimpleName.replace("$", "")).setMaster("local[3]")
    val ssc = new StreamingContext(sparkConf, Durations.seconds(10))

    // Static side of the join: the campaign dimension table, keyed by target user level.
    val userInfo = ssc.sparkContext.parallelize(Array(
      ActiveBean("DDAEF_20200501", "劳动节充值20减5", "5", "2020/05/01 00:00:00", "2020/05/01 23:59:59", "公司为庆祝五一劳动节,开展充值20减5,大家快来啊"),
      ActiveBean("AABBC_20200721", "三周年店庆全场8.5折", "2", "2020/07/21 00:00:00", "2020/07/28 23:59:59", "公司为成立三周年,全场大酬宾全场8.5折,大家快来买啊")
    )).map(active => (active.user_level, active))

    // Stream side of the join: recharge records from Kafka, also keyed by user level.
    val kafkaDStream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set("testTopic3"), kafkaParams)
    ).map(item => parseJson(item.value()))
      .filter(_ != null) // drop records that failed to parse
      .map(item => {
        val user_level = item.getString("user_level")
        val eventTime = item.getString("submit_time")
        val money = item.getInteger("money")
        val tag_phone = item.getString("tag_phone")
        (user_level, (eventTime, money, tag_phone))
      })

    // Stream-static join: every batch is joined against the in-memory dimension RDD.
    kafkaDStream.foreachRDD(_.join(userInfo).foreach(println))

    ssc.start()
    ssc.awaitTermination()
  }

  // Parse one JSON record; log the raw line and return null on malformed input.
  def parseJson(log: String): JSONObject = {
    var ret: JSONObject = null
    try {
      ret = JSON.parseObject(log)
    } catch {
      case e: JSONException => println(log)
    }
    ret
  }
}
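The requirement in 1.2.1 reads the campaign dimension table from MySQL, whereas the example above hard-codes it with parallelize. A minimal sketch of loading the dimension rows over plain JDBC and keying them by user_level (the JDBC URL, credentials and the table name active_info are assumptions for illustration; ActiveBean is the case class defined above):

import java.sql.DriverManager
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import scala.collection.mutable.ListBuffer

object ActiveDimLoader {
  // Build the keyed dimension RDD from MySQL so it can replace the hard-coded parallelize(...) above.
  def loadActiveDim(ssc: StreamingContext): RDD[(String, ActiveBean)] = {
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/gmall", "root", "123456")
    val rows = ListBuffer[ActiveBean]()
    try {
      val rs = conn.createStatement().executeQuery(
        "SELECT active_id, active_name, user_level, create_time, deadline_time, active_describe FROM active_info")
      while (rs.next()) {
        rows += ActiveBean(rs.getString("active_id"), rs.getString("active_name"), rs.getString("user_level"),
          rs.getString("create_time"), rs.getString("deadline_time"), rs.getString("active_describe"))
      }
    } finally {
      conn.close()
    }
    ssc.sparkContext.parallelize(rows).map(active => (active.user_level, active))
  }
}

With this helper, userInfo in the join above could be obtained as ActiveDimLoader.loadActiveDim(ssc) instead of the parallelize call.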
1.3 Campaign Metrics
1.3.1 Requirements
- Tag each order with whether it is the user's first order (if_first_order) by looking up the user's consumption state in HBase through Phoenix.
1.3.2 Implementation
// Parse each ODS order record and derive the date and hour fields from create_time.
val orderInfoDstream: DStream[OrderInfo] = inputGetOffsetDstream.map { record =>
  val jsonString: String = record.value()
  val orderInfo: OrderInfo = JSON.parseObject(jsonString, classOf[OrderInfo])
  val datetimeArr: Array[String] = orderInfo.create_time.split(" ")
  orderInfo.create_date = datetimeArr(0)
  val timeArr: Array[String] = datetimeArr(1).split(":")
  orderInfo.create_hour = timeArr(0)
  orderInfo
}

// Tag each order with if_first_order by checking the user's consumption state in Phoenix.
val orderInfoWithFirstDstream: DStream[OrderInfo] = orderInfoDstream.mapPartitions { orderInfoItr =>
  val orderInfoList: List[OrderInfo] = orderInfoItr.toList
  if (orderInfoList.nonEmpty) {
    // One query per partition: fetch the state of every user appearing in this partition.
    val userIdList: List[String] = orderInfoList.map(_.user_id.toString)
    val sql = "select user_id,if_consumed from user_state1122 where user_id in ('" + userIdList.mkString("','") + "')"
    val userStateList: List[JSONObject] = PhoenixUtil.queryList(sql)
    val userStateMap: Map[String, String] = userStateList.map(userStateJsonObj =>
      (userStateJsonObj.getString("USER_ID"), userStateJsonObj.getString("IF_CONSUMED"))
    ).toMap
    for (orderInfo <- orderInfoList) {
      val userIfConsumed: String = userStateMap.getOrElse(orderInfo.user_id.toString, null)
      if (userIfConsumed != null && userIfConsumed == "1") {
        orderInfo.if_first_order = "0" // the user has consumed before
      } else {
        orderInfo.if_first_order = "1" // first order for this user
      }
    }
  }
  orderInfoList.toIterator
}
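PhoenixUtil is another project helper that is not shown. A rough JDBC-based sketch of queryList, returning each row as a JSONObject keyed by column name as the code above expects (the ZooKeeper address in the Phoenix JDBC URL is an assumption):

import java.sql.DriverManager
import com.alibaba.fastjson.JSONObject
import scala.collection.mutable.ListBuffer

object PhoenixUtil {
  // Run a SQL statement against Phoenix and return each row as a JSONObject keyed by the
  // (upper-case) column names, which is how the job above reads USER_ID / IF_CONSUMED.
  def queryList(sql: String): List[JSONObject] = {
    Class.forName("org.apache.phoenix.jdbc.PhoenixDriver")
    val conn = DriverManager.getConnection("jdbc:phoenix:localhost:2181")
    val resultList = ListBuffer[JSONObject]()
    try {
      val rs = conn.createStatement().executeQuery(sql)
      val md = rs.getMetaData
      while (rs.next()) {
        val row = new JSONObject()
        for (i <- 1 to md.getColumnCount) {
          row.put(md.getColumnName(i), rs.getObject(i))
        }
        resultList += row
      }
    } finally {
      conn.close()
    }
    resultList.toList
  }
}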
1.4 Dashboard Metrics
1.4.1 Requirements
- Use Spark Streaming to compute the recharge amount and order count per minute in real time and write the results to Redis.
1.4.2 Implementation
// Province code -> province name mapping, kept as a literal string and parsed into a Map.
val provinceBroadcast = "100=\"北京\"\n 200=\"广东\"\n 210=\"上海\"\n 220=\"天津\"\n 230=\"重庆\"\n 240=\"辽宁\"\n 250=\"江苏\"\n 270=\"湖北\"\n 280=\"四川\"\n 290=\"陕西\"\n 311=\"河北\"\n 351=\"山西\"\n 371=\"河南\"\n 431=\"吉林\"\n 451=\"黑龙江\"\n 471=\"内蒙古\"\n 531=\"山东\"\n 551=\"安徽\"\n 571=\"浙江\"\n 591=\"福建\"\n 731=\"湖南\"\n 771=\"广西\"\n 791=\"江西\"\n 851=\"贵州\"\n 871=\"云南\"\n 891=\"西藏\"\n 898=\"海南\"\n 931=\"甘肃\"\n 951=\"宁夏\"\n 971=\"青海\"\n 991=\"新疆\""
val province_Map = provinceBroadcast.split("\n ").map(s => (s.split("=")(0), s.split("=")(1))).toMap

// lines: the raw recharge JSON records (defined elsewhere).
val rowRDD = lines.map(obj => {
  val jsonObj = JSON.parseObject(obj)
  // bussinessRst == "0000" marks a successful recharge.
  val bussinessRst = jsonObj.getString("bussinessRst")
  val fee = if ("0000".equals(bussinessRst)) jsonObj.getDoubleValue("chargefee") else 0.0
  val isSucc = if ("0000".equals(bussinessRst)) 1 else 0
  // The leading characters of requestId carry the request timestamp and are used as the event time.
  val starttime = jsonObj.getString("requestId")
  val receivetime = jsonObj.getString("receiveNotifyTime")
  val pcode = jsonObj.getString("provinceCode")
  val province = province_Map.getOrElse(pcode, "未知")
  val costtime = if ("0000".equals(bussinessRst)) TimeUtils.costtime(starttime, receivetime) else 0
  (
    starttime.substring(0, 10), // coarser time prefix of the event time
    starttime.substring(0, 12), // minute-level time prefix of the event time
    List[Double](1, isSucc, fee, costtime), // order count, success flag, amount, processing cost
    province
  )
})
// Aggregate the metric lists by time key and accumulate the totals into a Redis hash.
def kpi_general(baseData: RDD[(String, String, List[Double], String)]): Unit = {
  baseData.map(tp => (tp._1, tp._3)).reduceByKey((list1, list2) => {
    // Element-wise sum of (order count, success count, amount, cost time).
    list1.zip(list2).map(tp => tp._1 + tp._2)
  })
    .foreachPartition(partition => {
      val jedis = Jpools.getJedis
      partition.foreach(tp => {
        jedis.hincrBy("A-" + tp._1, "total", tp._2(0).toLong)
        jedis.hincrBy("A-" + tp._1, "succ", tp._2(1).toLong)
        jedis.hincrByFloat("A-" + tp._1, "money", tp._2(2))
        jedis.hincrBy("A-" + tp._1, "cost", tp._2(3).toLong)
        jedis.expire("A-" + tp._1, 48 * 60 * 60) // keep each key for 48 hours
      })
      jedis.close()
    })
}
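TimeUtils.costtime and Jpools are the remaining helpers referenced above but not shown. Two rough sketches follow: a costtime that assumes both arguments start with a yyyyMMddHHmmss timestamp and returns the difference in milliseconds, and a small Jedis connection pool together with an example of how a dashboard could read back one of the hashes written by kpi_general (host, port, timestamp format and the sample key are assumptions):

import java.text.SimpleDateFormat
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}
import scala.collection.JavaConverters._

object TimeUtils {
  // Assumes both strings begin with a yyyyMMddHHmmss timestamp (requestId prefix / receiveNotifyTime).
  def costtime(starttime: String, receivetime: String): Long = {
    val fmt = new SimpleDateFormat("yyyyMMddHHmmss")
    fmt.parse(receivetime.substring(0, 14)).getTime - fmt.parse(starttime.substring(0, 14)).getTime
  }
}

object Jpools {
  private val config = new JedisPoolConfig()
  config.setMaxTotal(20) // cap the number of pooled connections
  private lazy val pool = new JedisPool(config, "localhost", 6379)

  def getJedis: Jedis = pool.getResource
}

// Example read path for a dashboard: fetch every accumulated field of one time key.
object DashboardReadDemo {
  def main(args: Array[String]): Unit = {
    val jedis = Jpools.getJedis
    val metrics = jedis.hgetAll("A-2020072112").asScala // key format: "A-" + time prefix
    metrics.foreach { case (field, value) => println(field + " = " + value) }
    jedis.close() // returns the connection to the pool
  }
}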