ODS layer - ProducerClientLog
object ProducerClientLog {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a date")
System.exit(1) // terminate the program
}
}
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN) {
spark = SparkSession
.builder()
.master("local[*]")
.appName(this.getClass.getName)
.config("hive.metastore.uris",ConfigUtils.HIVE_METASTORE_URIS)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.master("yarn")
.appName(this.getClass.getName)
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS)
.enableHiveSupport()
.getOrCreate()
}
spark.sparkContext.setLogLevel("WARN")
val rdd1: RDD[String] = spark.sparkContext.textFile(s"${ConfigUtils.HDFS_CLIENT_LOG_PATH}/${args(0)}/currentday_clientlog.tar.gz")
val rdd2: RDD[(String, String)] = rdd1.map(v => v.split("&")).filter(v => v.length == 6).map(v => (v(2), v(3)))
//split on "&", keep only records with exactly 6 fields, and take fields 2 and 3 as a (key, value) pair
rdd2.map(v => {
var eventName = v._1 //e.g. MINIK_CLIENT_SONG_PLAY_OPERATE_REQ
var eventJson = v._2 //e.g. {"songid": "LX_U016464", "mid": 96596, "optrate_type": 3, ...}
//if the event is MINIK_CLIENT_SONG_PLAY_OPERATE_REQ, this is the data we need; parse the JSON payload apart
if ("MINIK_CLIENT_SONG_PLAY_OPERATE_REQ".equals(v._1)) {
val jSONObject: JSONObject = JSON.parseObject(eventJson)
//extract each field from the JSON
val songid: String = jSONObject.getString("songid")
val mid: String = jSONObject.getString("mid")
val optrate_type: String = jSONObject.getString("optrate_type")
val uid: String = jSONObject.getString("uid")
val consume_type: String = jSONObject.getString("consume_type")
val dur_time: String = jSONObject.getString("dur_time")
val session_id: String = jSONObject.getString("session_id")
val songname: String = jSONObject.getString("songname")
val pkg_id: String = jSONObject.getString("pkg_id")
val order_id: String = jSONObject.getString("order_id")
//write the data to HDFS in the agreed format; "\t" is the delimiter so the data can be loaded into the warehouse --- it must match the delimiter declared in the warehouse table DDL
(eventName, songid + "\t" + mid + "\t" + optrate_type + "\t" + uid + "\t"
+ consume_type + "\t" + dur_time + "\t" + session_id + "\t" + songname + "\t" + pkg_id + "\t" + order_id)
//other events can serve other purposes; no processing is done here
} else if ("MINIK_CLIENT_ADVERTISEMENT_RECORD".equals(v._1)) {
//do something
(eventName, "ceshi")
} else {
v
}
}).saveAsHadoopFile(s"${ConfigUtils.HDFS_CLIENT_LOG_PATH}/${args(0)}/all_tables",
classOf[String], classOf[String], classOf[PairRDDMultipleTextOutputFormat])
//saveAsHadoopFile parameters: (output path, key class, value class, output format that controls file naming)
//load the generated files into the Hive table via SQL
spark.sql("use songdb")
val sql =
s"""
|load data inpath '${ConfigUtils.HDFS_CLIENT_LOG_PATH}/${args(0)}/all_tables/MINIK_CLIENT_SONG_PLAY_OPERATE_REQ'
|into table TO_CLIENT_SONG_PLAY_OPERATE_REQ_D partition (data_dt ='${args(0)}')
|""".stripMargin
spark.sql(sql)
spark.close()
}
}
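For reference, the delimiter of the table being loaded must match the "\t" used when writing the files above. A minimal sketch of what that DDL might look like, assuming STRING columns named after the JSON fields; the project's real table definition is not shown here and may differ:
spark.sql(
"""
|CREATE TABLE IF NOT EXISTS TO_CLIENT_SONG_PLAY_OPERATE_REQ_D (
|songid STRING, mid STRING, optrate_type STRING, uid STRING, consume_type STRING,
|dur_time STRING, session_id STRING, songname STRING, pkg_id STRING, order_id STRING
|)
|PARTITIONED BY (data_dt STRING)
|ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
|""".stripMargin)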
EDS layer (user) - GenerateTwCnsmBriefD
object GenerateTwCnsmBriefD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a date")
System.exit(1)
}
val inputdate = args(0)
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN){
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("local[*]")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("yarn")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}
spark.sparkContext.setLogLevel("WARN")
spark.sql("use songdb")
spark.sql(
s"""
|insert overwrite table TW_CNSM_BRIEF_D partition(data_dt = '${inputdate}')
|select
|ID ,
|TRD_ID ,
|UID ,
|MID ,
|PRDCD_TYPE ,
|PAY_TYPE ,
|ACT_TM ,
|PKG_ID ,
|AMT COIN_PRC , -- amount
|1 COIN_CNT , -- quantity
|ACT_TM UPDATE_TM , -- update time
|ORDR_ID ,
|ACTV_NM ,
|PKG_PRC ,
|PKG_DSCNT ,
|CPN_TYPE ,
|case when ORDR_TYPE =1 then 0
| when ORDR_TYPE =2 then 1
| when ORDR_TYPE =3 or ORDR_TYPE = 4 then 2 end ABN_TYP
|from TO_YCAK_CNSM_D where data_dt = '${inputdate}'
|""".stripMargin)
spark.close()
}
}
EDS layer (user) - GenerateTwUsrBaseinfoD
object GenerateTwUsrBaseinfoD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a date")
System.exit(1)
}
var inputdate = args(0)
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN){
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("local[*]")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("yarn")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}
spark.sql("use songdb")
//WeChat users
val WXUser: DataFrame = spark.sql(
"""
|select
|UID,
|REG_MID,
|"1" REG_CHNL,
|WX_ID REF_UID,
|GDR,
|BIRTHDAY,
|MSISDN,
|LOC_ID,
|LOG_MDE,
|SUBSTRING(reg_tm,0,8) REG_DT,
|SUBSTRING(reg_tm,9,6) REG_TM,
|USR_EXP,
|SCORE,
|LEVEL,
|null USR_TYPE,
|null IS_CERT,
|null IS_STDNT
|from TO_YCAK_USR_D
|""".stripMargin)
//Alipay users
val AliUser: DataFrame = spark.sql(
"""
|select
|UID,
|REG_MID,
|"2" REG_CHNL,
|ALY_ID REF_UID,
|GDR,
|BIRTHDAY,
|MSISDN,
|LOC_ID,
|LOG_MDE,
|SUBSTRING(reg_tm,0,8) REG_DT,
|SUBSTRING(reg_tm,9,6) REG_TM,
|USR_EXP,
|SCORE,
|LEVEL,
|USR_TYPE USR_TYPE,
|IS_CERT IS_CERT,
|IS_STDNT IS_STDNT
|from TO_YCAK_USR_ALI_D
|
|""".stripMargin)
//QQ users
val QQUser: DataFrame = spark.sql(
"""
|select
|UID,
|REG_MID,
|"3" REG_CHNL,
|QQID REF_UID,
|GDR,
|BIRTHDAY,
|MSISDN,
|LOC_ID,
|LOG_MDE,
|SUBSTRING(reg_tm,0,8) REG_DT,
|SUBSTRING(reg_tm,9,6) REG_TM,
|USR_EXP,
|SCORE,
|LEVEL,
|null USR_TYPE,
|null IS_CERT,
|null IS_STDNT
|from TO_YCAK_USR_QQ_D
|""".stripMargin)
//App users
val APPUser: DataFrame = spark.sql(
"""
|select
|UID,
|REG_MID,
|"4" REG_CHNL,
|APP_ID REF_UID,
|GDR,
|BIRTHDAY,
|MSISDN,
|LOC_ID,
|null LOG_MDE,
|SUBSTRING(reg_tm,0,8) REG_DT,
|SUBSTRING(reg_tm,9,6) REG_TM,
|USR_EXP,
|null SCORE,
|LEVEL,
|null USR_TYPE,
|null IS_CERT,
|null IS_STDNT
|from TO_YCAK_USR_APP_D
|""".stripMargin)
//union the WeChat, Alipay, QQ and App user tables into one view
WXUser.union(AliUser).union(QQUser).union(APPUser).createTempView("user")
spark.sql(
s"""
|insert overwrite table TW_USR_BASEINFO_D
|partition (data_dt='${inputdate}')
|select
|t1.UID,
|t2.REG_MID,
|t2.REG_CHNL,
|t2.REF_UID,
|t2.GDR,
|t2.BIRTHDAY,
|t2.MSISDN,
|t2.LOC_ID,
|t2.LOG_MDE,
|t2.REG_DT,
|t2.REG_TM,
|t2.USR_EXP,
|t2.SCORE,
|t2.LEVEL,
|t2.USR_TYPE,
|t2.IS_CERT,
|t2.IS_STDNT
|from (
|select distinct uid
|from TO_YCAK_USR_LOGIN_D
|where data_dt = '${inputdate}'
|) t1 left join user t2 on t1.uid = t2.uid
|
|""".stripMargin)
//users logged in for 7 consecutive days
val pre7day = DateUtils.getCurrentDatePreDate(inputdate,7)
val df: DataFrame = spark.sql(
s"""
|select
|t1.uid,
|case when t2.REG_CHNL = 1 then '微信'
| when t2.REG_CHNL = 2 then '支付宝'
| when t2.REG_CHNL = 3 then 'QQ'
| when t2.REG_CHNL = 4 then 'APP'
| else '未知' end REG_CHNL,
|t2.REF_UID,
|case when t2.GDR = 1 then '男'
| when t2.GDR = 2 then '女'
| else '不明' end GDR,
|t2.BIRTHDAY,
|t2.MSISDN,
|t2.REG_DT,
|t2.LEVEL
|from (
|select uid ,count(*) c1
|from TW_USR_BASEINFO_D
|where data_dt BETWEEN '${pre7day}' and '${inputdate}'
|group by uid
|having c1 = 1 -- the goal is users who logged in on each of the past 7 days; the table currently holds only one day of data, hence c1 = 1. With 7 days of data this should be c1 = 7
|) t1 left join TW_USR_BASEINFO_D t2 on t1.uid = t2.uid
|""".stripMargin)
var pro = new Properties()
pro.put("user",ConfigUtils.MYSQL_USER)
pro.put("password",ConfigUtils.MYSQL_PASSWORD)
df.write.mode(SaveMode.Overwrite).jdbc(ConfigUtils.MYSQL_URL,"user_7days_active",pro)
spark.close()
}
}
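One caveat about the JDBC write above: with SaveMode.Overwrite, Spark drops and recreates the MySQL table on every run. A minimal sketch of the truncate variant, which keeps the existing table definition instead (same table name and properties as above):
df.write
.mode(SaveMode.Overwrite)
.option("truncate", "true") //truncate the table rather than dropping and recreating it
.jdbc(ConfigUtils.MYSQL_URL, "user_7days_active", pro)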
EDS layer (machine) - GenerateTwMacBaseinfoD
object GenerateTwMacBaseinfoD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a date")
System.exit(1)
}
var inputdate = args(0)
var spark: SparkSession = null
if (ConfigUtils.LOCAL_RUN) {
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("local[*]")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
} else {
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("yarn")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}
spark.udf.register("getProvince", (addr: String) => {
var pro = ""
try {
if (addr.contains("内蒙古自治区")) {
pro = "内蒙古"
} else if (addr.contains("宁夏回族自治区")) {
pro = "宁夏"
} else if (addr.contains("西藏自治区")) {
pro = "西藏"
} else if (addr.contains("广西壮族自治区")) {
pro = "广西"
} else if (addr.contains("新疆维吾尔自治区")) {
pro = "新疆"
} else if (addr.contains("北京市")) {
pro = "北京"
} else if (addr.contains("上海市")) {
pro = "上海"
} else if (addr.contains("重庆市")) {
pro = "重庆"
} else if (addr.contains("天津市")) {
pro = "天津"
} else if (addr.contains("省")) {
pro = addr.substring(0, addr.indexOf("省"))
} else if (addr.isEmpty || addr.equals("")) {
pro = addr
}
} catch {
case e: Exception => {
println("Provice :" + addr + "\n" + "错误原因" + e.getMessage)
}
}
pro
})
spark.udf.register("getCity", (addr: String) => {
var city = ""
try {
if (addr.contains("内蒙古自治区")) {
if (addr.contains("阿拉善盟")) {
city = "阿拉善盟"
} else {
city = addr.substring(addr.indexOf("区") + 1, addr.indexOf("市"))
}
} else if (addr.contains("宁夏回族自治区")) {
city = addr.substring(addr.indexOf("区") + 1, addr.indexOf("市"))
} else if (addr.contains("西藏自治区")) {
city = addr.substring(addr.indexOf("区") + 1, addr.indexOf("市"))
} else if (addr.contains("广西壮族自治区")) {
city = addr.substring(addr.indexOf("区") + 1, addr.indexOf("市"))
} else if (addr.contains("新疆维吾尔自治区")) {
if (addr.contains("市")) {
city = addr.substring(addr.indexOf("区") + 1, addr.indexOf("市"))
} else if (addr.contains("州")) {
city = addr.substring(addr.indexOf("区") + 1, addr.indexOf("州"))
} else {
val index = addr.indexOf("区")
city = addr.substring(index + 1, addr.indexOf("区", index + 1))
//addr.indexOf("区", index + 1) searches starting at index + 1, i.e. it finds the index of the second "区"
}
} else if (addr.contains("西藏自治区")) {
city = addr.substring(addr.indexOf("区") + 1, addr.indexOf("市"))
} else if (addr.contains("北京市")) {
city = addr.substring(addr.indexOf("市") + 1, addr.indexOf("区"))
} else if (addr.contains("上海市")) {
city = addr.substring(addr.indexOf("市") + 1, addr.indexOf("区"))
} else if (addr.contains("天津市")) {
city = addr.substring(addr.indexOf("市") + 1, addr.indexOf("区"))
} else if (addr.contains("重庆市")) {
if (addr.contains("区")) {
city = addr.substring(addr.indexOf("市") + 1, addr.indexOf("区"))
} else {
city = addr.substring(addr.indexOf("市") + 1, addr.indexOf("县"))
}
} else if (addr.contains("省")) {
if (addr.contains("市")) {
city = addr.substring(addr.indexOf("省") + 1, addr.indexOf("市"))
} else if (addr.contains("州")) {
city = addr.substring(addr.indexOf("省") + 1, addr.indexOf("州"))
}
} else if (addr.isEmpty || addr.equals("")) {
city = addr
}
} catch {
case e: Exception => {
println("city :" + addr + "\n" + "错误原因" + e.getMessage)
}
}
city
})
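A quick sanity check of the two UDFs above, using a made-up address; the expected outputs follow the substring rules in the code:
spark.sql("select getProvince('山东省济南市历下区') prvc, getCity('山东省济南市历下区') cty").show()
//prvc = 山东 (text before "省"), cty = 济南 (text between "省" and "市")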
//business model: self-operated vs. franchised
spark.udf.register("getBUS", (BUS: String) => {
var tp = ""
try {
if (BUS.contains("街机版") || BUS.contains("加强版")) {
tp = "自营"
} else if (BUS.contains("联营版") || BUS.contains("咪哒KSHOW(风机版)")) {
tp = "加盟"
} else {
tp = "其它"
}
} catch {
case e: Exception => {
println("type:" + tp + "\n" + e.getMessage)
}
}
tp
})
spark.sql("use songdb")
spark.sql(
s"""
|insert overwrite table TW_MAC_BASEINFO_D
|partition (data_dt='${inputdate}')
|select
|t1.MID MID ,
|t3.MAC_NM MAC_NM,
|t1.UI_VER SONG_WHSE_VER,
|t1.EXEC_VER EXEC_VER,
|t1.UI_VER UI_VER,
|t1.HARD_ID HARD_ID,
|t2.SALE_TM SALE_TM,
|t2.REV_TM REV_TM,
|"无" OPER_NM,
|getProvince(t2.addr) PRVC,
|getCity(t2.addr) CTY,
|t8.AREA AREA,
|nvl(t5.ADDR,t2.addr) ADDR,
|t5.STORE_NM STORE_NM,
|t5.SUB_TAG_NM SCENCE_CATGY,
|t5.SUB_SCENE_CATGY_NM SUB_SCENCE_CATGY,
|t5.TAG_NM SCENE, -- main scene name
|t5.SUB_SCENE_NM SUB_SCENE,
|t5.BRND_NM BRND,
|t5.SUB_BRND_NM SUB_BRND,
|t3.PKG_NM PRDCT_NM,
|t4.PRDCT_TYPE PRDCT_TYP,
|getBUS(t3.PKG_NM) BUS_MODE , -- business model
|t3.INV_RATE INV_RATE,
|t3.AGE_RATE AGE_RATE,
|t3.COM_RATE COM_RATE,
|t3.PAR_RATE PAR_RATE,
|t3.IS_ACTV IS_ACTV,
|t3.ACTV_TM ACTV_TM,
|t3.HAD_MPAY_FUNC PAY_SW,
|t4.ADMINID PRTN_NM,
|t1.CUR_LOGIN_TM CUR_LOGIN_TM
|
|from TO_YCAK_MAC_D t1
|left join TO_YCAK_MAC_LOC_D t2 on t1.mid = t2.mid
|left join TO_YCBK_MAC_ADMIN_MAP_D t3 on t1.mid = t3.mid
|left join TO_YCBK_MAC_STORE_MAP_D t4 on t3.mid = t4.mid
|left join TO_YCBK_STORE_D t5 on t4.store_id = t5.id
|left join TO_YCBK_CITY_D t6 on t5.prvc_id = t6.prvc_id
|left join TO_YCBK_CITY_D t7 on t5.cty_id = t7.cty_id
|left join TO_YCBK_AREA_D t8 on t5.area_id = t8.area_id
|
|""".stripMargin)
spark.close()
}
}
EDS layer (machine) - GenerateTwMacLocD
object GenerateTwMacLocD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a date")
System.exit(1)
}
val inputdate = args(0)
val yesterday = DateUtils.getCurrentDatePreDate(inputdate,1)
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN){
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("local[*]")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("yarn")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}
spark.sparkContext.setLogLevel("WARN")
spark.sql("use songdb")
//a machine may have many reported coordinates, and the reports may disagree
//group by mid and coordinates and take the most frequently reported pair as the machine's location
val df: DataFrame = spark.sql(
s"""
|select mid,x,y,cnt
|from(select mid,x,y,cnt,
|ROW_NUMBER() over(partition by mid order by cnt desc) rn
|from(
|select mid,lat x,lng y,COUNT(DISTINCT uid) cnt -- count distinct uids, since one user may locate the same machine several times
|from to_ycak_usr_loc_d
|where data_dt = '${inputdate}' and lat !='' and lng !=''
|group by mid,lat,lng
|) t1
|) t2 where rn =1
|""".stripMargin)
val rdd: RDD[Row] = df.rdd.mapPartitions(iter => { //batch requests to limit AMap API calls; one request can reverse-geocode at most 20 coordinates
val list3 = new ListBuffer[Row]
//materialize the iterator into a List (an iterator can only be traversed once)
val list: List[Row] = iter.toList
//number of requests needed
var times = 0
if (list.size % 20 == 0) {
times = list.size / 20
} else {
times = list.size / 20 + 1
}
for (i <- 0 until times) {
val list1: List[Row] = list.slice(i * 20, (i + 1) * 20) //list1 holds up to 20 rows
var yx = "" //accumulates up to 20 "lng,lat" pairs separated by "|"
for (elem <- list1) {
val x: String = elem.getAs[String]("x")
val y: String = elem.getAs[String]("y")
yx += y + "," + x + "|"
}
val str: HttpResponse[String] = Http("https://restapi.amap.com/v3/geocode/regeo")
.param("key", "8e55d37b85c21037bea5521a2da03d2e")
.param("batch", "true")
.param("location", yx.substring(0, yx.length - 1)).asString
val jSONObject: JSONObject = JSON.parseObject(str.body)
if (!StringUtils.isBlank(str.body) && "10000".equals(jSONObject.getString("infocode"))) {
val jSONArray: JSONArray = jSONObject.getJSONArray("regeocodes")
for (i <- 0 until jSONArray.size()) {
val currentJsonObject: JSONObject = jSONArray.getJSONObject(i)
val MID = list1(i).getAs[String]("mid").toInt
val X = list1(i).getAs[String]("x")
val Y = list1(i).getAs[String]("y")
val CNT = list1(i).getAs[Long]("cnt").toInt
val ADDER = currentJsonObject.getString("formatted_address")
val PRVC: String = currentJsonObject.getJSONObject("addressComponent").getString("province")
val CTY: String = currentJsonObject.getJSONObject("addressComponent").getString("city")
val CTY_CD: String = currentJsonObject.getJSONObject("addressComponent").getString("citycode")
val DISTRICT: String = currentJsonObject.getJSONObject("addressComponent").getString("district")
val AD_CD = currentJsonObject.getJSONObject("addressComponent").getString("adcode")
val TOWN_SHIP = currentJsonObject.getJSONObject("addressComponent").getString("township")
val TOWN_CD = currentJsonObject.getJSONObject("addressComponent").getString("towncode")
val NB_NM = currentJsonObject.getJSONObject("addressComponent").getJSONObject("neighborhood").getString("name")
val NB_TP = currentJsonObject.getJSONObject("addressComponent").getJSONObject("neighborhood").getString("type")
val BD_NM = currentJsonObject.getJSONObject("addressComponent").getJSONObject("building").getString("name")
val BD_TP = currentJsonObject.getJSONObject("addressComponent").getJSONObject("building").getString("type")
val STREET = currentJsonObject.getJSONObject("addressComponent").getJSONObject("streetNumber").getString("street")
val STREET_NB = currentJsonObject.getJSONObject("addressComponent").getJSONObject("streetNumber").getString("number")
val STREET_LOC = currentJsonObject.getJSONObject("addressComponent").getJSONObject("streetNumber").getString("location")
val STREET_DRCTION = currentJsonObject.getJSONObject("addressComponent").getJSONObject("streetNumber").getString("direction")
val STREET_DSTANCE = currentJsonObject.getJSONObject("addressComponent").getJSONObject("streetNumber").getString("distance")
val BUS_INFO = currentJsonObject.getJSONObject("addressComponent").getString("businessAreas")
list3.append(Row(MID, X, Y, CNT, ADDER, PRVC, CTY, CTY_CD, DISTRICT, AD_CD, TOWN_SHIP, TOWN_CD, NB_NM, NB_TP, BD_NM, BD_TP, STREET, STREET_NB, STREET_LOC, STREET_DRCTION, STREET_DSTANCE, BUS_INFO))
}
}
}
list3.iterator
})
//convert the RDD to a DataFrame; two options:
//1. rdd.toDF
//2. spark.createDataFrame(rdd, schema)
val schema: StructType = StructType(Array[StructField](
StructField("MID", IntegerType),
StructField("X", StringType),
StructField("Y", StringType),
StructField("CNT", IntegerType),
StructField("ADDER", StringType),
StructField("PRVC", StringType),
StructField("CTY", StringType),
StructField("CTY_CD", StringType),
StructField("DISTRICT", StringType),
StructField("AD_CD", StringType),
StructField("TOWN_SHIP", StringType),
StructField("TOWN_CD", StringType),
StructField("NB_NM", StringType),
StructField("NB_TP", StringType),
StructField("BD_NM", StringType),
StructField("BD_TP", StringType),
StructField("STREET", StringType),
StructField("STREET_NB", StringType),
StructField("STREET_LOC", StringType),
StructField("STREET_DRCTION", StringType),
StructField("STREET_DSTANCE", StringType),
StructField("BUS_INFO", StringType)
))
val df2: DataFrame = spark.createDataFrame(rdd, schema)
df2.createTempView("mac_location_day")
//select id from t1 except select id from t2 returns the rows unique to t1 (set difference)
//take the difference between yesterday's tw_mac_loc_d and today's reported stats (mac_location_day) to get the mids
//for those difference mids, fetch the machine details from yesterday's partition,
//then union with today's stats to obtain the location of every machine
val df3: DataFrame = spark.sql(
s"""
|select t2.*
|from(
|select mid from tw_mac_loc_d where data_dt = '${yesterday}'
|except
|select mid from mac_location_day
|)t1 left join tw_mac_loc_d t2 on t1.mid = t2.mid
|where t2.data_dt = '${yesterday}'
|
|""".stripMargin)
//df3 comes from tw_mac_loc_d, so it has one extra data_dt column compared with df2
//drop("data_dt") removes that column from the DataFrame
df3.drop("data_dt").union(df2).createTempView("result")
spark.sql(
s"""
|insert overwrite table tw_mac_loc_d partition (data_dt = '${inputdate}')
|select * from result
|
|""".stripMargin)
spark.close()
}
}
EDS layer (machine) - GenerateTwMacStatD
object GenerateTwMacStatD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a date")
System.exit(1)
}
val inputdate = args(0)
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN){
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("local[*]")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("yarn")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}
spark.sql("use songdb")
//refunds and consumption are both grouped by mid, package id and pay type
spark.sql(
s"""
|select mid,
|pkg_id,
|pay_type,
|count(distinct uid) CNSM_USR_CNT ,
|count(ORDR_ID) REV_ORDR_CNT,
|sum(coin_prc*coin_cnt) tot_rev
|from tw_cnsm_brief_d
|where data_dt = '${inputdate}' and abn_typ = 0
|group by mid,pkg_id,pay_type
|
|""".stripMargin).createTempView("cnsm_rev")
spark.sql(
s"""
|select mid,
|pkg_id,
|pay_type,
|count(distinct uid) REF_USR_CNT ,
|count(ORDR_ID) REF_ORDR_CNT,
|abs(sum(coin_prc*coin_cnt)) TOT_REF
|from tw_cnsm_brief_d
|where data_dt = '${inputdate}' and abn_typ = 2
|group by mid,pkg_id,pay_type
|""".stripMargin).createTempView("cnsm_ref")
//new user count: count the users whose registration date is today (note the reg_dt filter is left commented out below)
spark.sql(
s"""
|select reg_mid mid,count(*) NEW_USR_CNT -- new users
|from tw_usr_baseinfo_d
|where data_dt = '${inputdate}' -- and reg_dt = '${inputdate}'
|group by reg_mid
|""".stripMargin).createTempView("new_user")
spark.sql(
s"""
|insert overwrite table TW_MAC_STAT_D partition (data_dt='${inputdate}')
|select
| t1.MID,
| t1.MAC_NM,
| t1.PRDCT_TYP PRDCT_TYPE,
| t1.STORE_NM,
| t1.BUS_MODE,
| t1.PAY_SW,
| t1.SCENCE_CATGY,
| t1.SUB_SCENCE_CATGY,
| t1.SCENE,
| t1.SUB_SCENE,
| t1.BRND,
| t1.SUB_BRND,
| t2.PRVC,
| t2.CTY,
| t2.DISTRICT AREA,
| t1.PRTN_NM AGE_ID,
| t1.INV_RATE,
| t1.AGE_RATE,
| t1.COM_RATE,
| t1.PAR_RATE,
| t3.PKG_ID,
| t3.PAY_TYPE,
| nvl(t3.CNSM_USR_CNT,0),
| nvl(t4.REF_USR_CNT,0),
| nvl(t5.NEW_USR_CNT,0),
| nvl(t3.REV_ORDR_CNT,0),
| nvl(t4.REF_ORDR_CNT,0),
| nvl(t3.TOT_REV,0),
| nvl(t4.TOT_REF,0)
|from tw_mac_baseinfo_d t1
|left join tw_mac_loc_d t2 on t1.mid = t2.mid
|left join cnsm_rev t3 on t1.mid = t3.mid
|left join cnsm_ref t4 on t3.mid = t4.mid and t3.PKG_ID = t4.PKG_ID and t3.PAY_TYPE = t4.PAY_TYPE
|left join new_user t5 on t1.mid = t5.mid
|where t1.data_dt = '${inputdate}' and t2.data_dt='${inputdate}'
|
|""".stripMargin)
spark.close()
}
}
EDS layer (content) - GenerateTwSongBaseinfoD
object GenerateTwSongBaseinfoD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a valid date")
System.exit(1) //terminate the program
}
val inputdate = args(0)
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN) {
spark = SparkSession
.builder()
.master("local[*]")
.appName(this.getClass.getName)
.config("hive.metastore.uris",ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions",2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.master("yarn")
.appName(this.getClass.getName)
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions",2)
.enableHiveSupport()
.getOrCreate()
}
//UDF: extract the album name from the ALBUM field's JSON
spark.udf.register("getAlbum",(album:String)=>{
var name:String = null
try{
name= JSON.parseArray(album).getJSONObject(0).getString("name")
}catch {
case e:Exception => {
if (album != null && album.startsWith("《") && album.endsWith("》")){
name = album
}else {
name = "《暂无专辑》"
}
}
}
name
})
//UDF: extract the singer name or singer id from the SINGER_INFO field's JSON
spark.udf.register("getSingerName_ID",(singerinfo:String,singer:String,nameOrID:String)=> {
var singerNameOrID=""
try{
val singerARR: JSONArray = JSON.parseArray(singerinfo)
if("singer1".equals(singer) && "name".equals(nameOrID) && singerARR.size()>0){
singerNameOrID=singerARR.getJSONObject(0).getString("name")
}else if ("singer1".equals(singer) && "id".equals(nameOrID) && singerARR.size()>0){
singerNameOrID=singerARR.getJSONObject(0).getString("id")
}else if ("singer2".equals(singer) && "name".equals(nameOrID) && singerARR.size()>1){
singerNameOrID=singerARR.getJSONObject(1).getString("name")
}else if ("singer2".equals(singer) && "id".equals(nameOrID) && singerARR.size()>1){
singerNameOrID=singerARR.getJSONObject(1).getString("id")
}
}catch{
case e:Exception => {
println("singerinfo = " + singerinfo + ",error = " + e.getMessage)
}
}
singerNameOrID
})
//UDF: extract the name value from AUTH_CO; default to 乐心曲库 when absent
spark.udf.register("getAUTH_NAME",(AUTH_CO:String)=>{
var AUTH_NAME: String = "乐心曲库" //default value
try {
//if AUTH_CO parses as JSON, take its name field; otherwise the catch keeps AUTH_CO itself
val jSONObject: JSONObject = JSON.parseObject(AUTH_CO)
AUTH_NAME = jSONObject.getString("name")
}catch {
case e:Exception => {
AUTH_NAME=AUTH_CO
}
}
AUTH_NAME
})
//UDF: convert the PRDCT_TYPE string into an array of ints
spark.udf.register("stringTOArray",(PRDCT_TYPE:String)=>{
var list = new ListBuffer[Int]
try{
if(!StringUtils.isBlank(PRDCT_TYPE)){
val str: Array[String] = PRDCT_TYPE.stripSuffix("]").stripPrefix("[").split(",")
for(t <-str){
list.append(t.toDouble.toInt)
}
}
}
catch {
case e:Exception => {
println("stringTOArray: "+e.getMessage)
}
}
list
})
//UDF: reformat post_time (epoch milliseconds) into a date string
spark.udf.register("date",(post_time:String)=>{
var time:String = ""
try{
val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
time = sdf.format(new Date(post_time.toDouble.toLong)) //format expects a Date, not a raw number
}catch {
case e:Exception => {
time = post_time
}
}
time
})
spark.sql("use songdb")
val sql =
"""
|insert overwrite table TW_SONG_BASEINFO_D
|select nbr,
|nvl(name,other_name) name,
|source,
|getAlbum(album) album,
|prdct,lang,video_format,dur,
|getSingerName_ID(singer_info,"singer1","name") singer1,
|getSingerName_ID(singer_info,"singer2","name") singer2,
|getSingerName_ID(singer_info,"singer1","id") singer1id,
|getSingerName_ID(singer_info,"singer2","id") singer2id,
|0 mactime,
|date(post_time) post_time,
|pinyin_fst,
|pinyin,
|sing_type,
|ori_singer,
|lyricist,
|composer,
|bpm_val,
|star_level,
|video_qlty,
|video_mk,
|video_ftur,
|lyric_ftur,
|IMG_QLTY,
|subtitles_type,
|AUDIO_FMT,
|ORI_SOUND_QLTY,
|ORI_TRK,
|ORI_TRK_VOL,
|ACC_VER,
|ACC_QLTY,
|ACC_TRK_VOL,
|ACC_TRK,
|WIDTH,
|HEIGHT,
|VIDEO_RSVL,
|SONG_VER,
|getAUTH_NAME(AUTH_CO) AUTH_CO,
|STATE,
|stringTOArray(PRDCT_TYPE) PRDCT_TYPE
|from to_song_info_d
|where nbr != " "
|""".stripMargin
spark.sql(sql)
spark.close()
}
}
DM layer (content) - GenerateTmSongerRsiD
object GenerateTmSongerRsiD {
def main(args: Array[String]): Unit = {
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN) {
spark = SparkSession
.builder()
.master("local[*]")
.appName(this.getClass.getName)
.config("hive.metastore.uris",ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions",2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.master("yarn")
.appName(this.getClass.getName)
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions",2)
.enableHiveSupport()
.getOrCreate()
}
if (args.length == 0) {
println("Please provide a date")
System.exit(1)
}
val inputdate = args(0)
spark.sql("use songdb")
spark.sql(
s"""
|-- one day of data
|select singer, singerid,
|sum(sing_cnt) sing_cnt,
|sum(supp_cnt) supp_cnt,
|SUM(RCT_7_SING_CNT) RCT_7_SING_CNT,
|sum(RCT_7_SUPP_CNT) RCT_7_SUPP_CNT,
|sum(RCT_7_TOP_SING_CNT) RCT_7_TOP_SING_CNT,
|sum(RCT_7_TOP_SUPP_CNT) RCT_7_TOP_SUPP_CNT,
|SUM(RCT_30_SING_CNT) RCT_30_SING_CNT,
|sum(RCT_30_SUPP_CNT) RCT_30_SUPP_CNT,
|sum(RCT_30_TOP_SING_CNT) RCT_30_TOP_SING_CNT,
|sum(RCT_30_TOP_SUPP_CNT) RCT_30_TOP_SUPP_CNT
|from(
|select singer1 singer,singer1id singerid,sing_cnt,supp_cnt,RCT_7_SING_CNT,RCT_7_SUPP_CNT,RCT_30_SING_CNT,RCT_30_SUPP_CNT,
|RCT_7_TOP_SING_CNT,RCT_7_TOP_SUPP_CNT,RCT_30_TOP_SING_CNT,RCT_30_TOP_SUPP_CNT
|from TW_SONG_FTUR_D
|where data_dt = '${inputdate}' and singer1 is not null and singer1 != ""
|union all
|select singer2 singer,singer2id singerid,sing_cnt,supp_cnt,RCT_7_SING_CNT,RCT_7_SUPP_CNT,RCT_30_SING_CNT,RCT_30_SUPP_CNT,
|RCT_7_TOP_SING_CNT,RCT_7_TOP_SUPP_CNT,RCT_30_TOP_SING_CNT,RCT_30_TOP_SUPP_CNT
|from TW_SONG_FTUR_D
|where data_dt = '${inputdate}' and singer2 is not null and singer2 != ""
|) t1
|group by singer,singerid
|""".stripMargin).createTempView("singerHeat")
spark.sql(
s"""
|insert overwrite table TW_SINGER_RSI_D
|partition (data_dt ='${inputdate}')
|select 1 period ,t1.singer_id singer_id,t1.singer_name singer_name,t1.rsi rsi,
|rank() over (order by t1.rsi desc) rsi_rank
|from(
|select
|singerid singer_id,
|singer singer_name,
|pow((log(sing_cnt/1+1)*0.63+log(sing_cnt+1)*0.37)*0.8+(log(supp_cnt/1+1)*0.63+log(supp_cnt+1)*0.37)*0.2,2)*10 RSI
|from singerHeat
|) t1
|union all
|select 7 period ,t2.singer_id singer_id,t2.singer_name singer_name,t2.rsi rsi,
|rank() over (order by t2.rsi desc) rsi_rank
|from(
|select
|singerid singer_id,
|singer singer_name,
|pow((log(RCT_7_SING_CNT/7+1)*0.63+log(RCT_7_TOP_SING_CNT+1)*0.37)*0.8+(log(RCT_7_SUPP_CNT/7+1)*0.63+log(RCT_7_TOP_SUPP_CNT+1)*0.37)*0.2,2)*10 RSI
|from singerHeat
|) t2
|union all
|select 30 period ,t3.singer_id singer_id,t3.singer_name singer_name,t3.rsi rsi,
|rank() over (order by t3.rsi desc) rsi_rank
|from(
|select
|singerid singer_id,
|singer singer_name,
|pow((log(RCT_30_SING_CNT/30+1)*0.63+log(RCT_30_TOP_SING_CNT+1)*0.37)*0.8+(log(RCT_30_SUPP_CNT/30+1)*0.63+log(RCT_30_TOP_SUPP_CNT+1)*0.37)*0.2,2)*10 RSI
|from singerHeat
|) t3
|""".stripMargin)
val pro= new Properties()
pro.put("user",ConfigUtils.MYSQL_USER)
pro.put("password",ConfigUtils.MYSQL_PASSWORD)
val df: DataFrame = spark.sql(
s"""
|select * from TW_SINGER_RSI_D where data_dt = '${inputdate}'
|""".stripMargin)
df.write.mode(SaveMode.Append).jdbc(ConfigUtils.MYSQL_URL,"tm_singer_rsi",pro)
spark.close()
}
}
DM layer (content) - GenerateTmSongRsiD
object GenerateTmSongRsiD {
def main(args: Array[String]): Unit = {
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN) {
spark = SparkSession
.builder()
.master("local[*]")
.appName(this.getClass.getName)
.config("hive.metastore.uris",ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions",2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.master("yarn")
.appName(this.getClass.getName)
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions",2)
.enableHiveSupport()
.getOrCreate()
}
if (args.length == 0) {
println("Please provide a date")
System.exit(1)
}
val inputdate = args(0)
spark.sql("use songdb")
spark.sql(
s"""
|insert overwrite table TW_SONG_RSI_D
|partition (data_dt = '${inputdate}')
|select 1 period,t1.nbr,t1.name,t1.rsi,
|rank() over(order by t1.rsi desc) rsi_rank
|from (
|select nbr,name,
|-- pow(x, 2) returns x squared, e.g. pow(10, 2) == 100
|-- log(x) is the natural logarithm (base e)
|pow((log(sing_cnt/1+1)*0.63+log(sing_cnt+1)*0.37)*0.8+(log(supp_cnt/1+1)*0.63+log(supp_cnt+1)*0.37)*0.2,2)*10 RSI
|from TW_SONG_FTUR_D
|where data_dt = '${inputdate}'
|) t1
|union all
|select 7 period,t2.nbr,t2.name,t2.rsi,
|rank() over(order by t2.rsi desc) rsi_rank
|from (
|select nbr,name,
|pow((log(RCT_7_SING_CNT/7+1)*0.63+log(RCT_7_TOP_SING_CNT+1)*0.37)*0.8+(log(RCT_7_SUPP_CNT/7+1)*0.63+log(RCT_7_TOP_SUPP_CNT+1)*0.37)*0.2,2)*10 RSI
|from TW_SONG_FTUR_D
|where data_dt = '${inputdate}'
|) t2
|union all
|select 30 period,t3.nbr,t3.name,t3.rsi,
|rank() over(order by t3.rsi desc) rsi_rank
|from (
|select nbr,name,
|pow((log(RCT_30_SING_CNT/30+1)*0.63+log(RCT_30_TOP_SING_CNT+1)*0.37)*0.8+(log(RCT_30_SUPP_CNT/30+1)*0.63+log(RCT_30_TOP_SUPP_CNT+1)*0.37)*0.2,2)*10 RSI
|from TW_SONG_FTUR_D
|where data_dt = '${inputdate}'
|) t3
|""".stripMargin)
val pro= new Properties()
pro.put("user",ConfigUtils.MYSQL_USER)
pro.put("password",ConfigUtils.MYSQL_PASSWORD)
/*df.withColumn("rsi",
(log(col("sing_cnt")/1+1)*0.63+log(col("sing_cnt")+1)*0.37)*0.8+
(log(col("supp_cnt")/1+1)*0.63+log(col("supp_cnt")+1)*0.37)*0.2
)*/
val df: DataFrame = spark.sql(
s"""
|select * from TW_SONG_RSI_D where data_dt = '${inputdate}'
|""".stripMargin)
df.write.mode(SaveMode.Append).jdbc(ConfigUtils.MYSQL_URL,"tm_song_rsi",pro)
spark.close()
}
}
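A quick numeric check of the 1-day RSI formula used in both RSI jobs, with hypothetical counts; Spark's log is the natural logarithm, matching Scala's math.log:
val sing = 100.0
val supp = 20.0
val rsi = math.pow(
(math.log(sing / 1 + 1) * 0.63 + math.log(sing + 1) * 0.37) * 0.8 +
(math.log(supp / 1 + 1) * 0.63 + math.log(supp + 1) * 0.37) * 0.2, 2) * 10
println(rsi) //approx. 185.0: the logarithm damps raw counts, so one very hot day cannot dominate the ranking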
DM layer (machine) - GenerateTmMacRegionStatD
object GenerateTmMacRegionStatD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a valid date")
System.exit(1) //terminate the program
}
val inputdate = args(0)
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN) {
spark = SparkSession
.builder()
.master("local[*]")
.appName(this.getClass.getName)
.config("hive.metastore.uris",ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions",2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.master("yarn")
.appName(this.getClass.getName)
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS)
.config("spark.sql.shuffle.partitions",2)
.enableHiveSupport()
.getOrCreate()
}
spark.sql("use songdb")
spark.sql(
s"""
|insert overwrite table TM_MAC_REGION_STAT_D partition (data_dt = '${inputdate}')
|SELECT
|prvc,
|cty,
|count(DISTINCT mid) mac_cnt,
|sum(tot_rev) mac_rev,
|sum(tot_ref) mac_ref,
|sum(REV_ORDR_CNT) MAC_REV_ORDR_CNT,
|sum(REF_ORDR_CNT) MAC_REF_ORDR_CNT,
|sum(CNSM_USR_CNT) MAC_CNSM_USR_CNT,
|sum(REF_USR_CNT) MAC_REF_USR_CNT
|from tw_mac_stat_d
|where data_dt = '${inputdate}' and cty != '[]'
|group by prvc,cty
|
|""".stripMargin)
val pro = new Properties()
pro.setProperty("user",ConfigUtils.MYSQL_USER)
pro.setProperty("password",ConfigUtils.MYSQL_PASSWORD)
val df: DataFrame = spark.sql(s"select * from TM_MAC_REGION_STAT_D where data_dt = '${inputdate}'")
df.write.mode(SaveMode.Overwrite).jdbc(ConfigUtils.MYSQL_URL,"TM_MAC_REGION_STAT_D",pro)
spark.close()
}
}
DM layer - GenerateTmUsrMrchntStatD
object GenerateTmUsrMrchntStatD {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Please provide a date")
System.exit(1)
}
val inputdate = args(0)
var spark: SparkSession = null
if(ConfigUtils.LOCAL_RUN){
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("local[*]")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}else {
spark = SparkSession
.builder()
.appName(this.getClass.getName)
.master("yarn")
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partition", 2)
.enableHiveSupport()
.getOrCreate()
}
spark.sql("use songdb")
spark.sql(
s"""
|insert overwrite table TM_USR_MRCHNT_STAT_D partition (data_dt = '${inputdate}')
|select
|AGE_ID,
|PAY_TYPE,
|sum(REV_ORDR_CNT) REV_ORDR_CNT, -- total revenue order count
|sum(REF_ORDR_CNT) REF_ORDR_CNT, -- total refund order count
|cast(sum(TOT_REV) as double) TOT_REV, -- total revenue
|cast(sum(TOT_REF) as double) TOT_REF, -- total refunds
|cast(sum(TOT_REV * nvl(INV_RATE,0)*0.01) as decimal(10,2)) TOT_INV_REV, -- investor revenue = revenue share * total revenue
|cast(sum(TOT_REV * nvl(AGE_RATE,0)*0.01) as decimal(10,2)) TOT_AGE_REV, -- agent revenue
|cast(sum(TOT_REV * nvl(COM_RATE,0)*0.01) as decimal(10,2)) TOT_COM_REV, -- company revenue
|cast(sum(TOT_REV * nvl(PAR_RATE,0)*0.01) as decimal(10,2)) TOT_PAR_REV -- partner revenue
|from tw_mac_stat_d
|where data_dt = '${inputdate}'
|group by AGE_ID,PAY_TYPE
|
|""".stripMargin)
val prop = new Properties()
prop.setProperty("user",ConfigUtils.MYSQL_USER)
prop.setProperty("password",ConfigUtils.MYSQL_PASSWORD)
val df: DataFrame = spark.sql(s"select * from TM_USR_MRCHNT_STAT_D where data_dt = '${inputdate}'")
df.write.mode(SaveMode.Overwrite).jdbc(ConfigUtils.MYSQL_URL,"TM_USR_MRCHNT_STAT_D",prop)
spark.close()
}
}
Streaming layer - RealTimeHotSong
object RealTimeHotSong {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setMaster("yarn").setAppName(this.getClass.getName)
val ssc = new StreamingContext(sparkConf, Seconds(10))
ssc.sparkContext.setLogLevel("WARN")
val spark: SparkSession = SparkSession
.builder()
.master("yarn")
.appName(this.getClass.getName)
.config("hive.metastore.uris", ConfigUtils.HIVE_METASTORE_URIS) //metastore地址
.config("spark.sql.shuffle.partitions", 1)
.getOrCreate()
import spark.implicits._
val kafkaMap = Map(
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> ConfigUtils.KAFKA_BROKERS,
ConsumerConfig.GROUP_ID_CONFIG -> "g2"
)
val ds: InputDStream[ConsumerRecord[String,String]] = KafkaUtils.createDirectStream(ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String,String](List("mytopic2"), kafkaMap)
)
val ds2: DStream[(String, Int)] = ds.map(v => {
val songname: String = JSON.parseObject(v.value()).getString("songname")
(songname, 1)
}).reduceByKeyAndWindow((v1: Int, v2: Int) => v1 + v2, Seconds(60), Seconds(10))
val pro = new Properties()
pro.setProperty("user",ConfigUtils.MYSQL_USER)
pro.setProperty("password",ConfigUtils.MYSQL_PASSWORD)
//dStream --> rdd --> DataFrame
ds2.foreachRDD(rdd => {
rdd.toDF("songname","cnt").createOrReplaceTempView("t_song") //toDF requires the implicit conversions imported above
spark.sql(
"""
|select songname,cnt,rn
|from (
|select songname,cnt,row_number() over(order by cnt desc) rn
|from t_song) t1
|where rn<=30
|
|""".stripMargin).write.mode(SaveMode.Overwrite).jdbc(ConfigUtils.MYSQL_URL,"song_rn",pro)
})
ssc.start()
ssc.awaitTermination()
}
}
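With a window of Seconds(60) and a slide of Seconds(10), every 10 seconds the job re-counts plays over the most recent 60 seconds, so MySQL always holds a rolling one-minute top 30. For reference, a sketch of the incremental form of reduceByKeyAndWindow, which subtracts the batch sliding out of the window instead of re-counting; it requires checkpointing, and the checkpoint path here is a placeholder:
ssc.checkpoint("/tmp/hotsong-checkpoint") //required by the inverse-function variant
val ds3: DStream[(String, Int)] = ds.map(v => (JSON.parseObject(v.value()).getString("songname"), 1))
.reduceByKeyAndWindow((a: Int, b: Int) => a + b, (a: Int, b: Int) => a - b, Seconds(60), Seconds(10))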
Streaming layer - RealTimePVUV
object RealTimePVUV {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setMaster("yarn").setAppName(this.getClass.getName)
val ssc = new StreamingContext(sparkConf, Seconds(10))
ssc.sparkContext.setLogLevel("WARN")
val kafkaMap = Map(
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> ConfigUtils.KAFKA_BROKERS,
ConsumerConfig.GROUP_ID_CONFIG -> "g1"
)
val ds: InputDStream[ConsumerRecord[String,String]] = KafkaUtils.createDirectStream(ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String,String](List("mytopic"), kafkaMap)
)
ds.map(v => { //v is a ConsumerRecord; v.value() returns the message payload
val mid: String = JSON.parseObject(v.value()).getString("mid")
(mid,1)
}).reduceByKeyAndWindow((v1:Int,v2:Int)=>v1+v2,Seconds(60),Seconds(10))
.foreachRDD(rdd => {
rdd.foreachPartition(iter => {
val jedis = new Jedis("hadoop12",6379)
jedis.auth("123")
iter.foreach(v=>{
jedis.hset("pv",v._1,v._2.toString)
})
jedis.close()
})
})
//UV: distinct (machine, user) pairs per window
ds.map(v => { //v is a ConsumerRecord; v.value() returns the message payload
val mid: String = JSON.parseObject(v.value()).getString("mid")
val uid: String = JSON.parseObject(v.value()).getString("uid")
(mid,uid)
}).transform(rdd => rdd.distinct()).map(v=>(v._2,1)).reduceByKeyAndWindow((v1:Int,v2:Int)=>v1+v2,Seconds(60),Seconds(10))
.foreachRDD(rdd => {
rdd.foreachPartition(iter => {
val jedis = new Jedis("hadoop12",6379)
jedis.auth("123")
iter.foreach(v=>{
jedis.hset("uv",v._1,v._2.toString)
})
jedis.close()
})
})
ssc.start()
ssc.awaitTermination()
}
}
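The results land in two Redis hashes: "pv" maps each mid to its play count over the window, and "uv" maps each uid to the number of distinct machines it appeared on. A minimal sketch for reading them back with the same Jedis client, host and password as configured above:
val jedis = new Jedis("hadoop12", 6379)
jedis.auth("123")
println(jedis.hgetAll("pv")) //mid -> plays over the last 60 seconds
println(jedis.hgetAll("uv")) //uid -> distinct machines used over the last 60 seconds
jedis.close()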
Common layer
object ConfigUtils {
/**
* ConfigFactory.load() loads application.conf, application.json and application.properties from the classpath by default.
*/
lazy val load: Config = ConfigFactory.load()
val LOCAL_RUN = load.getBoolean("local.run")
val HIVE_METASTORE_URIS = load.getString("hive.metastore.uris")
val HIVE_DATABASE = load.getString("hive.database")
val HDFS_CLIENT_LOG_PATH = load.getString("clientlog.hdfs.path")
val MYSQL_URL = load.getString("mysql.url")
val MYSQL_USER = load.getString("mysql.user")
val MYSQL_PASSWORD = load.getString("mysql.password")
val TOPIC = load.getString("kafka.userloginInfo.topic")
val USER_PLAY_SONG_TOPIC = load.getString("kafka.userplaysong.topic")
val KAFKA_BROKERS = load.getString("kafka.cluster")
val REDIS_HOST = load.getString("redis.host")
val REDIS_PORT = load.getInt("redis.port")
val REDIS_OFFSET_DB = load.getInt("redis.offset.db")
val REDIS_DB = load.getInt("redis.db")
}
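A minimal application.conf matching the keys read above; every value here is a placeholder, not the project's actual configuration:
local.run = true
hive.metastore.uris = "thrift://metastore-host:9083"
hive.database = "songdb"
clientlog.hdfs.path = "/logdata"
mysql.url = "jdbc:mysql://mysql-host:3306/songresult"
mysql.user = "root"
mysql.password = "secret"
kafka.userloginInfo.topic = "userlogininfo"
kafka.userplaysong.topic = "userplaysong"
kafka.cluster = "broker1:9092,broker2:9092"
redis.host = "redis-host"
redis.port = 6379
redis.offset.db = 0
redis.db = 1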
object DateUtils {
/**
* Format a string into "yyyy-MM-dd HH:mm:ss"; on parse failure, fall back to treating the input as epoch milliseconds
* @param stringDate
* @return
*/
def formatDate(stringDate:String):String = {
val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
var formatDate = ""
try{
formatDate = sdf.format(sdf.parse(stringDate))
}catch{
case e:Exception=>{
try{
val bigDecimal = new BigDecimal(stringDate)
val date = new Date(bigDecimal.longValue())
formatDate = sdf.format(date)
}catch{
case e:Exception=>{
formatDate
}
}
}
}
formatDate
}
/**
* Get the date a given number of days before the input date
* @param currentDate yyyy-MM-dd
* @param i how many days to go back
* @return yyyy-MM-dd
*/
def getCurrentDatePreDate(currentDate:String,i:Int) = {
val sdf = new SimpleDateFormat("yyyy-MM-dd")
val date: Date = sdf.parse(currentDate)
val calendar = Calendar.getInstance()
calendar.setTime(date)
calendar.add(Calendar.DATE,-i)
val perDate = calendar.getTime
sdf.format(perDate)
}
}
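Example usage, e.g. the 7-day lookback in GenerateTwUsrBaseinfoD:
DateUtils.getCurrentDatePreDate("2022-03-20", 7) //returns "2022-03-13"
DateUtils.formatDate("1647734400000") //epoch milliseconds are also accepted and formatted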
object StringUtils {
/**
* Return the string itself, or "" when it is null or empty
* @param str
* @return
*/
def checkString(str:String) = {
if(str == null || "".equals(str)) "" else str
}
}
Output file naming - PairRDDMultipleTextOutputFormat
class PairRDDMultipleTextOutputFormat extends MultipleTextOutputFormat[String,String]{
//use the pair's key as the output file name
override def generateFileNameForKeyValue(key: String, value: String, name: String): String = key
//return null so the key itself is not written into the output file
override def generateActualKey(key: String, value: String): String = null
}
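A minimal sketch of the effect; the path and data are hypothetical:
val demo = spark.sparkContext.parallelize(Seq(("EVENT_A", "line1"), ("EVENT_B", "line2")))
demo.saveAsHadoopFile("/tmp/out", classOf[String], classOf[String], classOf[PairRDDMultipleTextOutputFormat])
//writes /tmp/out/EVENT_A and /tmp/out/EVENT_B; because generateActualKey returns null, each file contains only the values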