将ODS层的表TO_SONG_INFO_D清洗到EDS层的TW_SONG_BASEINFO_D,可以使用HQL或SparkSQL完成。清洗过程中会用到自定义函数:UDF函数(写一个类extends UDF并实现其方法)、聚合用的UDAF函数,以及一对多的UDTF函数。本例使用SparkSQL:SparkSQL中有UDF和UDAF函数,但没有UDTF函数;不过有一个算子flatMap就是一对多的,SQL里面还有一个explode函数,完全可以实现一对多。
这里主要使用的是sparkSQL去清洗的,也使用到了sparkSQL的UDF函数。
TO_SONG_INFO_D:
TW_SONG_BASEINFO_D:
回顾sparkSQL:
package test
import org.apache.spark.sql.{DataFrame, SparkSession}
object Test2 {
  def main(args: Array[String]): Unit = {
    // Build a local SparkSession for this demo and keep the console quiet.
    val session: SparkSession = SparkSession.builder().master("local").appName("test").getOrCreate()
    session.sparkContext.setLogLevel("ERROR")
    val names: List[String] = List("zhangsan", "lisi", "wangwu")
    // Converting a local collection to a DataFrame requires the session's implicits.
    import session.implicits._
    val frame: DataFrame = names.toDF("name")
    frame.show()
  }
}
打印结果:
+--------+
| name|
+--------+
|zhangsan|
| lisi|
| wangwu|
+--------+
创建临时表使用SQL语句查询:
// Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("person")
session.sql(
"""
|select * from person
""".stripMargin).show()
DataFrame的相关api复习:
import org.apache.spark.sql.functions._
// Add a column named "age" to the current DataFrame with the literal value 18.
df.withColumn("age",lit(18)).show()
+--------+---+
| name|age|
+--------+---+
|zhangsan| 18|
| lisi| 18|
| wangwu| 18|
+--------+---+
// How to define a UDF: start from a plain Scala function (string -> its length).
val getNameLength = (s:String) => {
s.length
}
// Wrap the plain function as a Spark SQL UDF so it can be applied to columns.
val myudf = udf(getNameLength)
df.withColumn("age",lit(18))
.withColumn("namelen",myudf(col("name")))
.show()
+--------+---+-------+
| name|age|namelen|
+--------+---+-------+
|zhangsan| 18| 8|
| lisi| 18| 4|
| wangwu| 18| 6|
+--------+---+-------+
// Second way: register the UDF by name so it can be used inside SQL statements.
session.udf.register("xxxudf",(s:String)=>{
s.length
})
scala文件:GenerateTwSongBaseinfoD
package com.msbjy.scala.musicproject.eds.content
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import com.msbjy.scala.musicproject.common.{ConfigUtils, DateUtils}
import org.apache.spark.sql.{SaveMode, SparkSession}
import scala.collection.mutable.ListBuffer
/**
 * Generates the TW-layer table TW_SONG_BASEINFO_D.
 * Reads the Hive ODS-layer table TO_SONG_INFO_D, cleans the raw JSON/string
 * columns with Spark SQL UDFs, and overwrites the TW-layer table
 * TW_SONG_BASEINFO_D with the result.
 */
object GenerateTwSongBaseinfoD {
// Run mode and Hive connection settings come from the project config.
val localRun : Boolean = ConfigUtils.LOCAL_RUN
val hiveMetaStoreUris = ConfigUtils.HIVE_METASTORE_URIS
val hiveDataBase = ConfigUtils.HIVE_DATABASE
var sparkSession : SparkSession = _
/**
 * Plain Scala functions below; each is wrapped as a Spark SQL UDF in main().
 */
// Extracts the album name. The ALBUM column is expected to hold a JSON array
// like [{"name":...}]; on parse failure, falls back to a 《...》 title embedded
// in the raw string, or the placeholder "暂无专辑" ("no album") otherwise.
val getAlbumName : String => String = (albumInfo:String) => {
var albumName = ""
try{
val jsonArray: JSONArray = JSON.parseArray(albumInfo)
albumName = jsonArray.getJSONObject(0).getString("name")
}catch{
case e:Exception =>{
// NOTE(review): if 》 occurs before 《 in the input, substring would throw;
// assumed not to occur in the data — confirm against the source table.
if(albumInfo.contains("《")&&albumInfo.contains("》")){
albumName = albumInfo.substring(albumInfo.indexOf('《'),albumInfo.indexOf('》')+1)
}else{
albumName = "暂无专辑"
}
}
}
albumName
}
// Normalizes the release time string via the project DateUtils helper.
val getPostTime : String => String = (postTime:String) =>{
DateUtils.formatDate(postTime)
}
// Extracts a singer's name or id from the SINGER_INFO JSON array.
// `singer` selects "singer1" (element 0) or "singer2" (element 1);
// `nameOrId` selects the "name" or "id" field. Returns "" when the
// requested element is missing or the JSON cannot be parsed.
val getSingerInfo : (String,String,String)=> String = (singerInfos:String,singer:String,nameOrId:String) => {
var singerNameOrSingerID = ""
try{
val jsonArray: JSONArray = JSON.parseArray(singerInfos)
if("singer1".equals(singer)&&"name".equals(nameOrId)&&jsonArray.size()>0){
singerNameOrSingerID = jsonArray.getJSONObject(0).getString("name")
}else if("singer1".equals(singer)&&"id".equals(nameOrId)&&jsonArray.size()>0){
singerNameOrSingerID = jsonArray.getJSONObject(0).getString("id")
}else if("singer2".equals(singer)&&"name".equals(nameOrId)&&jsonArray.size()>1){
singerNameOrSingerID = jsonArray.getJSONObject(1).getString("name")
}else if("singer2".equals(singer)&&"id".equals(nameOrId)&&jsonArray.size()>1){
singerNameOrSingerID = jsonArray.getJSONObject(1).getString("id")
}
}catch{
case e:Exception =>{
// Swallow parse errors deliberately and keep the empty default.
singerNameOrSingerID
}
}
singerNameOrSingerID
}
// Extracts the licensing company name from the AUTH_CO JSON object;
// keeps the default "乐心曲库" (the in-house library) on parse failure.
val getAuthCompany:String => String = (authCompanyInfo :String) =>{
var authCompanyName = "乐心曲库"
try{
val jsonObject: JSONObject = JSON.parseObject(authCompanyInfo)
authCompanyName = jsonObject.getString("name")
}catch{
case e:Exception=>{
// Keep the default on malformed input.
authCompanyName
}
}
authCompanyName
}
// Parses PRDCT_TYPE like "[1.0,2.0]" into a list of ints
// (each element is read as a double, then truncated).
// Returns an empty list for blank input.
val getPrdctType : (String =>ListBuffer[Int]) = (productTypeInfo :String) => {
val list = new ListBuffer[Int]()
if(!"".equals(productTypeInfo.trim)){
val strings = productTypeInfo.stripPrefix("[").stripSuffix("]").split(",")
strings.foreach(t=>{
list.append(t.toDouble.toInt)
})
}
list
}
def main(args: Array[String]): Unit = {
if(localRun){// local run
sparkSession = SparkSession.builder().master("local").config("hive.metastore.uris",hiveMetaStoreUris).enableHiveSupport().getOrCreate()
}else{// cluster run
sparkSession = SparkSession.builder().enableHiveSupport().getOrCreate()
}
import org.apache.spark.sql.functions._ // brings in the udf, col, lit helpers
// Wrap the plain functions above as Spark SQL UDFs.
val udfGetAlbumName = udf(getAlbumName)
val udfGetPostTime = udf(getPostTime)
val udfGetSingerInfo = udf(getSingerInfo)
val udfGetAuthCompany = udf(getAuthCompany)
val udfGetPrdctType = udf(getPrdctType)
sparkSession.sql(s"use $hiveDataBase ")
sparkSession.table("TO_SONG_INFO_D")
// Read each raw column and replace it in place with the cleaned value.
.withColumn("ALBUM",udfGetAlbumName(col("ALBUM")))
.withColumn("POST_TIME",udfGetPostTime(col("POST_TIME")))
.withColumn("SINGER1",udfGetSingerInfo(col("SINGER_INFO"),lit("singer1"),lit("name")))
.withColumn("SINGER1ID",udfGetSingerInfo(col("SINGER_INFO"),lit("singer1"),lit("id")))
.withColumn("SINGER2",udfGetSingerInfo(col("SINGER_INFO"),lit("singer2"),lit("name")))
.withColumn("SINGER2ID",udfGetSingerInfo(col("SINGER_INFO"),lit("singer2"),lit("id")))
.withColumn("AUTH_CO",udfGetAuthCompany(col("AUTH_CO")))
.withColumn("PRDCT_TYPE",udfGetPrdctType(col("PRDCT_TYPE")))
.createTempView("TEMP_TO_SONG_INFO_D")
/**
 * Select the cleaned columns (dropping rows with an empty song id NBR,
 * falling back to OTHER_NAME when NAME is null, and nulling out empty
 * PRDCT_TYPE arrays) and overwrite the Hive table TW_SONG_BASEINFO_D.
 */
sparkSession.sql(
"""
| select NBR,
| nvl(NAME,OTHER_NAME) as NAME,
| SOURCE,
| ALBUM,
| PRDCT,
| LANG,
| VIDEO_FORMAT,
| DUR,
| SINGER1,
| SINGER2,
| SINGER1ID,
| SINGER2ID,
| 0 as MAC_TIME,
| POST_TIME,
| PINYIN_FST,
| PINYIN,
| SING_TYPE,
| ORI_SINGER,
| LYRICIST,
| COMPOSER,
| BPM_VAL,
| STAR_LEVEL,
| VIDEO_QLTY,
| VIDEO_MK,
| VIDEO_FTUR,
| LYRIC_FTUR,
| IMG_QLTY,
| SUBTITLES_TYPE,
| AUDIO_FMT,
| ORI_SOUND_QLTY,
| ORI_TRK,
| ORI_TRK_VOL,
| ACC_VER,
| ACC_QLTY,
| ACC_TRK_VOL,
| ACC_TRK,
| WIDTH,
| HEIGHT,
| VIDEO_RSVL,
| SONG_VER,
| AUTH_CO,
| STATE,
| case when size(PRDCT_TYPE) =0 then NULL else PRDCT_TYPE end as PRDCT_TYPE
| from TEMP_TO_SONG_INFO_D
| where NBR != ''
""".stripMargin).write.format("Hive").mode(SaveMode.Overwrite).saveAsTable("TW_SONG_BASEINFO_D")
println("**** all finished ****")
}
}
scala文件:GenerateTwSongFturD
package com.msbjy.scala.musicproject.eds.content
import java.util.Properties
import com.msbjy.scala.musicproject.common.{ConfigUtils, DateUtils}
import org.apache.spark.sql.SparkSession
/**
 * Produces the TW-layer table TW_SONG_FTUR_D.
 * Joins TW_SONG_BASEINFO_D with TO_CLIENT_SONG_PLAY_OPERATE_REQ_D to build
 * per-song play statistics (daily, 7-day and 30-day windows) and writes them
 * into the date partition of tw_song_ftur_d.
 */
object GenerateTwSongFturD {
private val localRun : Boolean = ConfigUtils.LOCAL_RUN
private val hiveMetaStoreUris = ConfigUtils.HIVE_METASTORE_URIS
private val hiveDataBase = ConfigUtils.HIVE_DATABASE
private var sparkSession : SparkSession = _
def main(args: Array[String]): Unit = {
// Requires one argument: the analysis date in yyyyMMdd form.
if(args.length < 1) {
println(s"请输入数据日期,格式例如:年月日(20201231)")
System.exit(1)
}
if(localRun){
sparkSession = SparkSession.builder().master("local")
.appName("Generate_TW_Song_Ftur_D")
.config("spark.sql.shuffle.partitions","1")
.config("hive.metastore.uris",hiveMetaStoreUris)
.enableHiveSupport()
.getOrCreate()
sparkSession.sparkContext.setLogLevel("Error")
}else{
sparkSession = SparkSession.builder().appName("Generate_TW_Song_Ftur_D").enableHiveSupport().getOrCreate()
}
// Input data date, format: yyyyMMdd
val analyticDate = args(0)
// Date 7 days before the input date
val per7Date = DateUtils.getCurrentDatePreDate(analyticDate,7)
// Date 30 days before the input date
val per30Date = DateUtils.getCurrentDatePreDate(analyticDate,30)
println(s"输入的日期为 :${analyticDate}")
println(s"per7Date = ${per7Date}")
println(s"per30Date = ${per30Date}")
sparkSession.sql(s"use $hiveDataBase ")
/**
 * Per-song statistics for the current day, computed from the
 * TO_CLIENT_SONG_PLAY_OPERATE_REQ_D rows of the input date.
 */
sparkSession.sql(
s"""
| select
| songid as NBR, --歌曲ID
| count(*) as SING_CNT, --当日点唱量
| 0 as SUPP_CNT , --当日点赞量
| count(distinct uid) as USR_CNT, --当日点唱用户数
| count(distinct order_id) as ORDR_CNT --当日点唱订单数
| from TO_CLIENT_SONG_PLAY_OPERATE_REQ_D
| where data_dt = ${analyticDate}
| group by songid
""".stripMargin).createTempView("currentDayTable")
/**
 * Per-song statistics over the last 7 days.
 */
sparkSession.sql(
s"""
| select
| songid as NBR, --歌曲ID
| count(*) as RCT_7_SING_CNT, --近七天点唱量
| 0 as RCT_7_SUPP_CNT , --近七天点赞量
| count(distinct uid) as RCT_7_USR_CNT, --近七天点唱用户数
| count(distinct order_id) as RCT_7_ORDR_CNT --近七天点唱订单数
| from to_client_song_play_operate_req_d
| where ${per7Date}<= data_dt and data_dt <= ${analyticDate}
| group by songid
""".stripMargin).createTempView("pre7DayTable")
/**
 * Per-song statistics over the last 30 days.
 */
sparkSession.sql(
s"""
| select
| songid as NBR, --歌曲ID
| count(*) as RCT_30_SING_CNT, --近三十天点唱量
| 0 as RCT_30_SUPP_CNT , --近三十天点赞量
| count(distinct uid) as RCT_30_USR_CNT, --近三十天点唱用户数
| count(distinct order_id) as RCT_30_ORDR_CNT --近三十天点唱订单数
| from to_client_song_play_operate_req_d
| where ${per30Date}<= data_dt and data_dt <= ${analyticDate}
| group by songid
""".stripMargin).createTempView("pre30DayTable")
/**
 * From the historical TW_SONG_FTUR_D partitions, the highest daily play
 * and like counts per song within the last 7-day and 30-day windows.
 */
sparkSession.sql(
s"""
| select
| NBR, --歌曲ID
| max(case when DATA_DT BETWEEN ${per7Date} and ${analyticDate} then SING_CNT else 0 end) as RCT_7_TOP_SING_CNT, --近七天最高日点唱量
| max(case when DATA_DT BETWEEN ${per7Date} and ${analyticDate} then SUPP_CNT else 0 end) as RCT_7_TOP_SUPP_CNT, --近七天最高日点赞量
| max(SING_CNT) as RCT_30_TOP_SING_CNT, --近三十天最高日点唱量
| max(SUPP_CNT) as RCT_30_TOP_SUPP_CNT --近三十天最高日点赞量
| from TW_SONG_FTUR_D
| where DATA_DT BETWEEN ${per30Date} and ${analyticDate}
| group by NBR
""".stripMargin).createTempView("pre7And30DayInfoTable")
// Join the daily stats with the song base info and the windowed stats;
// missing window values default to 0 via nvl.
sparkSession.sql(
s"""
|select
| A.NBR, --歌曲编号
| B.NAME, --歌曲名称
| B.SOURCE, --来源
| B.ALBUM, --所属专辑
| B.PRDCT, --发行公司
| B.LANG, --语言
| B.VIDEO_FORMAT, --视频风格
| B.DUR, --时长
| B.SINGER1, --歌手1
| B.SINGER2, --歌手2
| B.SINGER1ID, --歌手1ID
| B.SINGER2ID, --歌手2ID
| B.MAC_TIME, --加入机器时间
| A.SING_CNT, --当日点唱量
| A.SUPP_CNT, --当日点赞量
| A.USR_CNT, --当日点唱用户数
| A.ORDR_CNT, --当日点唱订单数
| nvl(C.RCT_7_SING_CNT,0) as RCT_7_SING_CNT, --近7天点唱量
| nvl(C.RCT_7_SUPP_CNT,0) as RCT_7_SUPP_CNT, --近7天点赞量
| nvl(E.RCT_7_TOP_SING_CNT,0) as RCT_7_TOP_SING_CNT, --近7天最高点唱量
| nvl(E.RCT_7_TOP_SUPP_CNT,0) as RCT_7_TOP_SUPP_CNT, --近7天最高点赞量
| nvl(C.RCT_7_USR_CNT,0) as RCT_7_USR_CNT, --近7天点唱用户数
| nvl(C.RCT_7_ORDR_CNT,0) as RCT_7_ORDR_CNT, --近7天点唱订单数
| nvl(D.RCT_30_SING_CNT,0) as RCT_30_SING_CNT, --近30天点唱量
| nvl(D.RCT_30_SUPP_CNT,0) as RCT_30_SUPP_CNT, --近30天点赞量
| nvl(E.RCT_30_TOP_SING_CNT,0) as RCT_30_TOP_SING_CNT, --近30天最高点唱量
| nvl(E.RCT_30_TOP_SUPP_CNT,0) as RCT_30_TOP_SUPP_CNT, --近30天最高点赞量
| nvl(D.RCT_30_USR_CNT,0) as RCT_30_USR_CNT, --近30天点唱用户数
| nvl(D.RCT_30_ORDR_CNT,0) as RCT_30_ORDR_CNT --近30天点唱订单数
|from
| currentDayTable A
| JOIN TW_SONG_BASEINFO_D B ON A.NBR = B.NBR
| LEFT JOIN pre7DayTable C ON A.NBR = C.NBR
| LEFT JOIN pre30DayTable D ON A.NBR = D.NBR
| LEFT JOIN pre7And30DayInfoTable E ON A.NBR = E.NBR
""".stripMargin).createTempView("result")
// Overwrite only the partition for the analysis date.
sparkSession.sql(
s"""
| insert overwrite table tw_song_ftur_d partition(data_dt=${analyticDate}) select * from result
""".stripMargin)
println("**** all finished ****")
}
}