1. HiveUtil
package com.likang.lixian.util

import org.apache.spark.sql.SparkSession

object HiveUtil {

  /**
   * Raise the dynamic-partition limits.
   *
   * @param spark
   */
  def setMaxpartitions(spark: SparkSession) = {
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql("set hive.exec.max.dynamic.partitions=100000")
    spark.sql("set hive.exec.max.dynamic.partitions.pernode=100000")
    spark.sql("set hive.exec.max.created.files=100000")
  }

  /**
   * Enable output compression.
   *
   * @param spark
   */
  def openCompression(spark: SparkSession) = {
    spark.sql("set mapred.output.compress=true")
    spark.sql("set hive.exec.compress.output=true")
  }

  /**
   * Enable dynamic partitioning in nonstrict mode.
   *
   * @param spark
   */
  def openDynamicPartition(spark: SparkSession) = {
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
  }

  /**
   * Use LZO compression.
   *
   * @param spark
   */
  def useLzoCompression(spark: SparkSession) = {
    spark.sql("set io.compression.codec.lzo.class=com.hadoop.compression.lzo.LzoCodec")
    spark.sql("set mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec")
  }

  /**
   * Use Snappy compression.
   *
   * @param spark
   */
  def useSnappyCompression(spark: SparkSession) = {
    spark.sql("set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec")
    spark.sql("set mapreduce.output.fileoutputformat.compress=true")
    spark.sql("set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec")
  }
}
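These helpers simply issue Hive `set` statements through the active session. As a quick sanity check (a sketch, not part of the original project), running `set <key>` with no value echoes the current setting back as a (key, value) row:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("hive-util-check").master("local[*]")
  .enableHiveSupport().getOrCreate()
HiveUtil.openDynamicPartition(spark)
// "set <key>" without "=value" returns the current value of that setting
spark.sql("set hive.exec.dynamic.partition.mode").show(truncate = false)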
2. JSON utility (ParseJsonData)
package com.likang.lixian.util;

import com.alibaba.fastjson.JSONObject;

public class ParseJsonData {

    /**
     * Parses a JSON string; returns null instead of throwing when the input is not valid JSON.
     */
    public static JSONObject getJsonData(String data) {
        try {
            return JSONObject.parseObject(data);
        } catch (Exception e) {
            return null;
        }
    }
}
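The point of swallowing the exception is the null-on-failure contract: the Spark jobs below can drop malformed records with a plain null / isInstanceOf check instead of wrapping every parse in try/catch. A minimal usage sketch (the sample strings are illustrative):

val ok = ParseJsonData.getJsonData("""{"adid":1,"adname":"banner","dn":"webA"}""")
if (ok != null) println(ok.getIntValue("adid")) // prints 1
println(ParseJsonData.getJsonData("not json"))  // prints null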
3. DWD layer (DwdMemberController)
object DwdMemberController {
  def main(args: Array[String]): Unit = {
    // System.setProperty("HADOOP_USER_NAME", "root")
    val sparkConf = new SparkConf().setAppName("dwd_member_import").setMaster("local[*]")
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    val ssc = sparkSession.sparkContext
    HiveUtil.openDynamicPartition(sparkSession) // enable dynamic partitioning
    HiveUtil.openCompression(sparkSession) // enable compression
    HiveUtil.useSnappyCompression(sparkSession) // use Snappy compression
    // Clean the raw user data and load it into the DWD-layer tables
    EtlDataService.etlBaseAdLog(ssc, sparkSession) // load base ad data
    EtlDataService.etlBaseWebSiteLog(ssc, sparkSession) // load base website data
    EtlDataService.etlMemberLog(ssc, sparkSession) // clean member data
    EtlDataService.etlMemberRegtypeLog(ssc, sparkSession) // clean member registration data
    EtlDataService.etlMemPayMoneyLog(ssc, sparkSession) // load member payment records
    EtlDataService.etlMemVipLevelLog(ssc, sparkSession) // load base VIP-level data
  }
}
EtlDataService (excerpt): only etlBaseAdLog is shown here; the other etl* methods follow the same pattern.

/**
 * Load the base ad-table data.
 *
 * @param ssc
 * @param sparkSession
 */
def etlBaseAdLog(ssc: SparkContext, sparkSession: SparkSession) = {
  import sparkSession.implicits._ // implicit conversions (toDF)
  ssc.textFile("hdfs://node01:9000/user/shucang/ods/baseadlog.log").filter(item => {
    val obj = ParseJsonData.getJsonData(item)
    obj.isInstanceOf[JSONObject] // getJsonData returns null for bad records; null.isInstanceOf is false, so they are dropped
  }).mapPartitions(partition => {
    partition.map(item => {
      val jsonObject = ParseJsonData.getJsonData(item)
      val adid = jsonObject.getIntValue("adid")
      val adname = jsonObject.getString("adname")
      val dn = jsonObject.getString("dn")
      (adid, adname, dn)
    })
  }).toDF().coalesce(1).write.mode(SaveMode.Overwrite).insertInto("dwd.dwd_base_ad")
}
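As a hedged sketch of what one of the omitted methods might look like, here is a possible etlBaseWebSiteLog, assuming the site JSON carries the fields that importMember later reads from dwd.dwd_base_website (siteid, sitename, siteurl, delete, createtime, creator, dn); the ODS path and the field types are assumptions:

def etlBaseWebSiteLog(ssc: SparkContext, sparkSession: SparkSession) = {
  import sparkSession.implicits._
  ssc.textFile("hdfs://node01:9000/user/shucang/ods/basewebsite.log") // hypothetical path
    .filter(item => ParseJsonData.getJsonData(item) != null) // drop malformed JSON lines
    .mapPartitions(partition => {
      partition.map(item => {
        val obj = ParseJsonData.getJsonData(item)
        (obj.getIntValue("siteid"), obj.getString("sitename"), obj.getString("siteurl"),
          obj.getIntValue("delete"), obj.getString("createtime"), obj.getString("creator"),
          obj.getString("dn"))
      })
    }).toDF().coalesce(1).write.mode(SaveMode.Overwrite).insertInto("dwd.dwd_base_website")
}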
4. DWS layer (DwsMemberController)
object DwsMemberController {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    val sparkConf = new SparkConf().setAppName("dws_member_import").setMaster("local[*]")
    // .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // .registerKryoClasses(Array(classOf[DwsMember]))
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    val ssc = sparkSession.sparkContext
    HiveUtil.openDynamicPartition(sparkSession) // enable dynamic partitioning
    HiveUtil.openCompression(sparkSession) // enable compression
    HiveUtil.useSnappyCompression(sparkSession) // use Snappy compression
    DwsMemberService.importMember(sparkSession, "20190722") // aggregate member data into the wide table
    // DwsMemberService.importMemberUseApi(sparkSession, "20190722")
  }
}
5. DWS service layer (DwsMemberService)
object DwsMemberService {

  def importMemberUseApi(sparkSession: SparkSession, dt: String) = {
    import sparkSession.implicits._ // implicit conversions
    val dwdMember = DwdMemberDao.getDwdMember(sparkSession).where(s"dt='${dt}'") // driving table: members
    val dwdMemberRegtype = DwdMemberDao.getDwdMemberRegType(sparkSession)
    val dwdBaseAd = DwdMemberDao.getDwdBaseAd(sparkSession)
    val dwdBaseWebsite = DwdMemberDao.getDwdBaseWebSite(sparkSession)
    val dwdPcentermemPaymoney = DwdMemberDao.getDwdPcentermemPayMoney(sparkSession)
    val dwdVipLevel = DwdMemberDao.getDwdVipLevel(sparkSession)
    // import org.apache.spark.sql.functions.broadcast
    val result = dwdMember.join(dwdMemberRegtype, Seq("uid", "dn"), "left_outer")
      .join(dwdBaseAd, Seq("ad_id", "dn"), "left_outer")
      .join(dwdBaseWebsite, Seq("siteid", "dn"), "left_outer")
      .join(dwdPcentermemPaymoney, Seq("uid", "dn"), "left_outer")
      .join(dwdVipLevel, Seq("vip_id", "dn"), "left_outer")
      .select("uid", "ad_id", "fullname", "iconurl", "lastlogin", "mailaddr", "memberlevel", "password",
        "paymoney", "phone", "qq", "register", "regupdatetime", "unitname", "userip", "zipcode", "appkey",
        "appregurl", "bdp_uuid", "reg_createtime", "domain", "isranreg", "regsource", "regsourcename", "adname",
        "siteid", "sitename", "siteurl", "site_delete", "site_createtime", "site_creator", "vip_id", "vip_level",
        "vip_start_time", "vip_end_time", "vip_last_modify_time", "vip_max_free", "vip_min_free", "vip_next_level",
        "vip_operator", "dt", "dn").as[DwsMember]
    val resultData = result.groupByKey(item => item.uid + "_" + item.dn)
      .mapGroups { case (key, iters) =>
        val keys = key.split("_")
        val uid = Integer.parseInt(keys(0))
        val dn = keys(1)
        val dwsMembers = iters.toList
        val paymoney = dwsMembers.filter(_.paymoney != null).map(_.paymoney).reduceOption(_ + _).getOrElse(BigDecimal(0.00)).toString
        val ad_id = dwsMembers.map(_.ad_id).head
        val fullname = dwsMembers.map(_.fullname).head
        val iconurl = dwsMembers.map(_.iconurl).head
        val lastlogin = dwsMembers.map(_.lastlogin).head
        val mailaddr = dwsMembers.map(_.mailaddr).head
        val memberlevel = dwsMembers.map(_.memberlevel).head
        val password = dwsMembers.map(_.password).head
        // The original excerpt was truncated here; the vip_* and dt values below are filled in
        // to mirror the aggregations of the SQL variant (max for times/levels, first for the rest)
        val vip_end_time = dwsMembers.map(_.vip_end_time).max
        val vip_last_modify_time = dwsMembers.map(_.vip_last_modify_time).max
        val vip_max_free = dwsMembers.map(_.vip_max_free).head
        val vip_min_free = dwsMembers.map(_.vip_min_free).head
        val vip_next_level = dwsMembers.map(_.vip_next_level).max
        val vip_operator = dwsMembers.map(_.vip_operator).head
        val dt = dwsMembers.map(_.dt).head
        DwsMember_Result(uid, ad_id, fullname, iconurl, lastlogin, mailaddr, memberlevel, password, paymoney,
          vip_end_time, vip_last_modify_time, vip_max_free, vip_min_free, vip_next_level, vip_operator, dt, dn)
      }
    resultData.show()
  }
  def importMember(sparkSession: SparkSession, time: String) = {
    import sparkSession.implicits._ // implicit conversions
    // Query the full data set and refresh the wide table
    sparkSession.sql(s"select uid,first(ad_id),first(fullname),first(iconurl),first(lastlogin)," +
      "first(mailaddr),first(memberlevel),first(password),sum(cast(paymoney as decimal(10,4))),first(phone),first(qq)," +
      "first(register),first(regupdatetime),first(unitname),first(userip),first(zipcode)," +
      "first(appkey),first(appregurl),first(bdp_uuid),first(reg_createtime),first(domain)," +
      "first(isranreg),first(regsource),first(regsourcename),first(adname),first(siteid),first(sitename)," +
      "first(siteurl),first(site_delete),first(site_createtime),first(site_creator),first(vip_id),max(vip_level)," +
      "min(vip_start_time),max(vip_end_time),max(vip_last_modify_time),first(vip_max_free),first(vip_min_free),max(vip_next_level)," +
      "first(vip_operator),dt,dn from " +
      "(select a.uid,a.ad_id,a.fullname,a.iconurl,a.lastlogin,a.mailaddr,a.memberlevel," +
      "a.password,e.paymoney,a.phone,a.qq,a.register,a.regupdatetime,a.unitname,a.userip," +
      "a.zipcode,a.dt,b.appkey,b.appregurl,b.bdp_uuid,b.createtime as reg_createtime,b.domain,b.isranreg,b.regsource," +
      "b.regsourcename,c.adname,d.siteid,d.sitename,d.siteurl,d.delete as site_delete,d.createtime as site_createtime," +
      "d.creator as site_creator,f.vip_id,f.vip_level,f.start_time as vip_start_time,f.end_time as vip_end_time," +
      "f.last_modify_time as vip_last_modify_time,f.max_free as vip_max_free,f.min_free as vip_min_free," +
      "f.next_level as vip_next_level,f.operator as vip_operator,a.dn " +
      s"from dwd.dwd_member a left join dwd.dwd_member_regtype b on a.uid=b.uid " +
      "and a.dn=b.dn left join dwd.dwd_base_ad c on a.ad_id=c.adid and a.dn=c.dn left join " +
      "dwd.dwd_base_website d on b.websiteid=d.siteid and b.dn=d.dn left join dwd.dwd_pcentermempaymoney e" +
      s" on a.uid=e.uid and a.dn=e.dn left join dwd.dwd_vip_level f on e.vip_id=f.vip_id and e.dn=f.dn where a.dt='${time}') r " +
      "group by uid,dn,dt").coalesce(1).write.mode(SaveMode.Overwrite).insertInto("dws.dws_member")
    // Query the current day's incremental data
    val dayResult = sparkSession.sql(s"select a.uid,sum(cast(a.paymoney as decimal(10,4))) as paymoney,max(b.vip_level) as vip_level," +
      s"from_unixtime(unix_timestamp('$time','yyyyMMdd'),'yyyy-MM-dd') as start_time,'9999-12-31' as end_time,first(a.dn) as dn " +
      "from dwd.dwd_pcentermempaymoney a join " +
      s"dwd.dwd_vip_level b on a.vip_id=b.vip_id and a.dn=b.dn where a.dt='$time' group by uid").as[MemberZipper]
    // Query the historical zipper-table (SCD2) data
    val historyResult = sparkSession.sql("select * from dws.dws_member_zipper").as[MemberZipper]
    // Union the two data sets, group by user id, and rewrite end_time
    dayResult.union(historyResult).groupByKey(item => item.uid + "_" + item.dn)
      // mapGroups regroups the data as (key, iterator)
      .mapGroups { case (key, iters) =>
        val keys = key.split("_")
        val uid = keys(0)
        val dn = keys(1)
        val list = iters.toList.sortBy(item => item.start_time) // sort by start_time
        if (list.size > 1 && "9999-12-31".equals(list(list.size - 2).end_time)) {
          // If history exists, close out its still-open record by rewriting end_time
          // the last record of the history
          val oldLastModel = list(list.size - 2)
          // the current day's (latest) record
          val lastModel = list(list.size - 1)
          oldLastModel.end_time = lastModel.start_time
          lastModel.paymoney = (BigDecimal(lastModel.paymoney) + BigDecimal(oldLastModel.paymoney)).toString()
        }
        MemberZipperResult(list)
      }.flatMap(_.list).coalesce(3).write.mode(SaveMode.Overwrite).insertInto("dws.dws_member_zipper") // flatten the wrapped lists and refresh the zipper table
  }
}
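Two things are worth spelling out about this service. First, the MemberZipper beans are not shown in the excerpt; a plausible sketch matching the columns selected above (the field types are assumptions, and end_time/paymoney must be var because mapGroups mutates them in place):

case class MemberZipper(
    uid: String,
    var paymoney: String,
    vip_level: String,
    start_time: String,
    var end_time: String,
    dn: String)

case class MemberZipperResult(list: List[MemberZipper])

Second, groupByKey + mapGroups pulls every row of a group onto one executor as a plain iterator, which is why the code can sort and mutate the group as a Scala List. A toy standalone sketch of the same pattern (data and names are illustrative):

import sparkSession.implicits._
val pays = Seq((1, "webA", "10.5"), (1, "webA", "4.5"), (2, "webA", "7.0")).toDS()
pays.groupByKey { case (uid, dn, _) => uid + "_" + dn }
  .mapGroups { (key, rows) =>
    // sum the String amounts with BigDecimal, as importMemberUseApi does for paymoney
    val total = rows.map(r => BigDecimal(r._3)).reduceOption(_ + _).getOrElse(BigDecimal(0))
    (key, total.toString)
  }
  .show()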
6. ADS controller (AdsMemberController)
object AdsMemberController {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    val sparkConf = new SparkConf().setAppName("ads_member_controller").setMaster("local[*]")
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    val ssc = sparkSession.sparkContext
    HiveUtil.openDynamicPartition(sparkSession) // enable dynamic partitioning
    AdsMemberService.queryDetailApi(sparkSession, "20190722")
    AdsMemberService.queryDetailSql(sparkSession, "20190722")
  }
}
7. ADS service (AdsMemberService)
object AdsMemberService {

  /**
   * Compute the metrics with the Dataset API.
   *
   * @param sparkSession
   */
  def queryDetailApi(sparkSession: SparkSession, dt: String) = {
    import sparkSession.implicits._ // implicit conversions
    val result: Dataset[QueryResult] = DwsMemberDao.queryIdlMemberData(sparkSession).as[QueryResult].where(s"dt='${dt}'")
    result.cache()
    // Count registrations per registration-source URL
    // (the "_"-joined key assumes the key fields themselves contain no underscore)
    result.mapPartitions(partition => {
      partition.map(item => (item.appregurl + "_" + item.dn + "_" + item.dt, 1))
    }).groupByKey(_._1)
      .mapValues(_._2).reduceGroups(_ + _)
      .map(item => {
        val keys = item._1.split("_")
        val appregurl = keys(0)
        val dn = keys(1)
        val dt = keys(2)
        (appregurl, item._2, dt, dn)
      }).toDF().coalesce(1).write.mode(SaveMode.Overwrite).insertInto("ads.ads_register_appregurlnum")
    // Count registrations per site
    result.mapPartitions(partition => {
      partition.map(item => (item.sitename + "_" + item.dn + "_" + item.dt, 1))
    }).groupByKey(_._1).mapValues(_._2).reduceGroups(_ + _)
      .map(item => {
        val keys = item._1.split("_")
        val sitename = keys(0)
        val dn = keys(1)
        val dt = keys(2)
        (sitename, item._2, dt, dn)
      }).toDF().coalesce(1).write.mode(SaveMode.Overwrite).insertInto("ads.ads_register_sitenamenum")
    // Count registrations per source: pc, mobile, wechat, app
    result.mapPartitions(partition => {
      partition.map(item => (item.regsourcename + "_" + item.dn + "_" + item.dt, 1))
    }).groupByKey(_._1).mapValues(_._2).reduceGroups(_ + _)
      .map(item => {
        val keys = item._1.split("_")
        val regsourcename = keys(0)
        val dn = keys(1)
        val dt = keys(2)
        (regsourcename, item._2, dt, dn)
      }).toDF().coalesce(1).write.mode(SaveMode.Overwrite).insertInto("ads.ads_register_regsourcenamenum")
    // Count registrations per ad
    result.mapPartitions(partition => {
      partition.map(item => (item.adname + "_" + item.dn + "_" + item.dt, 1))
    }).groupByKey(_._1).mapValues(_._2).reduceGroups(_ + _)
      .map(item => {
        val keys = item._1.split("_")
        val adname = keys(0)
        val dn = keys(1)
        val dt = keys(2)
        (adname, item._2, dt, dn)
      }).toDF().coalesce(1).write.mode(SaveMode.Overwrite).insertInto("ads.ads_register_adnamenum")
    // Count members per member level
    result.mapPartitions(partition => {
      partition.map(item => (item.memberlevel + "_" + item.dn + "_" + item.dt, 1))
    }).groupByKey(_._1).mapValues(_._2).reduceGroups(_ + _)
      .map(item => {
        val keys = item._1.split("_")
        val memberlevel = keys(0)
        val dn = keys(1)
        val dt = keys(2)
        (memberlevel, item._2, dt, dn)
      }).toDF().coalesce(1).write.mode(SaveMode.Overwrite).insertInto("ads.ads_register_memberlevelnum")
    // Count members per VIP level
    result.mapPartitions(partition => {
      partition.map(item => (item.vip_level + "_" + item.dn + "_" + item.dt, 1))
    }).groupByKey(_._1).mapValues(_._2).reduceGroups(_ + _)
      .map(item => {
        val keys = item._1.split("_")
        val vip_level = keys(0)
        val dn = keys(1)
        val dt = keys(2)
        (vip_level, item._2, dt, dn)
      }).toDF().coalesce(1).write.mode(SaveMode.Overwrite).insertInto("ads.ads_register_viplevelnum")
    // Top 3 paying users per memberlevel
    import org.apache.spark.sql.expressions.Window
    import org.apache.spark.sql.functions._
    result.withColumn("rownum", row_number().over(Window.partitionBy("dn", "memberlevel").orderBy(desc("paymoney"))))
      .where("rownum<4").orderBy("memberlevel", "rownum")
      .select("uid", "memberlevel", "register", "appregurl", "regsourcename", "adname",
        "sitename", "vip_level", "paymoney", "rownum", "dt", "dn")
      .coalesce(1).write.mode(SaveMode.Overwrite).insertInto("ads.ads_register_top3memberpay")
  }
  /**
   * Compute the same metrics with SQL.
   *
   * @param sparkSession
   */
  def queryDetailSql(sparkSession: SparkSession, dt: String) = {
    val appregurlCount = DwsMemberDao.queryAppregurlCount(sparkSession, dt)
    val siteNameCount = DwsMemberDao.querySiteNameCount(sparkSession, dt)
    val regsourceNameCount = DwsMemberDao.queryRegsourceNameCount(sparkSession, dt)
    val adNameCount = DwsMemberDao.queryAdNameCount(sparkSession, dt)
    val memberLevelCount = DwsMemberDao.queryMemberLevelCount(sparkSession, dt)
    val vipLevelCount = DwsMemberDao.queryVipLevelCount(sparkSession, dt)
    val top3MemberLevelPayMoneyUser = DwsMemberDao.getTop3MemberLevelPayMoneyUser(sparkSession, dt)
  }
}
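queryDetailApi relies on a QueryResult bean to type the wide-table rows. It is not included in the excerpt; a plausible sketch whose field names follow the queryIdlMemberData select list below (the types are assumptions):

case class QueryResult(
    uid: String,
    ad_id: String,
    memberlevel: String,
    register: String,
    appregurl: String,
    regsource: String,
    regsourcename: String,
    adname: String,
    siteid: String,
    sitename: String,
    vip_level: String,
    paymoney: BigDecimal,
    dt: String,
    dn: String)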
8. ADS SQL queries (DwsMemberDao)
object DwsMemberDao {

  /**
   * Query the member wide-table data.
   *
   * @param sparkSession
   * @return
   */
  def queryIdlMemberData(sparkSession: SparkSession) = {
    val frame: DataFrame = sparkSession.sql("select uid,ad_id,memberlevel,register,appregurl,regsource,regsourcename,adname," +
      "siteid,sitename,vip_level,cast(paymoney as decimal(10,4)) as paymoney,dt,dn from dws.dws_member")
    frame
  }

  /**
   * Count registrations per registration-source URL.
   *
   * @param sparkSession
   */
  def queryAppregurlCount(sparkSession: SparkSession, dt: String) = {
    sparkSession.sql(s"select appregurl,count(uid),dn,dt from dws.dws_member where dt='${dt}' group by appregurl,dn,dt")
  }

  // Count registrations per site
  def querySiteNameCount(sparkSession: SparkSession, dt: String) = {
    sparkSession.sql(s"select sitename,count(uid),dn,dt from dws.dws_member where dt='${dt}' group by sitename,dn,dt")
  }

  // Count registrations per source
  def queryRegsourceNameCount(sparkSession: SparkSession, dt: String) = {
    sparkSession.sql(s"select regsourcename,count(uid),dn,dt from dws.dws_member where dt='${dt}' group by regsourcename,dn,dt")
  }

  // Count registrations per ad
  def queryAdNameCount(sparkSession: SparkSession, dt: String) = {
    sparkSession.sql(s"select adname,count(uid),dn,dt from dws.dws_member where dt='${dt}' group by adname,dn,dt")
  }

  // Count members per member level
  def queryMemberLevelCount(sparkSession: SparkSession, dt: String) = {
    sparkSession.sql(s"select memberlevel,count(uid),dn,dt from dws.dws_member where dt='${dt}' group by memberlevel,dn,dt")
  }

  // Count members per VIP level
  def queryVipLevelCount(sparkSession: SparkSession, dt: String) = {
    sparkSession.sql(s"select vip_level,count(uid),dn,dt from dws.dws_member where dt='${dt}' group by vip_level,dn,dt")
  }

  // Top 3 paying users per memberlevel
  def getTop3MemberLevelPayMoneyUser(sparkSession: SparkSession, dt: String) = {
    sparkSession.sql("select * from(select uid,ad_id,memberlevel,register,appregurl,regsource" +
      ",regsourcename,adname,siteid,sitename,vip_level,cast(paymoney as decimal(10,4)) as paymoney,row_number() over" +
      s" (partition by dn,memberlevel order by cast(paymoney as decimal(10,4)) desc) as rownum,dn from dws.dws_member where dt='${dt}') t" +
      " where rownum<4 order by memberlevel,rownum")
  }
}
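One caveat on queryDetailSql: unlike the API variant, it only builds the DataFrames and never persists or prints them. If the intent is to land the SQL results in the same ads tables, each query would need a write along these lines (a sketch; the target table follows the API variant, and since insertInto maps columns by position, the select order may need dt and dn swapped to match the table schema):

DwsMemberDao.queryAppregurlCount(sparkSession, dt)
  .coalesce(1).write.mode(SaveMode.Overwrite)
  .insertInto("ads.ads_register_appregurlnum")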