数据来源有三张表:
user_visit_action,描述用户点击信息,存储在hive中
模拟数据如下:有效信息date,点击时间,city_id城市,order_product_id用户点击商品的id
-
date,user_id,session_id,page_id,action,city_id,search_keywords,order_product_id
-
2018/5/1,,,,,010,,1011
-
2018/5/1,,,,,010,,1011
-
2018/5/1,,,,,010,,1011
-
2018/5/1,,,,,010,,1011
-
2018/5/1,,,,,011,,1011
-
2018/5/1,,,,,011,,1011
-
2018/5/2,,,,,011,,
-
2018/5/2,,,,,011,,null
-
2018/5/2,,,,,011,,1022
-
2018/5/2,,,,,011,,
-
2018/5/2,,,,,010,,1033
-
2018/5/2,,,,,010,,1034
-
2018/5/2,,,,,012,,1035
city_info,描述城市信息,存储在MySQL中
数据描述:城市id,城市名称,所属区域
-
city_id,city_name,area
-
010,beijing,huazhong
-
011,tianjin,huazhong
-
012,hebei,huazhong
-
020,shanghai,huanan
-
021,nanjing,huanan
-
022,shandong,huanan
-
030,haerbin,dongbei
-
031,jilin,dongbei
-
032,shenyang,dongbei
product_info,描述商品信息,存储在hive中
数据描述,商品id,商品名称,扩展信息
-
product_id,product_name,extends_info
-
1011,aa,{"product_status":0,"price":100}
-
1022,bb,{"product_status":1,"price":101}
-
1033,cc,{"product_status":0,"price":102}
-
1034,dd,{"product_status":1,"price":103}
-
1035,ee,{"product_status":0,"price":104}
-
1036,ff,{"product_status":1,"price":105}
-
1037,gg,{"product_status":0,"price":106}
-
1038,hh,{"product_status":0,"price":107}
-
1039,ii,{"product_status":1,"price":108}
-
1040,jj,{"product_status":0,"price":109}
-
1041,kk,{"product_status":1,"price":110}
-
1042,ll,{"product_status":0,"price":111}
案例需求:求出不同区域的热点商品,求top3
额外要求:增加字段描述商品对应所在区域的城市名称,增加商品来源(product_status)
主体代码实现
-
package day6.test1.day9
-
import java.util.Properties
-
import org.apache.spark.rdd.RDD
-
import org.apache.spark.sql.{Row, SaveMode}
-
import org.apache.spark.sql.hive.HiveContext
-
import org.apache.spark.{SparkConf, SparkContext}
-
object HotGoodsMT{
-
def main(args: Array[String]): Unit = {

  /**
   * Step 1: create the program entry points (SparkContext + HiveContext).
   */
  val conf = new SparkConf().setMaster("local")
    .setAppName(s"${this.getClass.getSimpleName}")
  val sc = new SparkContext(conf)
  val sqlContext = new HiveContext(sc)

  // Custom distinct-aggregate function, kept for reference:
  // sqlContext.udf.register("group_dinstict_func", GroupDinstictUDAF)

  // UDF: pull "product_status" out of the extends_info JSON string and map
  // 0 -> "自营" (self-operated), non-zero -> "第三方" (third-party).
  // This is a naive string scan, not a JSON parse; stripping non-digit
  // characters before toInt keeps it safe even when product_status is the
  // last field in the object (raw value text would then be e.g. "0}").
  sqlContext.udf.register("get_product_status", (str: String) => {
    var status = 0
    for (s <- str.split(",")) {
      if (s.contains("product_status")) {
        status = s.split(":")(1).replaceAll("[^0-9]", "").toInt
      }
    }
    if (status == 0) "自营" else "第三方"
  })

  /**
   * Step 2: resolve the date range [startTime, endTime].
   * Command-line arguments win when supplied (the original `//args` note);
   * the hard-coded pair remains as a fallback so plain local runs behave
   * exactly as before.
   */
  val Array(startTime, endTime) =
    if (args.length >= 2) args.take(2) else Array("2018/5/1", "2018/5/4")

  /**
   * Step 3: load user-action rows for the range from Hive and register
   * them as temp view cityid2Action.
   */
  val cityid2Actiontable = "cityid2Action"
  getActionDataByRange(startTime, endTime, sqlContext, cityid2Actiontable)

  /**
   * Step 4: load the city dimension table from MySQL and register it as
   * temp view cityid2City.
   */
  val cityid2Citytable = "cityid2City"
  val mysql_tablename = "city_info"
  getCityInfo(sqlContext, cityid2Citytable, mysql_tablename)

  /**
   * Step 5: join the action view with the city view on city_id so every
   * click carries its area and city name.
   */
  val area2ActionTable = "areaaction"
  generateTempTable(sqlContext, cityid2Actiontable, cityid2Citytable, area2ActionTable)

  /**
   * Step 6: count clicks per (area, product); the combaincity UDAF folds
   * the distinct city names of each group into one comma-separated field.
   */
  sqlContext.udf.register("combaincity", MyUDAT_cityCombain)
  val area2citynameactiontable = "areacombaincityname"
  productClickCountByArea(sqlContext, area2citynameactiontable)

  /**
   * Step 7: keep the top-3 products per area, then attach the area grade
   * column.
   */
  val topNtable = "topN"
  topN(sqlContext, topNtable)
  val leveltable = "topNlevel"
  levelTopN(sqlContext, leveltable)

  /**
   * Steps 8-9: join with the product dimension and write the final result
   * to MySQL (both happen inside genarateFullTable).
   */
  genarateFullTable(sqlContext)
}
-
def getActionDataByRange(startTime: String, endTime: String
                         , sqlContext: HiveContext, table_name: String) = {
  // Select (city_id, order_product_id) pairs for the requested date range
  // and register them as a temp view for the later joins.
  //
  // `is not null` is added: the original only filtered the sentinel strings
  // ''/'null'/'NULL', so genuine SQL NULLs slipped through into the counts.
  //
  // NOTE(review): dates are compared as strings in 'yyyy/M/d' form, which
  // is NOT ordered lexicographically across months/days (e.g. '2018/5/10'
  // sorts before '2018/5/2') — confirm the column format before widening
  // the range.
  // NOTE(review): the column is spelled 'datee' here while the sample data
  // header says 'date' — verify against the actual hive schema.
  val sql =
    s"""
       |select
       |  city_id, order_product_id
       |from
       |  myhive.user_visit_action
       |where
       |  order_product_id is not null
       |  and order_product_id != ''
       |  and order_product_id != 'null'
       |  and order_product_id != 'NULL'
       |  and datee >= '${startTime}'
       |  and datee <= '${endTime}'
     """.stripMargin
  val df = sqlContext.sql(sql)
  df.createOrReplaceTempView(table_name)
}
-
//---------------------
-
def getCityInfo(sqlContext: HiveContext, tablename: String, mysqltable: String,
                url: String = "jdbc:mysql://hadoop02:3306/test",
                user: String = "root",
                password: String = "root") = {
  // Load the city dimension table from MySQL over JDBC and register it as
  // a temp view.  Connection settings are now parameters with the original
  // hard-coded values as defaults, so existing callers are unaffected.
  // NOTE(review): credentials belong in configuration, not in source code.
  val df = sqlContext.read.format("jdbc")
    .option("url", url)
    .option("dbtable", mysqltable)
    .option("user", user)
    .option("password", password)
    .load()
  df.createOrReplaceTempView(tablename)
}
-
def generateTempTable(sqlContext: HiveContext, cityid2Actiontable: String
                      , cityid2Citytable: String, area2action: String): Unit = {
  // Join click actions with the city dimension on city_id.
  //
  // Two fixes vs. the original:
  //  * INNER join instead of LEFT join: the LEFT join emitted one row with
  //    a NULL order_product_id for every city that had no clicks, and the
  //    later `count(*)` counted each such row as a click.
  //  * The table-name parameters are actually interpolated into the SQL;
  //    the original ignored them and hard-coded the view names.
  val sql =
    s"""
       |select
       |  b.city_id, b.order_product_id, a.area, a.city_name
       |from ${cityid2Citytable} a
       |join ${cityid2Actiontable} b
       |  on a.city_id = b.city_id
     """.stripMargin
  val df = sqlContext.sql(sql)
  df.createOrReplaceTempView(area2action)
}
-
def productClickCountByArea(sqlContext: HiveContext
                            , area2citynameactiontable: String): Unit = {
  // Count clicks per (area, product) pair; the registered combaincity UDAF
  // collapses the distinct city names of each group into a single
  // comma-separated city_name field.  The result is registered as a temp
  // view under the supplied name.
  val countSql =
    """
      select area,order_product_id,count(*)as clicknum,combaincity(city_name)as city_name
      from areaaction
      group by area,order_product_id
    """
  sqlContext.sql(countSql).createOrReplaceTempView(area2citynameactiontable)
}
-
def topN(sqlContext: HiveContext, topNtable: String): Unit = {
  // Keep the 3 most-clicked products inside each area.
  //
  // The window spec uses the standard `order by` instead of the original
  // Hive-only `sort by`; within a partition the two are equivalent where
  // both are supported, but ORDER BY is portable across Spark SQL / Hive.
  // NOTE(review): row_number() breaks clicknum ties arbitrarily — switch to
  // rank() if tied products should all survive the cut.
  val sql =
    """
      |select area, order_product_id, city_name, clicknum
      |from
      |  (select
      |     area, order_product_id, city_name, clicknum,
      |     row_number() over (partition by area order by clicknum desc) as rank
      |   from areacombaincityname) a
      |where a.rank <= 3
    """.stripMargin
  val df = sqlContext.sql(sql)
  df.createOrReplaceTempView(topNtable)
}
-
def levelTopN(sqlContext: HiveContext, leveltable: String): Unit = {
  // Attach a letter grade per area (huazhong->A, huanan->B, dongbei->C,
  // anything else->D) on top of the per-area top-3 rows.
  //
  // String literals now use single quotes: double-quoted literals are only
  // accepted by HiveQL-compatible parsers and are read as identifiers under
  // ANSI quoting, so single quotes are the portable form.
  val sql =
    """
      |select area,
      |  case when area = 'huazhong' then 'A'
      |       when area = 'huanan'   then 'B'
      |       when area = 'dongbei'  then 'C'
      |       else 'D'
      |  end as level, order_product_id, city_name, clicknum
      |from topN
    """.stripMargin
  val df = sqlContext.sql(sql)
  df.createOrReplaceTempView(leveltable)
}
-
def genarateFullTable(sqlContext: HiveContext): Unit = {
  // Join the graded top-N rows with the product dimension, then persist
  // the final result to MySQL.
  //
  // Changes vs. the original:
  //  * explicit JOIN ... ON replaces the implicit comma-join + WHERE;
  //  * the UDF column gets the alias `product_status` (matching the stated
  //    requirement) instead of letting MySQL receive an auto-generated
  //    column name like `UDF(extends_info)`.
  val sql =
    """
      |select area, level, order_product_id, city_name,
      |       clicknum, product_name,
      |       get_product_status(extends_info) as product_status
      |from topNlevel a
      |join myhive.product_info b
      |  on a.order_product_id = b.product_id
    """.stripMargin
  val df = sqlContext.sql(sql)
  df.createOrReplaceTempView("result")

  // JDBC sink settings.  NOTE(review): credentials should come from
  // configuration rather than source code.
  val url = "jdbc:mysql://hadoop02:3306/test"
  val table = "resultFull"
  val properties = new Properties()
  properties.put("user", "root")
  properties.put("password", "root")
  // Overwrite drops and recreates the target table on every run.
  df.write.mode(SaveMode.Overwrite).jdbc(url, table, properties)
}
-
}
求区域商品对应的城市名称列表,需要使用自定义 UDAF 函数实现
-
package day6.test1.day9
-
import org.apache.spark.sql.Row
-
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
-
import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType}
-
object MyUDAT_cityCombain extends UserDefinedAggregateFunction {

  // UDAF that aggregates city names into one comma-separated string with
  // exact (whole-name) de-duplication.
  //
  // Bug fix: the original tested "already collected?" with String.contains,
  // which is a SUBSTRING match — a city whose name is contained in another
  // already-collected name (e.g. 'jing' once 'beijing' is present) was
  // silently dropped, in both update() and merge().  Membership is now
  // tested against the split list of collected names.

  // One string column in: a single city name per input row.
  override def inputSchema: StructType = StructType(
    StructField("city_info", StringType, true) :: Nil
  )

  // Aggregation buffer: the comma-separated names collected so far.
  override def bufferSchema: StructType = StructType(
    StructField("city_info", StringType, true) :: Nil
  )

  override def dataType: DataType = StringType

  // Same input always produces the same output.
  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer.update(0, "")
  }

  // Append `city` to the accumulator unless the exact name is already there.
  private def addCity(acc: String, city: String): String = {
    if (acc.isEmpty) city
    else if (acc.split(",").contains(city)) acc
    else acc + "," + city
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer.update(0, addCity(buffer.getString(0), input.getString(0)))
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    var merged = buffer1.getString(0)
    // An empty partial buffer splits to Array(""); skip the empty token.
    for (city <- buffer2.getString(0).split(",") if city.nonEmpty) {
      merged = addCity(merged, city)
    }
    buffer1.update(0, merged)
  }

  override def evaluate(buffer: Row): Any = {
    buffer.getString(0)
  }
}
-----------------------------------------------------------------------------------------------------------
程序运行,提前将表存储到对应的数据库中
将程序打包,详细图解过程如下:
-------------------------------------------------------
将打包好的jar包上传到spark集群,使用命令运行
spark-submit --class day6.test1.day9.HotGoodsMT --master local[4] meituan.jar
最终结果保存在MySQL中,结果如下: