Spark Case Study — Analyzing Hot Products by Region with UDFs and UDAFs

The data comes from three tables:

user_visit_action describes user click events and is stored in Hive.

The simulated data is shown below. The relevant fields are date (the click date), city_id (the city), and order_product_id (the id of the product the user clicked).

 
date,user_id,session_id,page_id,action,city_id,search_keywords,order_product_id
2018/5/1,,,,,010,,1011
2018/5/1,,,,,010,,1011
2018/5/1,,,,,010,,1011
2018/5/1,,,,,010,,1011
2018/5/1,,,,,011,,1011
2018/5/1,,,,,011,,1011
2018/5/2,,,,,011,,
2018/5/2,,,,,011,,null
2018/5/2,,,,,011,,1022
2018/5/2,,,,,011,,
2018/5/2,,,,,010,,1033
2018/5/2,,,,,010,,1034
2018/5/2,,,,,012,,1035
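Note that three of the 2018/5/2 rows for city 011 have an empty or literal "null" order_product_id; the extraction query in the code below filters such rows out before counting clicks.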

city_info describes the city information and is stored in MySQL.

Fields: city id, city name, and the region the city belongs to.

 
city_id,city_name,area
010,beijing,huazhong
011,tianjin,huazhong
012,hebei,huazhong
020,shanghai,huanan
021,nanjing,huanan
022,shandong,huanan
030,haerbin,dongbei
031,jilin,dongbei
032,shenyang,dongbei

product_info describes the product information and is stored in Hive.

Fields: product id, product name, and extended info (a JSON string).

 
product_id,product_name,extends_info
1011,aa,{"product_status":0,"price":100}
1022,bb,{"product_status":1,"price":101}
1033,cc,{"product_status":0,"price":102}
1034,dd,{"product_status":1,"price":103}
1035,ee,{"product_status":0,"price":104}
1036,ff,{"product_status":1,"price":105}
1037,gg,{"product_status":0,"price":106}
1038,hh,{"product_status":0,"price":107}
1039,ii,{"product_status":1,"price":108}
1040,jj,{"product_status":0,"price":109}
1041,kk,{"product_status":1,"price":110}
1042,ll,{"product_status":0,"price":111}

Requirement: find the hot products in each region, i.e. the top 3 most-clicked products per region.

Additional requirements: add a field that lists the city names (within the region) where each product was clicked, and add the product source derived from product_status (self-operated vs. third-party).
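As an illustration of the expected output (derived by hand from the sample data above, not actual program output): in the huazhong region, product 1011 is clicked 6 times, 4 times in beijing (city 010) and 2 times in tianjin (city 011), so its result row would carry area huazhong (level A), order_product_id 1011, clicknum 6, the merged city_name "beijing,tianjin", the product name aa, and the source 自营 (self-operated, because its product_status is 0).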

Main code implementation:

 
package day6.test1.day9

import java.util.Properties

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object HotGoodsMT {
  def main(args: Array[String]): Unit = {
    /**
     * Step 1: create the program entry points
     */
    val conf = new SparkConf().setMaster("local")
      .setAppName(s"${this.getClass.getSimpleName}")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)

    // Custom aggregate function
    // sqlContext.udf.register("group_dinstict_func", GroupDinstictUDAF)

    // UDF: parse the extends_info JSON string and map product_status to a readable label
    sqlContext.udf.register("get_product_status", (str: String) => {
      var status = 0
      for (s <- str.split(",")) {
        if (s.contains("product_status")) {
          status = s.split(":")(1).toInt
        }
      }
      if (status == 0) "自营" else "第三方" // 0 = self-operated, otherwise third-party
    })

    /**
     * Step 2: receive the parameters passed in from outside
     *   start time: startTime
     *   end time:   endTime
     */
    val Array(startTime, endTime) = Array("2018/5/1", "2018/5/4") // args

    /**
     * Step 3:
     * Read the user-action data for the given time range from the Hive table
     * and register it as the temporary table cityid2Action
     */
    val cityid2Actiontable = "cityid2Action"
    val cityid2Action =
      getActionDataByRange(startTime, endTime, sqlContext, cityid2Actiontable)

    /**
     * [What follows is not necessarily the best choice; feel free to follow your own approach.]
     * Step 4:
     * Load the city information table from MySQL
     * and register it as the temporary table cityid2City
     */
    val cityid2Citytable = "cityid2City"
    val mysql_tablename = "city_info"
    val cityid2City = getCityInfo(sqlContext, cityid2Citytable, mysql_tablename)

    /**
     * Step 5:
     * Join the tables from steps 3 and 4 on city_id -> a new temp table
     *   cityid2Action
     *   cityid2City
     */
    val area2ActionTable = "areaaction"
    generateTempTable(sqlContext, cityid2Actiontable, cityid2Citytable, area2ActionTable)

    /**
     * Step 6:
     * Count the number of clicks per product.
     * Additional requirement: merge the city names of each product within an area
     * into a new field.
     */
    // Register the custom aggregate function
    sqlContext.udf.register("combaincity", MyUDAT_cityCombain)
    val area2citynameactiontable = "areacombaincityname"
    productClickCountByArea(sqlContext, area2citynameactiontable)

    /**
     * Step 7:
     * Compute the TopN per group
     * and add a level field
     */
    val topNtable = "topN"
    topN(sqlContext, topNtable)
    val leveltable = "topNlevel"
    levelTopN(sqlContext, leveltable)

    /**
     * Step 8:
     * Join the previous result with the product information table
     */
    genarateFullTable(sqlContext)

    /**
     * Step 9:
     * Write the final result to the MySQL database
     */
    // writeresult(sqlContext)
  }

  def getActionDataByRange(startTime: String, endTime: String,
                           sqlContext: HiveContext, table_name: String) = {
    val sql =
      s"""
        select
          city_id, order_product_id
        from
          myhive.user_visit_action
        where
          order_product_id != ''
        and
          order_product_id != 'null'
        and
          order_product_id != 'NULL'
        and
          datee >= '${startTime}'
        and
          datee <= '${endTime}'
      """
    val df = sqlContext.sql(sql)
    df.createOrReplaceTempView(table_name)
  }

  def getCityInfo(sqlContext: HiveContext, tablename: String, mysqltable: String) = {
    val url = "jdbc:mysql://hadoop02:3306/test"
    val df = sqlContext.read.format("jdbc")
      .option("url", url)
      .option("dbtable", mysqltable)
      .option("user", "root")
      .option("password", "root")
      .load()
    df.createOrReplaceTempView(tablename)
  }

  def generateTempTable(sqlContext: HiveContext, cityid2Actiontable: String,
                        cityid2Citytable: String, area2action: String): Unit = {
    val sql =
      s"""
        select
          b.city_id, b.order_product_id, a.area, a.city_name
        from ${cityid2Citytable} a
        left join
          ${cityid2Actiontable} b
        on
          a.city_id = b.city_id
      """
    val df = sqlContext.sql(sql)
    df.createOrReplaceTempView(area2action)
  }

  def productClickCountByArea(sqlContext: HiveContext,
                              area2citynameactiontable: String): Unit = {
    val sql =
      """
        select area, order_product_id, count(*) as clicknum,
               combaincity(city_name) as city_name
        from areaaction
        group by area, order_product_id
      """
    val df = sqlContext.sql(sql)
    df.createOrReplaceTempView(area2citynameactiontable)
  }

  def topN(sqlContext: HiveContext, topNtable: String): Unit = {
    val sql =
      """
        select area, order_product_id, city_name, clicknum
        from
          (select
             area, order_product_id, city_name, clicknum,
             row_number() over (partition by area sort by clicknum desc) as rank
           from
             areacombaincityname) a
        where a.rank <= 3
      """
    val df = sqlContext.sql(sql)
    df.createOrReplaceTempView(topNtable)
  }

  def levelTopN(sqlContext: HiveContext, leveltable: String): Unit = {
    val sql =
      """
        select area,
               case when area = "huazhong" then "A"
                    when area = "huanan"   then "B"
                    when area = "dongbei"  then "C"
                    else "D"
               end as level,
               order_product_id, city_name, clicknum
        from topN
      """
    val df = sqlContext.sql(sql)
    df.createOrReplaceTempView(leveltable)
  }

  def genarateFullTable(sqlContext: HiveContext): Unit = {
    // alias the UDF output so the JDBC writer gets a valid column name
    val sql =
      """
        select area, level, order_product_id, city_name,
               clicknum, product_name,
               get_product_status(extends_info) as product_status
        from topNlevel a, myhive.product_info b
        where a.order_product_id = b.product_id
      """
    val df = sqlContext.sql(sql)
    df.createOrReplaceTempView("result")
    val url = "jdbc:mysql://hadoop02:3306/test"
    val table = "resultFull"
    val properties = new Properties()
    properties.put("user", "root")
    properties.put("password", "root")
    df.write.mode(SaveMode.Overwrite).jdbc(url, table, properties)
  }

}
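As a quick check of the get_product_status UDF registered above: for extends_info = {"product_status":1,"price":101}, the string is split on commas, the piece containing product_status is split on ':', status becomes 1, and the UDF returns 第三方 (third-party); a product_status of 0 yields 自营 (self-operated).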

Merging the city names that belong to each product within a region requires a custom UDAF (a small usage sketch follows the code below):

 
package day6.test1.day9

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType}

/**
 * UDAF that concatenates the distinct city names of a group into a single
 * comma-separated string, e.g. "beijing,tianjin".
 */
object MyUDAT_cityCombain extends UserDefinedAggregateFunction {
  // Input: a single string column (the city name)
  override def inputSchema: StructType = StructType(
    StructField("city_info", StringType, true) :: Nil
  )

  // Buffer: the comma-separated city names accumulated so far
  override def bufferSchema: StructType = StructType(
    StructField("city_info", StringType, true) :: Nil
  )

  // Result type of the aggregation
  override def dataType: DataType = StringType

  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer.update(0, "")
  }

  // Add one input row to the buffer, skipping city names already present
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    var last_str = buffer.getString(0)
    val current_str = input.getString(0)
    if (!last_str.contains(current_str)) {
      if (last_str.equals("")) {
        last_str = current_str
      } else {
        last_str += "," + current_str
      }
    }
    buffer.update(0, last_str)
  }

  // Merge two partial buffers, again keeping the city names distinct
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    var b1 = buffer1.getString(0)
    val b2 = buffer2.getString(0)
    for (s <- b2.split(",")) {
      if (!b1.contains(s)) {
        if (b1.equals("")) {
          b1 = s
        } else {
          b1 += "," + s
        }
      }
    }
    buffer1.update(0, b1)
  }

  override def evaluate(buffer: Row): Any = {
    buffer.getString(0)
  }
}
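A minimal, self-contained sketch of how this UDAF behaves (the object and table names CityCombainDemo and areaaction_demo are my own, and the sample rows simply mirror part of the test data above):

package day6.test1.day9

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

// Standalone demo of MyUDAT_cityCombain: groups a few sample rows by area and
// product and shows the merged, de-duplicated city_name string.
object CityCombainDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("CityCombainDemo"))
    val sqlContext = new HiveContext(sc)
    import sqlContext.implicits._

    sqlContext.udf.register("combaincity", MyUDAT_cityCombain)

    // (area, order_product_id, city_name) rows mirroring part of the sample data
    val df = sc.parallelize(Seq(
      ("huazhong", "1011", "beijing"),
      ("huazhong", "1011", "beijing"),
      ("huazhong", "1011", "tianjin"),
      ("huazhong", "1022", "tianjin")
    )).toDF("area", "order_product_id", "city_name")
    df.createOrReplaceTempView("areaaction_demo")

    // Expected: (huazhong, 1011, 3, "beijing,tianjin") and (huazhong, 1022, 1, "tianjin")
    sqlContext.sql(
      """select area, order_product_id, count(*) as clicknum,
        |       combaincity(city_name) as city_name
        |from areaaction_demo
        |group by area, order_product_id""".stripMargin).show(false)

    sc.stop()
  }
}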

-----------------------------------------------------------------------------------------------------------

Before running the program, load the source tables into their databases: user_visit_action and product_info into Hive, and city_info into MySQL.
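A minimal setup sketch, assuming a Hive database named myhive, an action-date column named datee (matching the query in getActionDataByRange), and local file paths of my own choosing; the real paths and names depend on your environment, and city_info has to be created and populated in the MySQL test database separately, since the job reads it over JDBC:

package day6.test1.day9

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

// Hypothetical helper that creates and loads the two Hive tables the job reads.
object PrepareTables {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("PrepareTables"))
    val sqlContext = new HiveContext(sc)

    sqlContext.sql("create database if not exists myhive")

    // User click actions; the CSV header row is assumed to have been removed from the file
    sqlContext.sql(
      """create table if not exists myhive.user_visit_action(
        |  datee string, user_id string, session_id string, page_id string,
        |  action string, city_id string, search_keywords string, order_product_id string)
        |row format delimited fields terminated by ','""".stripMargin)
    sqlContext.sql(
      "load data local inpath '/data/user_visit_action.txt' overwrite into table myhive.user_visit_action")

    // Product information. Note the comma inside the extends_info JSON: with a plain
    // comma-delimited table, extends_info is truncated after "product_status":N, which is
    // still enough for the get_product_status UDF, but the price part is lost.
    sqlContext.sql(
      """create table if not exists myhive.product_info(
        |  product_id string, product_name string, extends_info string)
        |row format delimited fields terminated by ','""".stripMargin)
    sqlContext.sql(
      "load data local inpath '/data/product_info.txt' overwrite into table myhive.product_info")

    sc.stop()
  }
}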

Package the program into a jar.
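(The exact packaging steps depend on the build tool; with a standard Maven project, for example, mvn clean package in the project root produces a jar under target/, which can then be renamed and uploaded as meituan.jar.)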

-------------------------------------------------------

Upload the packaged jar to the Spark cluster and run it with spark-submit:

 spark-submit --class day6.test1.day9.HotGoodsMT --master local[4] meituan.jar 
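Note that --master local[4] actually runs the job in local mode with four threads on the machine where spark-submit is executed; to run on the cluster itself, one would typically replace it with, for example, --master yarn or the cluster's spark://... master URL.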

The final result is saved in the MySQL table resultFull.

 

 
