Scala_Spark-电商平台离线分析项目-需求六各区域top3商品统计

最新推荐文章于 2020-07-07 09:14:19 发布

大数据小阿姨

最新推荐文章于 2020-07-07 09:14:19 发布

阅读量446

点赞数

分类专栏： scala spark 文章标签： scala sparksql 大数据项目

本文链接：https://blog.csdn.net/weixin_44345917/article/details/102952726

版权

spark 同时被 2 个专栏收录

25 篇文章 3 订阅

订阅专栏

scala

14 篇文章 0 订阅

订阅专栏

Scala_Spark-电商平台离线分析项目-需求六各区域top3商品统计

AreaTop3Stat.scala

import java.util.UUID

import commons.conf.ConfigurationManager
import commons.constant.Constants
import commons.utils.ParamUtils
import net.sf.json.JSONObject
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SaveMode, SparkSession}

/**
 *
 * 需求六：各区域top3商品统计
 * 用户访问行为表
 * 商品信息表
 * 加一个城市信息
 *
 */
object AreaTop3Stat {



  /**
   * Entry point of the "top 3 products per area" job.
   *
   * The pipeline builds a chain of temporary views, each derived from the previous one:
   *   1. click actions in the configured date range        -> RDD[(cityId, pid)]
   *   2. static city/area lookup                           -> RDD[(cityId, CityAreaInfo)]
   *   3. join of 1 and 2                                   -> view tmp_area_basic_info
   *   4. click count per (area, pid) with the city list    -> view tmp_area_click_count
   *   5. join with product_info                            -> view tmp_area_count_product_info
   *   6. top 3 per area, handed to getTop3Product together with the task id
   *
   * The SparkSession is always stopped when the pipeline finishes or fails, so the
   * driver does not leave the Spark context running.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Task filter parameters are stored as a JSON string in the project configuration.
    val jsonStr = ConfigurationManager.config.getString(Constants.TASK_PARAMS)
    val taskParam = JSONObject.fromObject(jsonStr)

    // Globally unique id identifying this run; passed on to getTop3Product.
    val taskUUID = UUID.randomUUID().toString

    val sparkConf = new SparkConf().setAppName("area").setMaster("local[*]")
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()

    try {
      // Step 1: click actions inside the requested date range, as RDD[(cityId, pid)].
      val cityId2PidRDD = getCityAndProductInfo(sparkSession,taskParam)
          // Debug: cityId2PidRDD.foreach(println(_))
          /* (cityid,productid)
          (5,47)
          (3,75)
          */

      // Step 2: city lookup data, as RDD[(cityId, CityAreaInfo)].
      val cityId2AreaInfoRDD = getCityAreaInfo(sparkSession)

      // Step 3: base view tmp_area_basic_info; one row represents one product click.
      getAreaPidBasicInfoTable(sparkSession,cityId2PidRDD,cityId2AreaInfoRDD)
          // Debug: sparkSession.sql("select * from tmp_area_basic_info").show()
          /*+-------+---------+----+---+
            |city_id|city_name|area|pid|
            +-------+---------+----+---+
            |      0|       北京|  华北| 58|*/

      // The two functions below must be registered BEFORE step 4, because the SQL of
      // step 4 calls them (the original notes call this registration "step five" since
      // it was added after a first, city-less version of step 4).
      // UDF: joins city_id and city_name with the given separator.
      sparkSession.udf.register("concat_long_string",(v1:Long,v2:String,split:String)=>{
        v1 +split +v2
      })
      // UDAF: concatenates the distinct city strings of a group.
      sparkSession.udf.register("group_concat_distinct",new GroupConcatDistinct)

      // Step 4: view tmp_area_click_count. No data arguments are needed because the
      // input is the view registered in step 3.
      getAreaProductClickCountTable(sparkSession)
          // Debug: sparkSession.sql("select * from tmp_area_click_count").show()
          /* A plain "group by area,pid" loses the city columns, and they cannot simply
             be added to the group by; the UDF/UDAF pair above keeps them as one column:
            +----+---+-----------+----------+
            |area|pid|click_count|city_infos|
            +----+---+-----------+----------+
            |  西南| 62|         15|      8:成都|
            |  华东| 73|         30| 1:上海,2:南京|
           */

      // Step 5: join with product_info. Its extend_info column is a JSON string, so a
      // UDF is registered to read the string value stored under a given key.
      sparkSession.udf.register("get_json_field",(json:String,field:String)=>{
        val jsonObject = JSONObject.fromObject(json)
        jsonObject.getString(field)
      })
      // tmp_area_click_count + product_info -> tmp_area_count_product_info
      getAreaPidBasicInfoInfo(sparkSession)
          // Debug: sparkSession.sql("select * from tmp_area_count_product_info").show()
          /* product_status is shown as Self / Third Party instead of the raw flag:
            +----+----------+---+------------+--------------+-----------+
            |area|city_infos|pid|product_name|product_status|click_count|
            +----+----------+---+------------+--------------+-----------+
            |  西南|      8:成都| 62|   product62|          Self|         15|
            |  华东| 1:上海,2:南京| 73|   product73|   Third Party|         30|
           */

      // Step 6: top 3 products per area, processed by getTop3Product for this task id.
      getTop3Product(sparkSession,taskUUID)
          /* Sample result:
          +----+----------+----------+---+------------+--------------+-----------+
          |area|area_level|city_infos|pid|product_name|product_status|click_count|
          +----+----------+----------+---+------------+--------------+-----------+
          |  华东|   A_Level| 1:上海,2:南京| 27|   product27|          Self|         40|
          |  华东|   A_Level| 1:上海,2:南京| 36|   product36|   Third Party|         39|
          |  华东|   A_Level| 1:上海,2:南京| 33|   product33|   Third Party|         38|
          |  西北|   C_Level|      7:西安| 28|   product28|          Self|         24|
          |  西北|   C_Level|      7:西安| 47|   product47|   Third Party|         24|
          |  西北|   C_Level|      7:西安| 32|   product32|          Self|         23|
           */
    } finally {
      // Release the Spark context even when one of the steps throws.
      sparkSession.stop()
    }
  }


  /**
   * Step 6: selects the three most-clicked products of every area and appends them
   * to the JDBC table `area_top3_product`.
   *
   * The query reads the view `tmp_area_count_product_info` and uses two SQL features:
   *  - the window function
   *    `row_number() over(PARTITION BY area ORDER BY click_count DESC)` to rank the
   *    products inside each area, keeping ranks 1 to 3;
   *  - a `CASE WHEN` expression that maps each area name to an `area_level`
   *    (A_Level, B_Level, C_Level, and D_Level for any other area).
   *
   * Sample of the rows produced by the query:
   * {{{
   * +----+----------+----------+---+------------+--------------+-----------+
   * |area|area_level|city_infos|pid|product_name|product_status|click_count|
   * +----+----------+----------+---+------------+--------------+-----------+
   * |  华东|   A_Level| 1:上海,2:南京| 27|   product27|          Self|         40|
   * |  华东|   A_Level| 1:上海,2:南京| 36|   product36|   Third Party|         39|
   * |  华东|   A_Level| 1:上海,2:南京| 33|   product33|   Third Party|         38|
   * }}}
   *
   * @param sparkSession session in which `tmp_area_count_product_info` is registered
   * @param taskUUID     id of the current run, stored in every written row
   */
  def getTop3Product(sparkSession: SparkSession, taskUUID: String) = {
    import sparkSession.implicits._

    // Area name -> level classification, rendered as the `area_level` column.
    val areaLevelExpr = Seq(
      "CASE",
      "WHEN area='华北' OR area='华东' THEN 'A_Level'",
      "WHEN area='华中' OR area='华南' THEN 'B_Level'",
      "WHEN area='西南' OR area='西北' THEN 'C_Level'",
      "ELSE 'D_Level'",
      "END area_level,"
    ).mkString(" ")

    // Columns carried unchanged from the inner query to the outer one.
    val productColumns = "city_infos,pid,product_name,product_status,click_count"

    // Inner query: number the products of each area by descending click count.
    val rankedClicksSql =
      s"select area,$productColumns," +
        "row_number() over(PARTITION BY area ORDER BY click_count DESC) rank " +
        "from tmp_area_count_product_info"

    // Outer query: keep only the first three ranks and attach the area level.
    val top3Sql =
      s"select area, $areaLevelExpr $productColumns from ($rankedClicksSql) t where rank<=3"

    // Debug: sparkSession.sql(top3Sql).createOrReplaceTempView("temp_test")

    // Convert each Row into the case class that defines the written columns.
    val top3Records = sparkSession.sql(top3Sql).rdd.map { row =>
      AreaTopProduct(
        taskid = taskUUID,
        area = row.getAs[String]("area"),
        areaLevel = row.getAs[String]("area_level"),
        productid = row.getAs[Long]("pid"),
        cityInfos = row.getAs[String]("city_infos"),
        clickCount = row.getAs[Long]("click_count"),
        productName = row.getAs[String]("product_name"),
        prodcutStatus = row.getAs[String]("product_status"))
    }

    // Connection settings come from the project configuration.
    val jdbcOptions = Map(
      "url" -> ConfigurationManager.config.getString(Constants.JDBC_URL),
      "user" -> ConfigurationManager.config.getString(Constants.JDBC_USER),
      "password" -> ConfigurationManager.config.getString(Constants.JDBC_PASSWORD),
      "dbtable" -> "area_top3_product"
    )

    top3Records.toDF()
      .write
      .format("jdbc")
      .options(jdbcOptions)
      .mode(SaveMode.Append)
      .save()
  }



  /**
   * Step 5: joins the click-count view with the product table and registers the
   * result as the temporary view `tmp_area_count_product_info`.
   *
   *  - `tmp_area_click_count` (alias `tacc`): area, city_infos, pid, click_count
   *  - `product_info` (alias `pi`): product_id, product_name, extend_info
   *
   * The `product_status` value is read from the JSON column `extend_info` through
   * the SQL function `get_json_field` (which must already be registered on the
   * session) and is rendered with SQL `if(cond, a, b)` as 'Self' when it equals 1
   * and 'Third Party' otherwise.
   *
   * @param sparkSession session holding `tmp_area_click_count` and `product_info`
   */
  def getAreaPidBasicInfoInfo(sparkSession: SparkSession)={
    val selectList = Seq(
      "tacc.area",
      "tacc.city_infos",
      "tacc.pid",
      "pi.product_name",
      // Raw status flag -> readable label.
      "if(get_json_field(pi.extend_info,'product_status')=1,'Self','Third Party') product_status",
      "tacc.click_count"
    ).mkString(",")

    val joinSql =
      s"select $selectList from tmp_area_click_count tacc join product_info pi on tacc.pid = pi.product_id"

    val productClicks = sparkSession.sql(joinSql)
    productClicks.createOrReplaceTempView("tmp_area_count_product_info")
  }



  /**
   * Step 4: aggregates `tmp_area_basic_info` into the temporary view
   * `tmp_area_click_count` (area, pid, click_count, city_infos).
   *
   * Every row of the source view is one click, so `count(*)` per (area, pid) is
   * the click count. The city columns are kept by folding them into a single
   * `city_infos` string with the SQL functions `concat_long_string` and
   * `group_concat_distinct`, which must already be registered on the session.
   *
   * @param sparkSession session in which `tmp_area_basic_info` is registered
   */
  def getAreaProductClickCountTable(sparkSession: SparkSession)={
    // "cityId:cityName" per row, then the distinct values of the group joined together.
    val cityInfoAggregate =
      "group_concat_distinct(concat_long_string(city_id,city_name,':')) city_infos"

    val countSql =
      s"select area,pid,count(*) click_count,$cityInfoAggregate from tmp_area_basic_info group by area,pid"

    val clickCounts = sparkSession.sql(countSql)
    clickCounts.createOrReplaceTempView("tmp_area_click_count")
  }



  /**
   * Step 3: joins the click pairs with the city lookup on city id and registers
   * the result as the temporary view `tmp_area_basic_info` with the columns
   * city_id, city_name, area and pid.
   *
   * @param sparkSession       session on which the view is registered
   * @param cityId2PidRDD      (cityId, productId) pairs, one per click
   * @param cityId2AreaInfoRDD (cityId, CityAreaInfo) lookup pairs
   */
  def getAreaPidBasicInfoTable(sparkSession: SparkSession,
                               cityId2PidRDD: RDD[(Long,Long)],
                               cityId2AreaInfoRDD: RDD[(Long, CityAreaInfo)])={
    import sparkSession.implicits._

    // Inner join: only clicks whose city id exists in the lookup are kept.
    val clicksWithArea = cityId2PidRDD.join(cityId2AreaInfoRDD)

    // Flatten (cityId, (pid, info)) into the four columns of the view.
    val basicRows = clicksWithArea.map { joined =>
      val (cityId, (pid, areaInfo)) = joined
      (cityId, areaInfo.city_name, areaInfo.area, pid)
    }

    // Tuples carry no field names, so the column names are given explicitly.
    val basicInfo = basicRows.toDF("city_id","city_name","area","pid")
    basicInfo.createOrReplaceTempView("tmp_area_basic_info")
  }



  /**
   * Step 2: builds the city lookup as an RDD keyed by city id.
   *
   * The ten cities and their areas are hard-coded in this method.
   *
   * @param sparkSession session whose SparkContext creates the RDD
   * @return pairs of (cityId, CityAreaInfo)
   */
  def getCityAreaInfo(sparkSession: SparkSession)={
    // (cityId, cityName, area)
    val cityTable = Seq(
      (0L, "北京", "华北"),
      (1L, "上海", "华东"),
      (2L, "南京", "华东"),
      (3L, "广州", "华南"),
      (4L, "三亚", "华南"),
      (5L, "武汉", "华中"),
      (6L, "长沙", "华中"),
      (7L, "西安", "西北"),
      (8L, "成都", "西南"),
      (9L, "哈尔滨", "东北")
    )

    val cityRows = sparkSession.sparkContext.makeRDD(cityTable)
    cityRows.map(entry => (entry._1, CityAreaInfo(entry._1, entry._2, entry._3)))
  }



  /**
   * Step 1: reads the product clicks that fall inside the task's date range.
   *
   * Rows of `user_visit_action` are kept when `date` lies between the start and
   * end date parameters (both inclusive) and `click_product_id` is not -1, i.e.
   * the action actually was a product click. Each remaining row is one click.
   *
   * The filter is expressed with the Column API instead of concatenating the
   * parameter values into SQL text: the values are bound as literals, so a quote
   * or other SQL syntax inside a parameter cannot alter the query.
   *
   * @param sparkSession session that can resolve the `user_visit_action` table
   * @param taskParam    task parameters holding the start and end date
   * @return pairs of (cityId, clickedProductId)
   */
  def getCityAndProductInfo(sparkSession: SparkSession, taskParam: JSONObject) = {
    import sparkSession.implicits._

    val startDate = ParamUtils.getParam(taskParam,Constants.PARAM_START_DATE)
    val endDate = ParamUtils.getParam(taskParam,Constants.PARAM_END_DATE)

    // NOTE(review): presumably ParamUtils.getParam returns the parameter as a String
    // in the same format as the `date` column - confirm against the commons module.
    val inDateRange = ($"date" >= startDate) && ($"date" <= endDate)
    val isProductClick = $"click_product_id" =!= -1

    sparkSession.table("user_visit_action")
      .where(inDateRange && isProductClick)
      .select($"city_id", $"click_product_id")
      .as[CityClickProduct]
      .rdd
      .map(click => (click.city_id, click.click_product_id))
  }



}

/*
tmp_area_basic_info 区域临时表
    +-------+---------+----+---+
    |city_id|city_name|area|pid|
    +-------+---------+----+---+
    |      0|       北京|  华北| 58|
    |      0|       北京|  华北| 84|


完整的 tmp_area_click_count 包含完整商品信息的各区域商品点击次数临时表
    +----+---+-----------+----------+
    |area|pid|click_count|city_infos|
    +----+---+-----------+----------+
    |  西南| 62|         15|      8:成都|
    |  华东| 73|         30| 1:上海,2:南京|


优化的tmp_area_count_product_info
    +----+----------+---+------------+--------------+-----------+
    |area|city_infos|pid|product_name|product_status|click_count|
    +----+----------+---+------------+--------------+-----------+
    |  西南|      8:成都| 62|   product62|          Self|         15|
    |  华东| 1:上海,2:南京| 73|   product73|   Third Party|         30|
    

数据准确 地区top3商品
    +----+----------+----------+---+------------+--------------+-----------+
    |area|area_level|city_infos|pid|product_name|product_status|click_count|
    +----+----------+----------+---+------------+--------------+-----------+
    |  华东|   A_Level| 1:上海,2:南京| 27|   product27|          Self|         40|
    |  华东|   A_Level| 1:上海,2:南京| 36|   product36|   Third Party|         39|
    |  华东|   A_Level| 1:上海,2:南京| 33|   product33|   Third Party|         38|
    |  西北|   C_Level|      7:西安| 28|   product28|          Self|         24|
    |  西北|   C_Level|      7:西安| 47|   product47|   Third Party|         24|
    |  西北|   C_Level|      7:西安| 32|   product32|          Self|         23|




 */



/*有的东西没有标准答案*/
/*先实现再优化*/

自定义UDAF

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType}

/**
 * 自定义UDF和UDAF是用来解析和调试复杂字段的
 * AreaTop3Stat.scala 需求六里面用了自定义UDF
 *
 * 需求六的自定义UDAF 用户自定义聚合函数
 * 聚合cityId:cityName
 * bufferCityInfo1: cityId1:cityName1,cityId2:cityName2
 *
 *
 * 需要在 AreaTop3Stat.scala 里面注册使用
 *
 * // 第五步 把city_name和city_id拼接在一起
 * // 注册UDF 基础表中数据拼接 tmp_area_basic_info
 *     sparkSession.udf.register("concat_long_string",(v1:Long,v2:String,split:String)=>{
 *        v1 +split +v2
 *     })
 * // 注册UDAF 用户自定义聚合函数
 *     sparkSession.udf.register("group_concat_distinct",new GroupConcatDistinct)
 *
 *
 * // 第四步 （二）包含完整商品信息的各区域商品点击次数临时表
 * // tmp_area_click_count
 * getAreaProductClickCountTable(sparkSession) //因为数据在上一张表里面 所以这里不需要传参数
 * // 测试 输出一下这张临时表
 *          sparkSession.sql("select * from tmp_area_click_count").show()
 *            /*+----+---+--------+ 这张不完整
 *            |area|pid|click_count|
 *            +----+---+--------+
 *            |  西南| 70|      17|
 *            |  西北| 43|      12|
 *
 *            但是我想要不丢失每个area对应的city相关数据
 *            group by后面又不能加上这个字段
 *            所以实现一个UDF函数和UDAF，注册后修改锚点4的方法的sql语句
 *            所以有了第五步
 *
 *            所以最终完整的 tmp_area_click_count 包含完整商品信息的各区域商品点击次数临时表
 *            +----+---+-----------+----------+
 *            |area|pid|click_count|city_infos|
 *            +----+---+-----------+----------+
 *            |  西南| 62|         15|      8:成都|
 *            |  华东| 73|         30| 1:上海,2:南京|
 *            */
 *
 *
 * def getAreaProductClickCountTable(sparkSession: SparkSession)={
 *      val sql = "select area,pid,count(*) click_count," +
 *      "group_concat_distinct(concat_long_string(city_id,city_name,':')) city_infos " +  //这句是自定义UDAF后加的
 *      "from tmp_area_basic_info group by area,pid"
 *
 *     sparkSession.sql(sql).createOrReplaceTempView("tmp_area_click_count")
 * }
 *
 */
/**
 * User-defined aggregate function that concatenates the distinct input strings of
 * a group into one comma-separated string, e.g. "1:上海,2:南京".
 *
 * Values are compared as whole comma-separated tokens. A substring test would be
 * wrong here: "1:上海" is a substring of "11:上海", so the shorter value would be
 * treated as already present and silently dropped.
 *
 * Null inputs are skipped. Values are expected not to contain a comma, because the
 * comma is the separator of the aggregation buffer.
 */
class GroupConcatDistinct extends UserDefinedAggregateFunction{

  // Single string input column. `x :: Nil` builds the one-element List that
  // StructType expects.
  override def inputSchema: StructType = StructType(StructField("cityInfo",StringType)::Nil)

  // The buffer holds the comma-separated values collected so far.
  override def bufferSchema: StructType = StructType(StructField("cityInfo",StringType)::Nil)

  // The aggregate result is a string.
  override def dataType: DataType = StringType

  override def deterministic: Boolean = true

  // A new buffer starts as the empty string (no values yet).
  override def initialize(buffer: MutableAggregationBuffer): Unit ={
    buffer(0)=""
  }

  // Adds one input value to the buffer unless it is null or already present.
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer.update(0, appendDistinct(buffer.getString(0), Seq(input.getString(0))))
    }
  }

  // Merges the values of buffer2 into buffer1, skipping values buffer1 already has.
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    if (!buffer2.isNullAt(0)) {
      buffer1.update(0, appendDistinct(buffer1.getString(0), tokensOf(buffer2.getString(0))))
    }
  }

  // The final result is the buffer content itself.
  override def evaluate(buffer: Row): Any = {
    buffer.getString(0)
  }

  /** Splits a buffer string into its non-empty values. */
  private def tokensOf(joined: String): Seq[String] =
    joined.split(",").toSeq.filter(_.nonEmpty)

  /**
   * Appends the incoming values that are not yet in `joined`, keeping the
   * first-seen order, and returns the new comma-separated string.
   */
  private def appendDistinct(joined: String, incoming: Seq[String]): String =
    (tokensOf(joined) ++ incoming.filter(_.nonEmpty)).distinct.mkString(",")
}


/**
 这个sql语句很特别 所以单独拿出来
 val sql = "select area,pid,count(*) click_count," +
"group_concat_distinct(concat_long_string(city_id,city_name,':')) city_infos " +  //这句是自定义UDAF后加的
"from tmp_area_basic_info group by area,pid"
 
 */

样例类


/**
 * One product click, as read from the query in step 1
 * (`AreaTop3Stat.getCityAndProductInfo`).
 *
 * Field names use snake_case because they are matched by name against the
 * selected columns when the query result is converted with `.as[CityClickProduct]`.
 *
 * @param city_id          id of the city in which the click happened
 * @param click_product_id id of the clicked product
 */
case class CityClickProduct(city_id:Long,click_product_id:Long)

/**
 * City lookup entry used by the "top 3 products per area" job.
 *
 * @param city_id   id of the city
 * @param city_name display name of the city
 * @param area      name of the area (region) the city belongs to
 */
case class CityAreaInfo(city_id:Long,city_name:String,area:String)

/**
 * One result row of the "top 3 products per area" job, in the shape that is
 * written to the database.
 *
 * NOTE(review): the last field is spelled `prodcutStatus` (sic). Presumably the
 * target table column uses the same spelling, because the field names become the
 * column names when the rows are converted with `toDF()` - confirm before renaming.
 *
 * @param taskid        id of the job run that produced the row
 * @param area          area (region) name
 * @param areaLevel     level assigned to the area (e.g. A_Level)
 * @param productid     id of the product
 * @param cityInfos     comma-separated "cityId:cityName" entries of the area
 * @param clickCount    number of clicks of the product in the area
 * @param productName   name of the product
 * @param prodcutStatus product status label (e.g. Self / Third Party)
 */
case class AreaTopProduct(
                         taskid:String,
                         area:String,
                         areaLevel:String,
                         productid:Long,
                         cityInfos:String,
                         clickCount:Long,
                         productName:String,
                         prodcutStatus:String
                         )

依赖与前面的需求相同

 <dependencies>



        <!-- Spark的依赖引入 -->
        <dependency>
            <groupId>com.atguigu</groupId>
            <artifactId>commons</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
        </dependency>
        <!-- 引入Scala -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
        </dependency>


    </dependencies>

    <build>
        <plugins>
            <plugin>
                <!-- scala-maven-plugin插件用于在任意的maven项目中对scala代码进行编译/测试/运行/文档化 -->
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.atguigu.page.PageOneStepConvertRate</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>

大数据小阿姨

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
Scala_Spark-电商平台离线分析项目-需求六各区域top3商品统计

Scala_Spark-电商平台离线分析项目-需求六各区域top3商品统计AreaTop3Stat.scalaimport java.util.UUIDimport commons.conf.ConfigurationManagerimport commons.constant.Constantsimport commons.utils.ParamUtilsimport net.sf...
复制链接

扫一扫