Spark Core & Spark SQL & Spark DSL Examples

Sample Data

Below is a sample of the JSON data; the data file is "JsonTest02.json":

{"openid":"opEu45VAwuzCsDr6iGIf4qhnUZUI","phoneNum":"18334832972","money":"30","date":"2018-09-13T02:15:16.054Z","lat":39.688011,"log":116.066689,"province":"四川省","city":"成都市","district":"房山区","terminal":"ios","status":"1"}
{"openid":"opEu45VAwuzCsDr6iGIf4qhnUZUI","phoneNum":"15101592939","money":"50","date":"2018-09-13T02:15:16.054Z","lat":39.688011,"log":116.066689,"province":"山西省","city":"大同市","district":"房山区","terminal":"Android","status":"0"}
{"openid":"opEu45VAwuzCsDr6iGIf4qhnUZUI","phoneNum":"15101599139","money":"30","date":"2018-09-13T02:15:16.054Z","lat":39.688011,"log":116.066689,"province":"山西省","city":"大同市","district":"房山区","terminal":"windows","status":"1"}

Requirements

  • Either Spark Core or Spark SQL may be used. The requirements are:
      1. Total recharge amount per user, sorted in descending order
      2. Total login count per system type, sorted in descending order
      3. Top 3 users by login count in each province
  • Relevant fields (see the case class sketch after this list):
      • phoneNum: phone number (user account)
      • terminal: system type
      • province: province
      • money: recharge amount
      • status: recharge status ('1' = success, '0' = failure)
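
For reference, a purely illustrative case class that models just these fields (a hypothetical helper; the code in the next section parses the raw JSON with fastjson and Spark's JSON reader instead of mapping to a case class):

case class RechargeRecord(  // hypothetical model, not used by the code below
  phoneNum: String,  // phone number (user account)
  terminal: String,  // system type
  province: String,  // province
  money: Double,     // recharge amount
  status: String     // recharge status: "1" = success, "0" = failure
)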

Code

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.rdd.RDD
import com.alibaba.fastjson.{JSON, JSONObject}

object Jsonread {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sparkcore").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val textfile: RDD[String] = sc.textFile("in/JsonTest02.json")

    println("****** SPARKCORE******")

    println("****** 1.统计每个用户充值总金额并降序排序******")
    val file: RDD[String] = textfile.filter(x => {
      val jsonObject: JSONObject = JSON.parseObject(x)
      val status = jsonObject.get("status").toString
      status == "1"
    }).filter(x => {
      val jsonObject: JSONObject = JSON.parseObject(x)
      // keep only records that actually contain both required fields
      jsonObject.containsKey("money") && jsonObject.containsKey("phoneNum")
    })
    file.cache()
    //file.foreach(println)

// Note on a position-swap workaround: I didn't realize sortBy can key on _._2 directly,
// so I kept swapping the tuple elements, e.g.
// map(x => (x._2, x._1))
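// For reference, a rough sketch of that swap-based alternative (`pairs` here is a
// hypothetical RDD of (phoneNum, money) tuples like the one built just below):
//   pairs.map { case (k, v) => (v, k) }
//        .sortByKey(ascending = false)
//        .map { case (v, k) => (k, v) }
// sortBy(_._2, false) used below gives the same ordering without the two swaps.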

    file.map(x => {
      val jsonObject = JSON.parseObject(x)
      val phoneNum = jsonObject.getString("phoneNum")
      val money = jsonObject.getIntValue("money")
      (phoneNum, money)
    }).reduceByKey(_ + _).sortBy(_._2, false).foreach(println)

    println("****** 2.统计所有系统类型登录总次数并降序排序******")
    //感觉数据有点问题,就没有用集合来存了,core输出和sql一样
    val rddTwo =
    textfile.map{ x =>
      val jSONObject = JSON.parseObject(x)
      (jSONObject.getString("terminal"),1)
    }.reduceByKey(_+_).sortBy(_._2,false).foreach(println)

    println("****** 3.统计各省的每个用户登录次数的Top3******")
	//就是生成了一张表,没在案例使用
	val rowrdd = textfile.map( m => {
    val jsonObject = JSON.parseObject(m)
    Row(jsonObject.getString("province"),jsonObject.getString("phoneNum"),1)
    }
    )
    // Build key-value tuples and group by key
    textfile.map(m => {
      val jsonObject = JSON.parseObject(m)
      val provinceName = jsonObject.getString("province")
      val phoneNumName = jsonObject.getString("phoneNum")
      // temporarily concatenate province and phone number into a single key
      (provinceName + "," + phoneNumName, 1)
    }).reduceByKey(_ + _).map(m => {
      // split the key apart and rebuild the (province, (phoneNum, count)) structure
      val provinceName = m._1.split(",")(0)
      val phoneNumName = m._1.split(",")(1)
      val top = m._2
      (provinceName, (phoneNumName, top))
    }).groupByKey.map(m => {
      val provinceName = m._1
      val top3 = m._2.toArray.sortWith(_._2 > _._2).take(3)
      (provinceName, top3)
    }).foreach(m => {
      println(m._1 + " top 3:")
      m._2.foreach(println)
    })


    println("==================================华丽分割线===========================================")

    val spark = SparkSession.builder.master("local[*]").appName("Jsonread").getOrCreate()
    val frame: DataFrame = spark.read.json("in/JsonTest02.json")
    frame.createOrReplaceTempView("tmp")

    println("****** SPARK SQL******")
    println("****** 1.统计每个用户充值总金额并降序排序******")
    spark.sql("select phoneNum,sum(money) from tmp where status = 1 group by phoneNum order by sum(money)").show

    println("****** 2.统计所有系统类型登录总次数并降序排序******")
    spark.sql(
      "select terminal,count(openid) " +
                "from tmp group by terminal" +
              " order by count(openid) desc ").show

    println("****** 3.统计各省的每个用户登录次数的Top3******")

    spark.sql(
      "select test.province, test.phoneNum " +
              "from (" +
              "select " +
              "t.province, " +
              "t.phoneNum, " +
              "row_number() over(partition by t.province order by t.copenid desc) as rn " +
              "from (select province, phoneNum, count(openid) as copenid " +
              "from tmp " +
              "group by province, phoneNum) t" +
              ") test " +
              "where test.rn < 4").show()

    println("****** SPARK DSL SQL******")
    import spark.implicits._
    println("****** 1.统计每个用户充值总金额并降序排序******")
    frame.filter($"status" === 1).groupBy($"phoneNum").agg(sum($"money").as("总金额")).orderBy($"总金额".desc).show()
    
    println("****** 2.统计所有系统类型登录总次数并降序排序******")
    frame.groupBy($"terminal").agg(count($"openid").as("总次数")).orderBy($"总次数".desc).show()
    
    println("****** 3.统计各省的每个用户登录次数的Top3******")
    val w = Window.partitionBy("province").orderBy($"总次数".desc)
    val endFrame = frame.groupBy($"province",$"phoneNum").agg(count($"openid").as("总次数")).
      select($"province",$"phoneNum",row_number().over(w).as("rn")).where($"rn".leq(3)).show()

  }
}

Results:
The Spark Core output just isn't as nice to read as the SQL show() output. I'm still not very familiar with Spark operators, so feel free to point out mistakes, and if there is a better Spark Core approach, please share it.
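
For the per-province Top 3 in Spark Core, one alternative I would try is to avoid groupByKey and keep only a running top-3 list per province with aggregateByKey. This is just a rough, untested sketch under the same assumptions as the code above (the same `textfile` RDD and fastjson parsing), not a drop-in replacement:

val top3PerProvince = textfile.map { line =>
  val obj = JSON.parseObject(line)
  ((obj.getString("province"), obj.getString("phoneNum")), 1)
}.reduceByKey(_ + _)
  .map { case ((province, phone), cnt) => (province, (phone, cnt)) }
  .aggregateByKey(List.empty[(String, Int)])(
    // fold one (phone, count) pair into a partition's running top-3 list
    (acc, v) => (v :: acc).sortBy(-_._2).take(3),
    // merge two partial top-3 lists from different partitions
    (a, b) => (a ++ b).sortBy(-_._2).take(3)
  )
top3PerProvince.collect().foreach { case (province, top3) =>
  println(province + " top 3: " + top3.mkString(", "))
}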
