SparkSQL Classic Case Analysis: Finding Rows Where the State Changes

Task: within each ID, find the rows at which the state (CHANGE) value changes, implemented with a window function plus a self-join.

import org.apache.spark.sql.SparkSession

/**
 * Case: find the rows where the state changes.
 * Implemented with a window function plus a self-join of the ranked table.
 */
object FindChangeInfos {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local").appName("test").getOrCreate()
    session.sparkContext.setLogLevel("Error")
    val frame = session.read.option("header", true).csv("T:/code/spark_scala/data/spark/test.csv")
    frame.createTempView("temp")
    // Step 1: number the rows within each id, ordered by name,
    // so that adjacent rows can be matched up by rank.
    session.sql(
      """
        |select id,change,name,row_number() over(partition by id order by name) as rank
        |from temp
      """.stripMargin).createTempView("t")

    // Step 2: pair each row a with the row b that immediately follows it
    // (a.rank = b.rank - 1) and keep row a whenever the change value differs,
    // i.e. the last row of each state segment.
    session.sql(
      """
        | select a.id,a.change,a.name
        | from t a join t b on a.id = b.id
        | where a.change != b.change and a.rank = b.rank-1
      """.stripMargin).show()
  }
}

Raw data:

ID,CHANGE,NAME
id1,1,a
id1,1,b
id1,1,c
id1,2,d
id1,2,e
id1,1,f
id2,2,g
id2,2,h
id2,1,i
id2,1,j
id2,2,k
id3,1,l
id3,1,m
id3,2,n
id3,3,o
id3,4,p
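
The same adjacent-row comparison can also be done without a self-join by using the lead window function, which reads the next row's value directly. A minimal alternative sketch, assuming the same "temp" view registered in the code above:

// lead(change) fetches the following row's change within the same id,
// mirroring the self-join condition a.rank = b.rank - 1. The last row of
// each id has next_change = null, so the inequality filters it out.
session.sql(
  """
    |select id, change, name
    |from (
    |  select id, change, name,
    |         lead(change) over(partition by id order by name) as next_change
    |  from temp
    |) t
    |where change != next_change
  """.stripMargin).show()

This reads the ranked data once instead of joining the view t against itself.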

Classic Case Analysis: Computing Users' Total Website Visit Time Under Different Scenarios

Data:

111	2019-06-20	1
111	2019-06-21	2
111	2019-06-22	3
222	2019-06-20	4
222	2019-06-21	5
222	2019-06-22	6
333	2019-06-20	7
333	2019-06-21	8
333	2019-06-22	9
444	2019-06-23	10

Code example:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * +---+----------+---+
 * |uid|   accDate|dur|
 * +---+----------+---+
 * |111|2019-06-20|  1|
 * |111|2019-06-21|  2|
 * |111|2019-06-22|  3|
 * |222|2019-06-20|  4|
 * |222|2019-06-21|  5|
 * |222|2019-06-22|  6|
 * |333|2019-06-20|  7|
 * |333|2019-06-21|  8|
 * |333|2019-06-22|  9|
 * |444|2019-06-23| 10|
 * +---+----------+---+
 * Given the visit durations above, compute the following:
 * 1. Each user's daily running total of visit time (each day's total
 *    accumulates all earlier dates).
 *
 * 2. Each user's total visit time over the current day and the previous day.
 *
 * 3. Each user's visit time for the current day, where the day's figure also
 *    includes the previous day's and the following day's totals.
 *
 * 4. Each user's overall total visit time.
 */
object AccInfo {
  def main(args: Array[String]): Unit = {
    val session = SparkSession
      .builder()
      .master("local")
      .appName("AccInfo")
      .getOrCreate()

    val sc = session.sparkContext
    sc.setLogLevel("Error")
    // Each input line is tab-separated: uid \t accDate \t dur
    val infos = sc.textFile("T:/code/spark_scala/data/spark/userAccInfo.txt")

    // Parse each line into an AccInfo case class instance.
    val accInfoRDD: RDD[AccInfo] = infos.map(info => {
      val arr = info.split("\t")
      val uid = arr(0)
      val accDate = arr(1)
      val dur = arr(2).toInt
      AccInfo(uid, accDate, dur)
    })
    import session.implicits._
    val frame = accInfoRDD.toDF()
    frame.createTempView("accInfo")

    // Case 1: running total per user. An ordered window with no explicit
    // frame accumulates from the first date up to the current row.
    session.sql(
      """
        | select uid,accDate,sum(dur) over(partition by uid order by accDate) as current_day_dur
        | from accInfo
      """.stripMargin).show()

    // Case 2: current day plus the previous day (a 2-row sliding window).
    session.sql(
      """
        | select uid,accDate,sum(dur) over(partition by uid order by accDate rows between 1 preceding and current row) as totalDur
        | from accInfo
      """.stripMargin).show()

    // Case 3: previous day + current day + following day (a 3-row sliding window).
    session.sql(
      """
        | select uid,accDate,sum(dur) over(partition by uid order by accDate rows between 1 preceding and 1 following) as totalDur
        | from accInfo
      """.stripMargin).show()

    // Case 4a: total per user via group by (one row per user).
    session.sql(
      """
        | select uid,sum(dur) as totalDur
        | from accInfo
        | group by uid
      """.stripMargin).show()

    // Case 4b: the same total via an unordered window; the per-user total
    // is repeated on every row of that user.
    session.sql(
      """
        | select uid,accDate,sum(dur) over(partition by uid) as totalDur
        | from accInfo
      """.stripMargin).show()

  }
}

case class AccInfo(uid: String, accDate: String, dur: Int)
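
For comparison, the running-total query (case 1) can also be expressed with the DataFrame API instead of SQL. A minimal sketch, assuming the same frame DataFrame built in AccInfo above:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.sum

// An ordered window with no explicit frame defaults to
// "range between unbounded preceding and current row", i.e. a running total.
val byUser = Window.partitionBy("uid").orderBy("accDate")
frame.withColumn("current_day_dur", sum("dur").over(byUser)).show()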

Classic Case Analysis: Reading JSON Array Data

Raw data:
{"name":"zhangsan","age":18,"scores":[{"xueqi":1,"yuwen":98,"shuxue":90,"yingyu":100},{"xueqi":2,"yuwen":98,"shuxue":78,"yingyu":100}]}
{"name":"lisi","age":19,"scores":[{"xueqi":1,"yuwen":58,"shuxue":50,"yingyu":78},{"xueqi":2,"yuwen":56,"shuxue":76,"yingyu":13}]}
{"name":"wangwu","age":17,"scores":[{"xueqi":1,"yuwen":18,"shuxue":90,"yingyu":45},{"xueqi":2,"yuwen":76,"shuxue":42,"yingyu":45}]}
{"name":"zhaoliu","age":20,"scores":[{"xueqi":1,"yuwen":68,"shuxue":23,"yingyu":63},{"xueqi":2,"yuwen":23,"shuxue":45,"yingyu":87}]}
{"name":"tianqi","age":22,"scores":[{"xueqi":1,"yuwen":88,"shuxue":91,"yingyu":41},{"xueqi":2,"yuwen":56,"shuxue":79,"yingyu":45}]}

Sample code:

import org.apache.spark.sql.SparkSession
/**
 * Reading and analyzing a JSON-format array:
 * explode(collection): one-to-many, turning each element of the collection into its own row.
 */
object ReadJsonArrayData {
  def main(args: Array[String]): Unit = {
    val session = SparkSession
      .builder()
      .appName("test")
      .master("local")
      .getOrCreate()
    val frame = session.read.json("T:/code/spark_scala/data/spark/jsonArrayFile")
    import session.implicits._
    import org.apache.spark.sql.functions._

    // explode(scores) emits one row per element of the scores array; alias it
    // "el" so its struct fields can be addressed as el.xueqi, el.yuwen, ...
    val df1 = frame.select(col("name"), frame.col("age"), explode(frame.col("scores")).as("el"))
    df1.select($"name",
      $"age",
      col("el.xueqi"),
      col("el.yuwen"),
      col("el.shuxue"),
      col("el.yingyu"))
      .show()

    //    frame.createTempView("temp")
    //    val df = session.sql(
    //      """
    //        | select
    //        |   name,age ,el.xueqi,el.yuwen,el.shuxue,el.yingyu
    //        | from
    //        | (select name,age,explode(scores) as el  from temp ) t
    //      """.stripMargin).show()
  }
}
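
Spark SQL also ships an inline generator that explodes an array of structs into one column per struct field in a single step, so the el.* projection above becomes unnecessary. A minimal alternative sketch against the same frame:

// inline(scores) emits one row per array element with the struct fields
// (xueqi, yuwen, shuxue, yingyu) already flattened into top-level columns.
frame.selectExpr("name", "age", "inline(scores)").show()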

Classic Case Analysis: Reading a JSON String and Extracting Attribute Values Directly

import org.apache.spark.sql.SparkSession

/**
 * Parse a JSON-format string and extract the value of a given attribute directly:
 * get_json_object(jsonStr, jsonPath)
 */
object ReadJsonStringExp {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local").appName("test").getOrCreate()
    val jsonList = List[String](
      "{\"name\":\"zhangsan\",\"age\":18}",
      "{\"name\":\"lisi\",\"age\":19}",
      "{\"name\":\"wangwu\",\"age\":20}",
      "{\"name\":\"maliu\",\"age\":21}"
    )

    import session.implicits._
    val df = jsonList.toDF("info")

    df.createTempView("t")
    // get_json_object(json, path) evaluates a JSONPath expression such as
    // "$.name" against the string and returns the value as a string.
    session.sql(
      """
        | select info ,get_json_object(info,"$.name") as name ,get_json_object(info,"$.age") as age
        | from t
      """.stripMargin).show(false)


    //    df.select($"info",get_json_object($"info","$.name").as("name"),get_json_object($"info","$.age").as("age"))
    //      .show(false)
  }
}
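
When several attributes are needed from the same string, from_json can parse it once into a typed struct instead of calling get_json_object per field. A minimal sketch against the same df (with session.implicits._ in scope as above); the schema below is an assumption matching the sample records:

import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Assumed schema for the sample records: {"name": ..., "age": ...}
val schema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)
))
df.select(from_json($"info", schema).as("j"))
  .select($"j.name", $"j.age")
  .show(false)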
