找出状态变化的行
import org.apache.spark.sql.SparkSession
/**
 * Case study: find the rows after which the `change` value flips.
 * Implemented with a window function (row_number) plus a self-join
 * of the ranked table on consecutive ranks.
 */
object FindChangeInfos {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("test").getOrCreate()
    spark.sparkContext.setLogLevel("Error")

    // Read the CSV with a header row; every column comes back as a string.
    val source = spark.read.option("header", true).csv("T:/code/spark_scala/data/spark/test.csv")
    source.createTempView("temp")

    // Number the rows inside each id group; `name` happens to encode row order.
    spark.sql(
      """
        |select id,change,name,row_number() over(partition by id order by name ) as rank
        |from temp
      """.stripMargin).createTempView("t")

    // Pair each row (a) with its immediate successor (b) and keep the rows
    // whose successor carries a different `change` value.
    spark.sql(
      """
        | select a.id,a.change,a.name
        | from t a join t b on a.id = b.id
        | where a.change != b.change and a.rank = b.rank-1
      """.stripMargin).show()
  }
}
原始数据:
ID,CHANGE,NAME
id1,1,a
id1,1,b
id1,1,c
id1,2,d
id1,2,e
id1,1,f
id2,2,g
id2,2,h
id2,1,i
id2,1,j
id2,2,k
id3,1,l
id3,1,m
id3,2,n
id3,3,o
id3,4,p
经典案例分析_不同场景统计用户访问网站总时长
数据:
111 2019-06-20 1
111 2019-06-21 2
111 2019-06-22 3
222 2019-06-20 4
222 2019-06-21 5
222 2019-06-22 6
333 2019-06-20 7
333 2019-06-21 8
333 2019-06-22 9
444 2019-06-23 10
代码示例:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/**
 * +---+----------+---+
 * |uid| accDate|dur|
 * +---+----------+---+
 * |111|2019-06-20| 1|
 * |111|2019-06-21| 2|
 * |111|2019-06-22| 3|
 * |222|2019-06-20| 4|
 * |222|2019-06-21| 5|
 * |222|2019-06-22| 6|
 * |333|2019-06-20| 7|
 * |333|2019-06-21| 8|
 * |333|2019-06-22| 9|
 * |444|2019-06-23| 10|
 * +---+----------+---+
 * Statistics over per-user, per-day browsing durations:
 *
 * 1. Per user and per day, the running total of duration (each day
 *    accumulates all earlier days).
 *
 * 2. Per user, the combined duration of the current day and the previous day.
 *
 * 3. Per user, the duration of the current day where "current day" also
 *    includes the previous and the following day.
 *
 * 4. Per user, the grand total of all visit durations.
 */
object AccInfo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("AccInfo")
      .getOrCreate()
    val ctx = spark.sparkContext
    ctx.setLogLevel("Error")

    // Input is tab-separated: uid \t accDate \t duration.
    val lines = ctx.textFile("T:/code/spark_scala/data/spark/userAccInfo.txt")
    // session.read.json("")
    val records: RDD[AccInfo] = lines.map { line =>
      val fields = line.split("\t")
      AccInfo(fields(0), fields(1), fields(2).toInt)
    }

    import spark.implicits._
    records.toDF().createTempView("accInfo")

    // 1. Running total: an ordered window defaults to
    //    "rows between unbounded preceding and current row".
    spark.sql(
      """
        | select uid,accDate,sum(dur) over(partition by uid order by accDate) as current_day_dur
        | from accInfo
      """.stripMargin).show()

    // 2. Current day + previous day.
    spark.sql(
      """
        | select uid,accDate,sum(dur) over(partition by uid order by accDate rows between 1 preceding and current row ) as totalDur
        | from accInfo
      """.stripMargin).show()

    // 3. Previous day + current day + following day.
    spark.sql(
      """
        | select uid,accDate,sum(dur) over(partition by uid order by accDate rows between 1 preceding and 1 following) as totalDur
        | from accInfo
      """.stripMargin).show()

    // 4a. Grand total per user via GROUP BY.
    spark.sql(
      """
        | select uid,sum(dur) as totaldur
        | from accInfo
        | group by uid
      """.stripMargin).show()

    // 4b. Same total via an unordered window (keeps one row per input row).
    spark.sql(
      """
        | select uid,accdate,sum(dur) over(partition by uid ) as totaldur
        | from accInfo
      """.stripMargin).show()
  }
}

/** One visit record: user id, access date, and duration for that day. */
case class AccInfo(uid: String, accDate: String, dur: Int)
经典案例分析_读取JsonArray数据
示例代码:
原始数据:
{"name":"zhangsan","age":18,"scores":[{"xueqi":1,"yuwen":98,"shuxue":90,"yingyu":100},{"xueqi":2,"yuwen":98,"shuxue":78,"yingyu":100}]}
{"name":"lisi","age":19,"scores":[{"xueqi":1,"yuwen":58,"shuxue":50,"yingyu":78},{"xueqi":2,"yuwen":56,"shuxue":76,"yingyu":13}]}
{"name":"wangwu","age":17,"scores":[{"xueqi":1,"yuwen":18,"shuxue":90,"yingyu":45},{"xueqi":2,"yuwen":76,"shuxue":42,"yingyu":45}]}
{"name":"zhaoliu","age":20,"scores":[{"xueqi":1,"yuwen":68,"shuxue":23,"yingyu":63},{"xueqi":2,"yuwen":23,"shuxue":45,"yingyu":87}]}
{"name":"tianqi","age":22,"scores":[{"xueqi":1,"yuwen":88,"shuxue":91,"yingyu":41},{"xueqi":2,"yuwen":56,"shuxue":79,"yingyu":45}]}
import org.apache.spark.sql.SparkSession
/**
 * Reading JSON records whose `scores` field is a JSON array:
 * explode(collection) performs a one-to-many expansion, turning each
 * array element into its own row.
 */
object ReadJsonArrayData {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("test")
      .master("local")
      .getOrCreate()

    val raw = spark.read.json("T:/code/spark_scala/data/spark/jsonArrayFile")

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Explode the scores array: one output row per (student, semester) pair.
    val exploded = raw.select(col("name"), raw.col("age"), explode(raw.col("scores")).as("el"))

    // Flatten the struct column into individual score columns.
    exploded
      .select(
        $"name",
        $"age",
        col("el.xueqi"),
        col("el.yuwen"),
        col("el.shuxue"),
        col("el.yingyu"))
      .show()

    // Equivalent pure-SQL formulation:
    // frame.createTempView("temp")
    // val df = session.sql(
    //   """
    //     | select
    //     | name,age ,el.xueqi,el.yuwen,el.shuxue,el.yingyu
    //     | from
    //     | (select name,age,explode(scores) as el from temp ) t
    //   """.stripMargin).show()
  }
}
经典案例分析_读取Json字符串直接获取属性对应的值
import org.apache.spark.sql.SparkSession
/**
 * Extracting a field directly out of a JSON-formatted string column:
 * get_json_object(jsonStr, jsonPath).
 */
object ReadJsonStringExp {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("test").getOrCreate()

    // Raw JSON documents held as plain strings (triple quotes avoid escaping).
    val rawJson = List[String](
      """{"name":"zhangsan","age":18}""",
      """{"name":"lisi","age":19}""",
      """{"name":"wangwu","age":20}""",
      """{"name":"maliu","age":21}"""
    )

    import spark.implicits._
    val jsonDF = rawJson.toDF("info")
    jsonDF.createTempView("t")

    // "$.name" / "$.age" are JSONPath expressions rooted at the document.
    spark.sql(
      """
        | select info ,get_json_object(info,"$.name") as name ,get_json_object(info,"$.age") as age
        | from t
      """.stripMargin).show(false)

    // DataFrame-API equivalent:
    // df.select($"info",get_json_object($"info","$.name").as("name"),get_json_object($"info","$.age").as("age"))
    //   .show(false)
  }
}