Sample Data
Below is a JSON sample; the data file is "JsonTest02.json":
{"openid":"opEu45VAwuzCsDr6iGIf4qhnUZUI","phoneNum":"18334832972","money":"30","date":"2018-09-13T02:15:16.054Z","lat":39.688011,"log":116.066689,"province":"四川省","city":"成都市","district":"房山区","terminal":"ios","status":"1"}
{"openid":"opEu45VAwuzCsDr6iGIf4qhnUZUI","phoneNum":"15101592939","money":"50","date":"2018-09-13T02:15:16.054Z","lat":39.688011,"log":116.066689,"province":"山西省","city":"大同市","district":"房山区","terminal":"Android","status":"0"}
{"openid":"opEu45VAwuzCsDr6iGIf4qhnUZUI","phoneNum":"15101599139","money":"30","date":"2018-09-13T02:15:16.054Z","lat":39.688011,"log":116.066689,"province":"山西省","city":"大同市","district":"房山区","terminal":"windows","status":"1"}
Requirements
- Either Spark Core or Spark SQL may be used. The requirements are:
- 1. Compute each user's total recharge amount, sorted in descending order.
- 2. Count the total number of logins per system type, sorted in descending order.
- 3. For each province, find the Top 3 users by login count.
- Relevant fields (see the parsing sketch after this list):
- phoneNum: phone number (the user account)
- terminal: system type
- province: province
- money: recharge amount
- status: recharge status ('1' = success, '0' = failure)
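To make the field access concrete, here is a minimal parsing sketch with fastjson (the field names come from the sample above; the literal is abridged from the first record, and the variable names are just for illustration):

import com.alibaba.fastjson.JSON

val line = """{"phoneNum":"18334832972","money":"30","terminal":"ios","province":"四川省","status":"1"}"""
val obj = JSON.parseObject(line)
val phoneNum = obj.getString("phoneNum")        // user account
val money = obj.getIntValue("money")            // recharge amount (stored as a string in the data)
val isSuccess = obj.getString("status") == "1"  // '1' means the recharge succeeded
println(s"$phoneNum recharged $money, success = $isSuccess")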
Code
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import com.alibaba.fastjson.{JSON, JSONObject}
object Jsonread {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("sparkcore").setMaster("local[*]")
val sc = new SparkContext(conf)
val textfile: RDD[String] = sc.textFile("in/JsonTest02.json")
println("****** SPARKCORE******")
println("****** 1.统计每个用户充值总金额并降序排序******")
val file: RDD[String] = textfile.filter(x => {
val jsonObject: JSONObject = JSON.parseObject(x)
// keep only successful recharges
jsonObject.getString("status") == "1"
}).filter(x => {
val jsonObject: JSONObject = JSON.parseObject(x)
// keep only records that actually contain both fields
jsonObject.containsKey("money") && jsonObject.containsKey("phoneNum")
})
file.cache()
//file.foreach(println)
// Note on swapping tuple positions: I didn't realize sortBy can key on _._2 directly,
// so I kept swapping positions with map(x => (x._2, x._1)). Silly in hindsight.
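// For reference, both forms below are equivalent ways to sort by value (a sketch):
//   ...reduceByKey(_ + _).map(_.swap).sortByKey(ascending = false)
//   ...reduceByKey(_ + _).sortBy(_._2, ascending = false)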
file.map(x => {
val jsonObject = JSON.parseObject(x)
val phoneNum = jsonObject.getString("phoneNum")
val money = jsonObject.getIntValue("money")
(phoneNum, money)
}).reduceByKey(_ + _).sortBy(_._2, false).foreach(println)
println("****** 2.统计所有系统类型登录总次数并降序排序******")
//感觉数据有点问题,就没有用集合来存了,core输出和sql一样
val rddTwo =
textfile.map{ x =>
val jSONObject = JSON.parseObject(x)
(jSONObject.getString("terminal"),1)
}.reduceByKey(_+_).sortBy(_._2,false).foreach(println)
println("****** 3.统计各省的每个用户登录次数的Top3******")
//就是生成了一张表,没在案例使用
val rowrdd = textfile.map( m => {
val jsonObject = JSON.parseObject(m)
Row(jsonObject.getString("province"),jsonObject.getString("phoneNum"),1)
}
)
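// A sketch of how rowrdd could be used (hypothetical, not part of the original flow;
// it assumes the SparkSession named spark, which is only created further down):
// import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
// val schema = StructType(Seq(
//   StructField("province", StringType),
//   StructField("phoneNum", StringType),
//   StructField("cnt", IntegerType)))
// spark.createDataFrame(rowrdd, schema).createOrReplaceTempView("logins")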
// Build (k, v) tuples and group by key
textfile.map(m => {
val jsonObject = JSON.parseObject(m)
val provinceName = jsonObject.getString("province")
val phoneNumName = jsonObject.getString("phoneNum")
// temporarily join province and phone number into a single key
(provinceName + "," + phoneNumName, 1)
}).reduceByKey(_ + _).map(m => {
// split the key apart and rebuild the (k, v) structure
val provinceName = m._1.split(",")(0)
val phoneNumName = m._1.split(",")(1)
val top = m._2
(provinceName, (phoneNumName, top))
}).groupByKey.map(m => {
val provinceName = m._1
val top3 = m._2.toArray.sortWith(_._2 > _._2).take(3)
(provinceName, top3)
}).foreach(m => {
println(m._1 + " top 3:")
m._2.foreach(x => {
println(x)
})
})
println("==================================华丽分割线===========================================")
val spark = SparkSession.builder.master("local[*]").appName("Jsonread").getOrCreate()
val frame: DataFrame = spark.read.json("in/JsonTest02.json")
frame.createOrReplaceTempView("tmp")
println("****** SPARK SQL******")
println("****** 1.统计每个用户充值总金额并降序排序******")
spark.sql("select phoneNum,sum(money) from tmp where status = 1 group by phoneNum order by sum(money)").show
println("****** 2.统计所有系统类型登录总次数并降序排序******")
spark.sql(
"select terminal,count(openid) " +
"from tmp group by terminal" +
" order by count(openid) desc ").show
println("****** 3.统计各省的每个用户登录次数的Top3******")
spark.sql(
"""select test.province, test.phoneNum
  |from (
  |  select t.province, t.phoneNum,
  |         row_number() over (partition by t.province order by t.copenid desc) as rn
  |  from (select province, phoneNum, count(openid) as copenid
  |        from tmp
  |        group by province, phoneNum) t
  |) test
  |where test.rn < 4""".stripMargin).show()
println("****** SPARK DSL SQL******")
import spark.implicits._
println("****** 1.统计每个用户充值总金额并降序排序******")
frame.filter($"status" === 1).groupBy($"phoneNum").agg(sum($"money").as("总金额")).orderBy($"总金额".desc).show()
println("****** 2.统计所有系统类型登录总次数并降序排序******")
frame.groupBy($"terminal").agg(count($"openid").as("总次数")).orderBy($"总次数".desc).show()
println("****** 3.统计各省的每个用户登录次数的Top3******")
val w = Window.partitionBy("province").orderBy($"总次数".desc)
val endFrame = frame.groupBy($"province",$"phoneNum").agg(count($"openid").as("总次数")).
select($"province",$"phoneNum",row_number().over(w).as("rn")).where($"rn".leq(3)).show()
}
}
Results:
The Spark Core output just doesn't display as nicely as the SQL results. I'm still not very familiar with the Spark operators, so feel free to point out mistakes, and if there is a better Spark Core approach, please share it.
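For what it's worth, here is one possible cleaner Spark Core variant of requirement 3 (a sketch under the same setup as above, where textfile is the raw RDD[String]; untested beyond the sample data). Keying by a (province, phoneNum) tuple avoids the concatenate-then-split step:

val top3PerProvince = textfile.map { line =>
  val obj = JSON.parseObject(line)
  ((obj.getString("province"), obj.getString("phoneNum")), 1)
}.reduceByKey(_ + _)                                       // logins per (province, user)
  .map { case ((province, phone), cnt) => (province, (phone, cnt)) }
  .groupByKey()
  .mapValues(_.toList.sortBy(-_._2).take(3))               // top 3 users per province
top3PerProvince.collect().foreach { case (province, users) =>
  println(province + " top 3: " + users.mkString(", "))
}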