package day0614Work

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}

object workSQL {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("ReadSQL")
      .master("local")
      .getOrCreate()

    import spark.implicits._

    // Each log line is "user_id shop", separated by a single space.
    val df: DataFrame = spark.read.textFile("D:\\tmp\\work0614\\0614\\jd_visit.log")
      .map(v => {
        val p: Array[String] = v.split(" ")
        (p(0), p(1))
      })
      .toDF("user_id", "shop")

    df.printSchema()
    df.show(false)

    // UV per shop: each user is counted at most once per shop.
    val uvDF = df.groupBy("shop").agg(countDistinct("user_id").alias("uv"))
    uvDF.show(false)

    // Rank users within each shop by visit count (descending),
    // breaking ties deterministically by user_id (ascending).
    val windowSpec = Window.partitionBy("shop").orderBy(desc("visit_count"), asc("user_id"))
    val rankedDF = df.groupBy("shop", "user_id")
      .agg(count("shop").alias("visit_count"))
      .withColumn("rank", rank().over(windowSpec))

    // Keep the top 3 visitors of each shop.
    val top3DF = rankedDF.where(col("rank") <= 3)
    top3DF.show(false)

    spark.stop()
  }
}
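For reference, the same two computations can be written in plain Spark SQL against a temporary view. This is only a sketch of an equivalent formulation, not part of the job above; the view name visit_log and the alias rnk are invented for the example.

// Sketch: equivalent Spark SQL formulation (assumes df from above;
// "visit_log" is a made-up view name for this example).
df.createOrReplaceTempView("visit_log")

// UV per shop.
spark.sql(
  """SELECT shop, COUNT(DISTINCT user_id) AS uv
    |FROM visit_log
    |GROUP BY shop""".stripMargin).show(false)

// Top 3 visitors per shop.
spark.sql(
  """SELECT shop, user_id, visit_count, rnk
    |FROM (
    |  SELECT shop, user_id, COUNT(*) AS visit_count,
    |         RANK() OVER (PARTITION BY shop
    |                      ORDER BY COUNT(*) DESC, user_id ASC) AS rnk
    |  FROM visit_log
    |  GROUP BY shop, user_id
    |) t
    |WHERE rnk <= 3""".stripMargin).show(false)

The console output of the original job follows.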
root
|-- user_id: string (nullable = true)
|-- shop: string (nullable = true)
+-------+----+
|user_id|shop|
+-------+----+
|u1 |a |
|u2 |b |
|u1 |b |
|u1 |a |
|u3 |c |
|u4 |b |
|u1 |a |
|u2 |c |
|u5 |b |
|u4 |b |
|u6 |c |
|u2 |c |
|u1 |b |
|u2 |a |
|u2 |a |
|u3 |a |
|u5 |a |
+-------+----+
+----+---+
|shop|uv |
+----+---+
|c |3 |
|b |4 |
|a |4 |
+----+---+
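countDistinct gives an exact UV but has to shuffle every distinct user_id per shop. On a large log, Spark's built-in approx_count_distinct (HyperLogLog-based) trades a small, bounded error for much less memory. A minimal sketch reusing df from above; the 0.01 relative-error target is an arbitrary choice for the example, not from the original job.

// Sketch: approximate UV via HyperLogLog; 0.01 = target relative error.
val approxUvDF = df.groupBy("shop")
  .agg(approx_count_distinct("user_id", 0.01).alias("approx_uv"))
approxUvDF.show(false)

The top-3 ranking from the original job prints as: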
+----+-------+-----------+----+
|shop|user_id|visit_count|rank|
+----+-------+-----------+----+
|a |u1 |3 |1 |
|a |u2 |2 |2 |
|a |u3 |1 |3 |
|b |u1 |2 |1 |
|b |u4 |2 |2 |
|b |u2 |1 |3 |
|c |u2 |2 |1 |
|c |u3 |1 |2 |
|c |u6 |1 |3 |
+----+-------+-----------+----+
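A note on tie handling: because the window also orders by user_id, every row gets a distinct position, so rank(), dense_rank(), and row_number() all agree here. If the window ordered by visit_count alone, rank() could let more than three tied rows through the rank <= 3 filter. A sketch of the row_number() variant, which always returns exactly three rows per shop under the same windowSpec:

// Sketch: row_number() guarantees exactly 3 rows per shop even on
// tied visit counts (windowSpec is the same window as above).
val strictTop3DF = df.groupBy("shop", "user_id")
  .agg(count("shop").alias("visit_count"))
  .withColumn("rn", row_number().over(windowSpec))
  .where(col("rn") <= 3)
strictTop3DF.show(false)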