样例列:
case class Log(userIp:String,UserId:Int,url:String,access:String)
val conf = new SparkConf().setAppName(this.getClass.getName).setMaster("local[*]")
val spark = SparkSession.builder().config(conf).getOrCreate()
val sc = spark.sparkContext
sc.setCheckpointDir("checkpoint")
val lineRDD = sc.textFile("C:\\SchoolJavaCode\\06月-java\\spark_12_zy\\visitData.txt")
.map(_.split(","))
val logRDD = lineRDD.map(x => Log(x(0), x(1).toInt, x(2), x(3)))
import spark.implicits._
val logDF: DataFrame = logRDD.toDF()
// logDF.show()
logDF.registerTempTable("log")
logDF.createOrReplaceTempView("log")
//(5) 使用sparkSQL计算出每个url的用户访问量(uv),排序后打印控制台
// println("使用sparkSQL计算出每个url的用户访问量(uv),排序后打印控制台")
// spark.sql("select distinct url, count( distinct userId) as count from log group by url order by count asc;").show(30,false)
// //(6) 使用sparkSQL计算出每个url每天的用户访问量(uv),并打印控制台
// println("使用sparkSQL计算出每个url每天的用户访问量(uv),并打印控制台")
// spark.sql("select distinct left(access,10) as date,count( distinct userId), url from log group by url,date").show(30,false)
//(7) 使用sparkSQL计算出每个url的浏览量(pv),倒序排序并打印控制台
// println("使用sparkSQL计算出每个url的浏览量(pv),倒序排序并打印控制台")
// spark.sql("select url,count(1) as count from log group by url order by count desc;").show(30,false)
//(8) 取出浏览量(pv)前3的url及其浏览量,打印控制台
// println("取出浏览量(pv)前3的url及其浏览量,打印控制台")
// spark.sql("select url,count(1) as count from log group by url order by count desc limit 3;").show(30,false)
//(9) 使用sparkSQL计算出每个url每天的浏览量(pv),并打印控制台
// println("使用sparkSQL计算出每个url每天的浏览量(pv),并打印控制台")
// spark.sql("select left(access,10) as date,count(userId), url from log group by date,url;").show(30, false)
//(10) 使用sparkSQL统计哪天的浏览量(pv)最大,并打印控制台
println("使用sparkSQL统计哪天的浏览量(pv)最大,并打印控制台")
spark.sql("select left(access,10) as date,count(1) as count from log group by date order by count desc limit 1;").show(30,false)
//(11) 使用sparkSQL统计用户id为9的用户访问过几个网页(去重),打印控制台
println("使用sparkSQL统计用户id为9的用户访问过几个网页(去重),打印控制台")
spark.sql("select distinct url from log where userId='9' group by url;").show(30,false)
12:
spark.sql("select url,count(1) as count,concat_ws(',',collect_set(userId)) from log group by url order by count desc limit 1;").show(30,false)