Code
package com.mydemo
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object KCDF02 {
  // One row of the input file: "<name> <class> <score>", space-separated.
  case class Score(name: String, clazz: Int, score: Int)

  def main(args: Array[String]): Unit = {
    // Build the SparkSession (single local thread — demo only).
    val spark = SparkSession
      .builder()
      .appName("SparkSQLDemo")
      .master("local[1]")
      .getOrCreate()

    // Underlying SparkContext; raise log level to keep console output readable.
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    // Point Hadoop at a local winutils install. Without it, the local file
    // writes at the bottom can fail (empty output / NullPointerException) on Windows.
    System.setProperty("hadoop.home.dir","C:\\Users\\myuser\\winutils\\")

    // Read the raw input file as lines. TODO(review): path is machine-specific.
    val fileRDD: RDD[String] = sc.textFile("C:\\Users\\myuser\\Desktop\\3.txt")

    // Parse each line into a Score record.
    // NOTE(review): fields(1)/fields(2).toInt throw on malformed rows —
    // input is assumed well-formed for this demo.
    val rowRDD: RDD[Score] = fileRDD
      .map(_.split(" "))
      .map(fields => Score(fields(0), fields(1).toInt, fields(2).toInt))

    // Implicit conversions (enables rowRDD.toDF below).
    import spark.implicits._

    // Convert to a DataFrame (renaming clazz -> class) and register it
    // as a temp view so it can be queried with SQL.
    val scoreDF: DataFrame = rowRDD.toDF( "name", "class", "score")
    scoreDF.createOrReplaceTempView("scores")
    scoreDF.show()

    // Plain aggregate: total row count.
    spark.sql("select count(name) from scores").show()
    // over() — window spanning the whole table.
    spark.sql("select name,class,score,count(name) over() name_count from scores").show()
    // over(partition by ...) — one window per class.
    spark.sql("select name,class,score,count(name) over(partition by class) name_count from scores").show()
    // row_number() over(order by ...) — sequential ranking, no ties.
    spark.sql("select name,class,score,row_number() over(order by score desc) rank from scores").show()
    // FIX: the original built this DataFrame but never displayed it — the
    // .show() call was missing, so the query result was silently discarded.
    spark.sql("select name,class,score,row_number() over(partition by class order by score desc) rank from scores").show()
    // rank() — ties share a rank, following ranks are skipped ("gapped" ranking).
    spark.sql("select name,class,score,rank() over(order by score desc) rank from scores").show()
    spark.sql("select name,class,score,rank() over(partition by class order by score desc) rank from scores").show()
    // dense_rank() — ties share a rank, no gaps ("continuous" ranking).
    spark.sql("select name,class,score,dense_rank() over(order by score desc) rank from scores").show()
    spark.sql("select name,class,score,dense_rank() over(partition by class order by score desc) rank from scores").show()
    // ntile(n) — split the ordered rows into n roughly equal buckets.
    spark.sql("select name, class, score, ntile(6) over(order by score desc) rank from scores").show()
    val frame: DataFrame = spark.sql("select name, class, score, ntile(6) over(partition by class order by score desc) rank from scores")

    // Persist the last result locally, as CSV and as plain text.
    // TODO(review): output paths are machine-specific.
    frame.write.csv("C:\\Users\\myuser\\IdeaProjects\\spark01\\src\\main\\scala\\com\\mydemo\\csv")
    frame.rdd.saveAsTextFile("C:\\Users\\myuser\\IdeaProjects\\spark01\\src\\main\\scala\\com\\mydemo\\text")

    // spark.stop() also stops the underlying SparkContext; the explicit
    // sc.stop() first is redundant but harmless.
    sc.stop()
    spark.stop()
  }
}
Hadoop local environment setup guide
Link: https://blog.csdn.net/myanddream/article/details/106192396