A Simple Introduction to SparkSQL Operations

Code

package com.mydemo
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object KCDF02 {
  case class Score(name: String, clazz: Int, score: Int)
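  // "clazz" is used because "class" is a reserved word in Scala; toDF below renames the column to "class"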
  def main(args: Array[String]): Unit = {
  
    // Create the SparkSession (the entry point for Spark SQL)
    val spark = SparkSession
      .builder()
      .appName("SparkSQLDemo")
      .master("local[1]")
      .getOrCreate()
    // Get the underlying SparkContext
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    // Point Spark at a local Hadoop installation (winutils). Without it, writing to the
    // local filesystem on Windows may produce no output or throw a NullPointerException;
    // see the setup guide linked at the end of this post.
    System.setProperty("hadoop.home.dir","C:\\Users\\myuser\\winutils\\")
    // Read the input file as an RDD of lines
    val fileRDD: RDD[String] = sc.textFile("C:\\Users\\myuser\\Desktop\\3.txt")
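    // 3.txt holds one space-separated record per line, e.g. (made-up sample rows):
    //   tom 1 90
    //   jerry 2 85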
    // Parse each line into a Score (space-separated: name, class, score)
    val rowRDD: RDD[Score] = fileRDD.map(_.split(" ")).map(line => Score(line(0), line(1).toInt, line(2).toInt))

    // Bring in implicit conversions (needed for toDF)
    import spark.implicits._
    // Convert the RDD to a DataFrame with named columns
    val scoreDF: DataFrame = rowRDD.toDF("name", "class", "score")
    // Register a temporary view so the DataFrame can be queried with SQL
    scoreDF.createOrReplaceTempView("scores")
    scoreDF.show()

    spark.sql("select count(name) from scores").show()

    // over(): an empty window; the count over the whole result set is appended to every row
    spark.sql("select name,class,score,count(name) over() name_count from scores").show()

    // over(partition by ...): count within each class, repeated on every row of that class
    spark.sql("select name,class,score,count(name) over(partition by class) name_count from scores").show()

    // row_number() over(order by ...): a unique, gapless sequence; ties are broken arbitrarily
    spark.sql("select name,class,score,row_number() over(order by score desc) rank from scores").show()
    spark.sql("select name,class,score,row_number() over(partition by class order by score desc) rank from scores")

    // rank() over(order by ...): gap ranking; ties share a rank and the next rank jumps
    spark.sql("select name,class,score,rank() over(order by score desc) rank from scores").show()
    spark.sql("select name,class,score,rank() over(partition by class order by score desc) rank from scores").show()

    // dense_rank(): consecutive ranking; ties share a rank with no gaps afterwards
    spark.sql("select name,class,score,dense_rank() over(order by score desc) rank from scores").show()
    spark.sql("select name,class,score,dense_rank() over(partition by class order by score desc) rank from scores").show()

    // ntile: bucket ranking
    spark.sql("select name, class, score, ntile(6) over(order by score desc) rank from scores").show()
    val frame: DataFrame = spark.sql("select name, class, score, ntile(6) over(partition by class order by score desc) rank from scores")
    // Write the result to the local filesystem
    frame.write.csv("C:\\Users\\myuser\\IdeaProjects\\spark01\\src\\main\\scala\\com\\mydemo\\csv")
    frame.rdd.saveAsTextFile("C:\\Users\\myuser\\IdeaProjects\\spark01\\src\\main\\scala\\com\\mydemo\\text")
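    // Note: both writers create a directory of part files, and both fail if the target
    // path already exists; frame.write.mode("overwrite").csv(...) replaces it instead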

    sc.stop()
    spark.stop()
  }
}
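
For comparison, the same per-class ranking can be written with the DataFrame API instead of a SQL string. A minimal sketch, assuming the scoreDF defined above:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

// Window spec: rows grouped by class, highest score first
val byClass = Window.partitionBy("class").orderBy(col("score").desc)
scoreDF.withColumn("rank", row_number().over(byClass)).show()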

Hadoop local environment setup guide:
https://blog.csdn.net/myanddream/article/details/106192396
