Using SparkSql Functions


Reposted from: http://www.cnblogs.com/BYRans/p/5005342.html (the original is well written)

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

/**
  * Examples of common SparkSQL column functions: isin, like/rlike, cast,
  * lit/withColumn, coalesce, date_format, concat_ws, collect and broadcast.
  */
object sparkDataframe {


  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("anti_Join")

    val sqlContext = new SQLContext(new SparkContext(conf))

    import sqlContext.implicits._

    val scoreDF = Seq((1, "sk", 99), (2, "jim", 72), (1, "sk", 99)).toDF("id", "name", "score")

    // Use Option so the missing score becomes a real NULL (exercised by coalesce below)
    val stuDF = Seq((1, "sk12", Some(99)), (2, "9jim", None), (3, "jiem", Some(82))).toDF("id", "name", "score")
   
    val df = Seq(
      ("1", "This is my country", Seq(1, 2), "2016-09-21"),
      ("2", "我们热爱自己的祖国", Seq(3, 4), "2016-09-21"),
      ("3", "劳动人民最可爱", Seq(4, 5), "2016-09-21"),
      ("4", "劳动人民最可爱", Seq(7, 9), "2016-09-21")
    ).toDF("id", "name", "agelist", "time")

    // Extract the name column partition by partition (on Spark 1.x this returns an RDD[String])
    val sname = scoreDF.mapPartitions(_.map(_.getAs[String]("name")))

    // take(3) pulls at most three names back to the driver
    val scoreNames = sname.take(3).toSeq

    // isin: test membership against a driver-side collection
    //stuDF.filter(!$"name".isin(scoreNames: _*)).show()

    // like is not a fuzzy match: without SQL wildcards it compares the whole string,
    // so like("s") only matches the exact value "s"
    //stuDF.filter($"name".like("s")).show()
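    // A small added sketch (not in the original): with SQL wildcards, like does act
    // as a substring match; this would keep "9jim":
    //stuDF.filter($"name".like("%jim%")).show()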

    // rlike: filter with a Java regular expression
    stuDF.filter($"name".rlike("""[A-Za-z]+$""")).show()


    // Types supported by cast, quoting the Spark docs:
    /*
     * Casts the column to a different data type, using the canonical string representation
     * of the type. The supported types are: `string`, `boolean`, `byte`, `short`, `int`, `long`,
     * `float`, `double`, `decimal`, `date`, `timestamp`.
     */

    //scoreDF.selectExpr("cast(score as double)","name").show()
    //scoreDF.select($"score".cast("double").as("nscore"), $"name").show()
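    // A small added sketch (not in the original): per the list above, cast can also
    // parse date strings:
    //df.select($"time".cast("date")).show()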

    // withColumn + lit: add a column with a constant default value
    //scoreDF.select($"name").withColumn("city", lit("ShangHai")).show()

    // left join, then coalesce: substitute a default when the right-side value is NULL
    scoreDF.as("a").join(stuDF.as("b"), $"a.id" === $"b.id", "left")
      .select($"a.name", coalesce($"b.score", lit(0)).as("score"))
      .show()
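    // An alternative sketch (an assumption, not from the original): DataFrameNaFunctions
    // can substitute NULLs in named columns directly, without coalesce:
    //stuDF.na.fill(0, Seq("score")).show()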
 
    // Read the schema's field names and drop the ones we don't need, via filterNot
    val names = df.schema.fieldNames.filterNot(_ == "agelist")
    // date_format formats time values, e.g. date_format($"time", "yyyyMMdd");
    // coalesce only substitutes when the value is NULL
    df.select(date_format($"time", "dd"), coalesce($"name", lit("劳动"))).show()
    // Concatenate the remaining fields into a single line
    df.selectExpr(s"concat_ws('::::', ${names.mkString(",")}) line").show()

    df.select(concat_ws("&", $"id", $"name").as("data")).show()

    // collect pulls the data back to the driver; watch out for serialization
    // problems and driver memory
    val ids: Array[String] = df.map(_.getAs[String]("id")).collect()

    // Broadcast the field names so every executor gets one read-only copy
    val broadcastNames = sqlContext.sparkContext.broadcast(names)
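    // A minimal usage sketch (an assumption, not in the original): broadcast values
    // are read on the executors, e.g. inside a UDF. Here we flag rows whose name
    // contains any of the broadcast field names.
    val hasFieldName = udf((s: String) => broadcastNames.value.exists(s.contains))
    df.select($"name", hasFieldName($"name").as("hit")).show()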


  }
}