Using SparkSql Functions


Reposted from: http://www.cnblogs.com/BYRans/p/5005342.html (the original is well written)

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

/**
  * Examples of common SparkSQL column functions: isin, like/rlike, cast,
  * lit/withColumn, coalesce, date_format, concat_ws, collect and broadcast.
  */
object sparkDataframe {


  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("anti_Join")

    val sqlContext = new SQLContext(new SparkContext(conf))

    import sqlContext.implicits._

    val scoreDF = Seq((1, "sk", 99), (2, "jim", 72), (1, "sk", 99)).toDF("id", "name", "score")

    // Use Option so the missing score becomes a real NULL (exercised by coalesce below)
    val stuDF = Seq((1, "sk12", Some(99)), (2, "9jim", None), (3, "jiem", Some(82))).toDF("id", "name", "score")
   
    val df = Seq(
      ("1", "This is my country", Seq(1, 2), "2016-09-21"),
      ("2", "我们热爱自己的祖国", Seq(3, 4), "2016-09-21"),
      ("3", "劳动人民最可爱", Seq(4, 5), "2016-09-21"),
      ("4", "劳动人民最可爱", Seq(7, 9), "2016-09-21")
    ).toDF("id", "name", "agelist", "time")

    // Extract the name column partition by partition (on Spark 1.x this returns an RDD[String])
    val sname = scoreDF.mapPartitions(_.map(_.getAs[String]("name")))

    // take(3) pulls at most three names back to the driver
    val scoreNames = sname.take(3).toSeq

    // isin: test membership against a driver-side collection
    //stuDF.filter(!$"name".isin(scoreNames: _*)).show()

    // like is not a fuzzy match: without SQL wildcards it compares the whole string,
    // so like("s") only matches the exact value "s"
    //stuDF.filter($"name".like("s")).show()
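    // A small added sketch (not in the original): with SQL wildcards, like does act
    // as a substring match; this would keep "9jim":
    //stuDF.filter($"name".like("%jim%")).show()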

    // rlike: filter with a Java regular expression
    stuDF.filter($"name".rlike("""[A-Za-z]+$""")).show()


    // Types supported by cast, quoting the Spark docs:
    /*
     * Casts the column to a different data type, using the canonical string representation
     * of the type. The supported types are: `string`, `boolean`, `byte`, `short`, `int`, `long`,
     * `float`, `double`, `decimal`, `date`, `timestamp`.
     */

    //scoreDF.selectExpr("cast(score as double)","name").show()
    //scoreDF.select($"score".cast("double").as("nscore"), $"name").show()
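    // A small added sketch (not in the original): per the list above, cast can also
    // parse date strings:
    //df.select($"time".cast("date")).show()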

    // withColumn + lit: add a column with a constant default value
    //scoreDF.select($"name").withColumn("city", lit("ShangHai")).show()

    // left join, then coalesce: substitute a default when the right-side value is NULL
    scoreDF.as("a").join(stuDF.as("b"), $"a.id" === $"b.id", "left")
      .select($"a.name", coalesce($"b.score", lit(0)).as("score"))
      .show()
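    // An alternative sketch (an assumption, not from the original): DataFrameNaFunctions
    // can substitute NULLs in named columns directly, without coalesce:
    //stuDF.na.fill(0, Seq("score")).show()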
 
    // Read the schema's field names and drop the ones we don't need, via filterNot
    val names = df.schema.fieldNames.filterNot(_ == "agelist")
    // date_format formats time values, e.g. date_format($"time", "yyyyMMdd");
    // coalesce only substitutes when the value is NULL
    df.select(date_format($"time", "dd"), coalesce($"name", lit("劳动"))).show()
    // Concatenate the remaining fields into a single line
    df.selectExpr(s"concat_ws('::::', ${names.mkString(",")}) line").show()

    df.select(concat_ws("&", $"id", $"name").as("data")).show()

    // collect pulls the data back to the driver; watch out for serialization
    // problems and driver memory
    val ids: Array[String] = df.map(_.getAs[String]("id")).collect()

    // Broadcast the field names so every executor gets one read-only copy
    val broadcastNames = sqlContext.sparkContext.broadcast(names)
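    // A minimal usage sketch (an assumption, not in the original): broadcast values
    // are read on the executors, e.g. inside a UDF. Here we flag rows whose name
    // contains any of the broadcast field names.
    val hasFieldName = udf((s: String) => broadcastNames.value.exists(s.contains))
    df.select($"name", hasFieldName($"name").as("hit")).show()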


  }
}