spark sql 技术说明与常见的操作(其三)

最新推荐文章于 2024-11-08 10:28:53 发布

GYY22897

最新推荐文章于 2024-11-08 10:28:53 发布

阅读量172

点赞数

文章标签：大数据 scala

原文链接：http://www.cnblogs.com/shi-qi/articles/10787988.html

版权

Spark SQL 内置函数
  1,DataFrame API 之中的内置函数进行了优化:调用后不会立即返回一个结果值,而是返回一个 Column 对象,用于在并行作业之中进行求值
  2, Column 可以用在 DataFrame 的各种操作之中,比如 select、filter 和 groupBy 等计算
  3, Spark SQL 内置函数(位于 org.apache.spark.sql.functions)分为:聚合函数、集合函数(例如 array_contains)、日期时间函数、混合函数(例如求随机值的 rand)等等

package day02
import org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.functions._  // 内置函数位置
/**
 * Demonstrates Spark SQL built-in aggregate functions: counts the distinct
 * user ids seen on each date in a small in-memory access log.
 */
object polymerization {

  /**
   * Parses one access-log line of the form "date,userid,extra".
   *
   * @param line raw comma-separated log line
   * @return `Some((date, userid))` when the line has exactly three fields and
   *         the userid is a valid Int; `None` otherwise, so malformed lines
   *         are skipped instead of failing the job
   */
  private def parseAccessLog(line: String): Option[(String, Int)] =
    line.split(",") match {
      case Array(date, userId, _) =>
        scala.util.Try(userId.toInt).toOption.map(id => (date, id))
      case _ => None
    }

  /**
   * Entry point: builds a local Spark context, loads the sample log into a
   * DataFrame and prints the number of distinct users per date.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("function").setMaster("local[*]")
    // The configuration must be passed in explicitly; the no-arg constructor
    // would ignore the app name and master set above.
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)
      // Needed for the implicit Symbol -> Column conversion ('date, 'userid) used below.
      import sqlContext.implicits._

      // Sample data: date, user id, extra field.
      val userAccessLog = Array(
        "2016-12-1,1133,13",
        "2016-12-1,1234,13",
        "2016-12-2,1131,1",
        "2016-12-1,1133") // this record is missing a field on purpose

      // Drop malformed records on the driver, splitting each line only once.
      val validRecords = userAccessLog.flatMap(line => parseAccessLog(line))

      // Distribute the records over 3 partitions and turn them into Rows.
      val userAccessLogRDD = sc.makeRDD(validRecords, 3)
        .map { case (date, userId) => Row(date, userId) }

      // Schema (metadata) of the DataFrame.
      val structType = StructType(Array(
        StructField("date", StringType, true),
        StructField("userid", IntegerType, true)))

      val userAccessLogRowDF = sqlContext.createDataFrame(userAccessLogRDD, structType)

      // Group by date, then count the distinct user ids within each group.
      // Columns inside agg are written with the single-quote (Symbol) syntax.
      userAccessLogRowDF.groupBy("date")
        .agg('date, countDistinct('userid))
        .collect()
        .foreach(println)
      /*
      Expected output (row order may vary):
      [2016-12-1,2016-12-1,2]  [2016-12-2,2016-12-2,1]
      */
    } finally {
      // Always release the Spark context, even if the job fails.
      sc.stop()
    }
  }
}

spark 自定义函数:
  1,spark UDF 是针对每行元素,返回一个输出
  2,spark UDAF 是针对多行输入,进行聚合计算，返回一个输出,功能更加强大

package Day3
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SparkSession}
/**
 * Demonstrates a Spark SQL user-defined function (UDF): registers `strLen`,
 * which returns the length of a string, and applies it to every row of a
 * temporary view through a SQL query.
 */
object UDF{
  /**
   * Entry point: builds a one-column DataFrame of names, registers the
   * `strLen` UDF and prints each name together with its length.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UDAF").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val sqlcontext = new SQLContext(sc)

      val names = Array("张三","李四","王五","孙二麻子","王五")
      // Spread the names over 3 partitions and wrap each one in a Row.
      val nameRows = sc.makeRDD(names, 3).map(n => Row(n))

      // Schema (metadata): a single nullable string column called "name".
      val structType = StructType(Array(StructField("name", StringType, true)))
      val namesDF = sqlcontext.createDataFrame(nameRows, structType)

      // Expose the DataFrame to SQL. createOrReplaceTempView replaces
      // registerTempTable, which is deprecated since Spark 2.0.
      namesDF.createOrReplaceTempView("names")

      // Define and register the UDF: first argument is the SQL function name,
      // second is the anonymous function computing the string length.
      sqlcontext.udf.register("strLen", (str: String) => str.length)

      sqlcontext.sql("select name ,strLen(name) from names")
        .collect()
        .foreach(row => println(row))
      /*
      Expected output:
      [张三,2] [李四,2] [王五,2] [孙二麻子,4] [王五,2]
      */
    } finally {
      // Always release the Spark context, even if the query fails.
      sc.stop()
    }
  }
}