DataFrame数据操作以及与RDD互相操作案例

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/YaboSun/article/details/80690194

DataFrame基本操作

主要对于DataFrame学习的一些简单操作,代码如下:

import org.apache.spark.sql.SparkSession

/**
  * DataFrame 基本操作
  */
/**
  * Basic DataFrame operations: load a JSON file and exercise the
  * select / filter / groupBy DataFrame API, each mirrored by the
  * equivalent SQL shown in the comments.
  */
object DataFrameApp {
  def main(args: Array[String]): Unit = {
    // Local SparkSession running on two worker threads.
    val spark = SparkSession.builder()
      .appName("DataFrameApp")
      .master("local[2]")
      .getOrCreate()

    // Load the sample people JSON file shipped with the Spark distribution.
    val df = spark.read.format("json").load("file:///usr/local/spark/examples/src/main/resources/people.json")

    // Print the schema Spark inferred from the JSON.
    df.printSchema()

    // Show the first rows (20 by default; a count can be passed explicitly).
    df.show()

    // Project a single column: SELECT name FROM table
    df.select("name").show()

    // Project with a computed, aliased column: SELECT name, age + 10 AS `new age` FROM table
    val shiftedAge = (df.col("age") + 10).as("new age")
    df.select(df.col("name"), shiftedAge).show()

    // Filter rows: SELECT * FROM table WHERE age > 19
    df.filter(df.col("age") > 19).show()

    // Group then aggregate: SELECT age, count(1) FROM table GROUP BY age
    df.groupBy("age").count().show()

    spark.stop()
  }
}

进阶操作

DataFrame与RDD互相操作—反射方式

import org.apache.spark.sql.SparkSession

/**
  * DataFrame与RDD互相操作实现---反射实现
  */
/**
  * RDD-to-DataFrame conversion using the reflection approach:
  * a case class supplies the schema, and `rdd.toDF` (from the
  * session's implicits) performs the conversion.
  */
object DataFrameRDDApp {
  def main(args: Array[String]): Unit = {
    // Local SparkSession running on two worker threads.
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("DataFrameRDDApp")
      .getOrCreate()

    // Each line of the input file is expected to be "id,name,age".
    val lines = spark.sparkContext.textFile("/home/hadoop/infos.txt")

    // The implicit conversions (e.g. `toDF`) live on this session instance
    // and must be imported before use.
    import spark.implicits._

    // Parse every CSV line into an Info; reflection derives the schema
    // from the case class fields.
    val infoDF = lines
      .map(_.split(","))
      .map(fields => Info(fields(0).toInt, fields(1), fields(2).toInt))
      .toDF
    infoDF.show()

    // DataFrame API query: rows with age > 30.
    infoDF.filter(infoDF.col("age") > 30).show()

    // The same query expressed in SQL against a temporary view.
    infoDF.createOrReplaceTempView("info")
    spark.sql("select * from info where age > 30").show()

    spark.stop()
  }

  // Record type whose fields (id, name, age) define the DataFrame schema.
  // Declared at object level (outside main) as required by toDF's reflection.
  case class Info(id: Int, name: String, age: Int)
}

DataFrame与RDD互相操作—编程方式

def program(spark: SparkSession): Unit = {
    // RDD => DataFrame using the programmatic (explicit schema) approach.
    // NOTE(review): this excerpt relies on Row, StructType, StructField,
    // IntegerType and StringType from org.apache.spark.sql / sql.types —
    // the required imports are omitted from the snippet as shown.

    // Each line of the input file is expected to be "id,name,age".
    val source = spark.sparkContext.textFile("/home/hadoop/infos.txt")

    // Convert every CSV line into an untyped Row.
    val rowRDD = source
      .map(_.split(","))
      .map(parts => Row(parts(0).toInt, parts(1), parts(2).toInt))

    // Declare the schema explicitly, one nullable field per column.
    val schema = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))

    // Combine the row RDD with the schema to obtain the DataFrame.
    val df = spark.createDataFrame(rowRDD, schema)
    df.printSchema()
    df.show()
  }

参考:
http://spark.apache.org/docs/latest/sql-programming-guide.html#datasets-and-dataframes

阅读更多
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页