SparkSQL之创建DataFrame

最新推荐文章于 2023-11-30 09:36:05 发布

维维weiwei

最新推荐文章于 2023-11-30 09:36:05 发布

阅读量775

点赞数

分类专栏： Spark生态系统

本文链接：https://blog.csdn.net/tangshiweibbs/article/details/70239308

版权

Spark生态系统专栏收录该内容

24 篇文章 0 订阅

订阅专栏

/**
  * Entry point of the example: creates a local Spark context named
  * "ScalaDataFrame", parallelizes three sample records of the form
  * "name age height" and runs the temporary-table SQL example on them.
  *
  * @param args command-line arguments (not used)
  */
def main(args: Array[String]): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("ScalaDataFrame")
  val sc = new SparkContext(conf)
  try {
    val sqlContext = new SQLContext(sc)
    val listRDD = sc.parallelize(List("zhangsan 13 168.5", "lisi 14 175.3", "wangwu 15 176.3"))

    // The other ways of building a DataFrame; enable the one you want to try.
    //    createDataFrame_1(listRDD, sqlContext)
    //    createDataFrame_2(listRDD, sqlContext)
    //    createDataFrame_3(listRDD, sqlContext)
    createDataFrame_4(listRDD, sqlContext)
  } finally {
    // Stop the context even when the example above throws, so it is never leaked.
    sc.stop()
  }
}

/**
  * Registers the DataFrame produced by `createDataFrame_3` as the temporary
  * table "person", queries it with SQL and prints the rows whose age is
  * greater than 14.
  *
  * @param listRDD    lines of the form "name age height", separated by single spaces
  * @param sqlContext context used both to build the DataFrame and to run the query
  */
def createDataFrame_4(listRDD: RDD[String], sqlContext: SQLContext): Unit = {
  // SQL statements address tables, so the DataFrame has to be registered as a
  // temporary table before it can be queried.
  // NOTE(review): registerTempTable is deprecated since Spark 2.0 in favour of
  // createOrReplaceTempView; it is kept here because the Spark version used by
  // this project is not visible from this file — confirm before switching.
  val df = createDataFrame_3(listRDD, sqlContext)
  df.registerTempTable("person")

  // Once registered, the query is issued through the SQLContext, not through df.
  sqlContext.sql("select name, age, height from person where age > 14").show()
}

/**
  * Turns an RDD of text lines into a DataFrame by means of the implicit
  * conversions provided by the SQLContext.
  *
  * @param listRDD    lines of the form "name age height", separated by single spaces
  * @param sqlContext context whose implicits supply the `toDF` conversion
  * @return a DataFrame with the columns "name", "age" and "height"
  */
def createDataFrame_3(listRDD: RDD[String], sqlContext: SQLContext): DataFrame = {
  // This import is mandatory: it is what adds toDF to an ordinary RDD.
  import sqlContext.implicits._

  val people = listRDD.map { record =>
    val fields = record.split(" ").map(_.trim)
    (fields(0), fields(1).toInt, fields(2).toDouble)
  }

  // rdd.toDF(cols: String*) names the columns of the resulting DataFrame,
  // one name per tuple element.
  people.toDF("name", "age", "height")
}

/**
  * Builds a DataFrame from an RDD of `Person` objects and prints its schema
  * followed by its rows.
  *
  * @param listRDD    lines of the form "name age height", separated by single spaces
  * @param sqlContext context used to create the DataFrame
  */
def createDataFrame_2(listRDD: RDD[String], sqlContext: SQLContext): Unit = {
  val people: RDD[Person] = listRDD.map { record =>
    val fields = record.split(" ").map(_.trim)
    new Person(fields(0), fields(1).toInt, fields(2).toDouble)
  }

  // The RDD is converted by handing Spark the Person class itself. Scala obtains
  // the Class object with classOf[Person], where Java would write Person.class.
  val peopleDF = sqlContext.createDataFrame(people, classOf[Person])
  peopleDF.printSchema()
  peopleDF.show()
}

/**
  * Builds a DataFrame programmatically from an RDD of `Row`s plus an
  * explicitly declared schema, then prints its rows.
  *
  * @param listRDD    lines of the form "name age height", separated by single spaces
  * @param sqlContext context used to create the DataFrame
  */
def createDataFrame_1(listRDD: RDD[String], sqlContext: SQLContext): Unit = {
  // The schema is declared in code: three nullable columns.
  val schema = StructType(Array(
    StructField("name", DataTypes.StringType, nullable = true),
    StructField("age", DataTypes.IntegerType, nullable = true),
    StructField("height", DataTypes.DoubleType, nullable = true)
  ))

  // RDD[String] -> RDD[Row], one Row per input line.
  val rows: RDD[Row] = listRDD.map { record =>
    val fields = record.split(" ").map(_.trim)
    Row(fields(0), fields(1).toInt, fields(2).toDouble)
  }

  sqlContext.createDataFrame(rows, schema).show()
}

维维weiwei

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
SparkSQL之创建DataFrame

///定义一个RDDJavaRDD listRDD = sc.parallelize(Arrays.asList( new Person("张三", 13, 168.8), new Person("李四", 14, 169.8), new Person("王五", 15, 175.8), new Person("赵六", 16, 1
复制链接

扫一扫