Two Ways to Convert an RDD to a DataFrame

Overview

Approach 1: reflection

Converting an RDD to a DataFrame
(1) Reflection: infer the schema of an RDD that contains objects of a specific type.
   Condition: the schema is known in advance, i.e., the columns and their types.
   Characteristics: simple and concise.
   Note: the case class must not share a scope with the SparkContext; define it outside main, or outside the object entirely (see the sketch after this list).

(2) Programmatic interface.
   Condition: the columns and their types are not known until runtime.
   Characteristics: lets you construct a schema explicitly and apply it to an existing RDD; more verbose.
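
A minimal sketch of the scope rule in note (1), assuming Spark 1.x: the case class used for reflection must be declared at the top level of the source file, not inside main where the SparkContext lives, otherwise toDF fails with a missing-TypeTag error. The names below are illustrative:

// Correct: declared at the top level, outside the object that owns main
case class People(name: String, age: Int)

object ReflectionScopeSketch {
  def main(args: Array[String]): Unit = {
    // Wrong: declaring the case class here, inside main, breaks
    // reflection-based schema inference
    // case class People(name: String, age: Int)
  }
}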

people.txt 

Michael, 29
Andy, 30
Justin, 19
john,20
Herry,19
package com.dt.spark.main.DataFrameToRDDLearn
 
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created on 16/7/17.
  */
 
// Define the case class; its column names and types are known up front
 
case class People(name:String, age:Int)
 
object DataFrameToRDD_1 {
  def main(args: Array[String]) {
 
    val conf = new SparkConf()
    conf.setAppName("test")
    conf.setMaster("local")
 
    val sc = new SparkContext(conf)
 
    // Set the log level
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark.sql").setLevel(Level.WARN)
 
 
 
    val sqlContext = new HiveContext(sc)
 
    //==========================================
    /*
    Convert an RDD to a DataFrame via reflection
     */
    val people = sc.textFile("./src/com/dt/spark/main/DataFrameToRDDLearn/srcFile/people.txt")
 
    // Import the implicit conversions that provide toDF
 
    import sqlContext.implicits._
 
    val peopleDF = people.map(_.split(",")).map(p => People(p(0), p(1).trim.toInt)).toDF()
    peopleDF.show()
    //    +-------+---+
    //    |   name|age|
    //    +-------+---+
    //    |Michael| 29|
    //    |   Andy| 30|
    //    | Justin| 19|
    //    |   john| 20|
    //    |  Herry| 19|
    //    +-------+---+
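
    //==========================================
    /*
    Reflection inferred the column types from the People case class.
    A quick check (sketch): printSchema() prints the inferred schema;
    the output below is what Spark 1.x prints for this case class.
     */
    peopleDF.printSchema()
    //    root
    //     |-- name: string (nullable = true)
    //     |-- age: integer (nullable = false)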
 
 
    //==========================================
    /*
    Inspect the relationship between the DataFrame and its underlying RDD
     */
    println(peopleDF.rdd.toDebugString)
 
    //    (1) MapPartitionsRDD[6] at rdd at DataFrameToRDD_1.scala:56 []
    //    |  MapPartitionsRDD[4] at rddToDataFrameHolder at DataFrameToRDD_1.scala:39 []
    //    |  MapPartitionsRDD[3] at map at DataFrameToRDD_1.scala:39 []
    //    |  MapPartitionsRDD[2] at map at DataFrameToRDD_1.scala:39 []
    //    |  MapPartitionsRDD[1] at textFile at DataFrameToRDD_1.scala:34 []
    //    |  ./src/com/dt/spark/main/DataFrameToRDDLearn/srcFile/people.txt HadoopRDD[0] at textFile at DataFrameToRDD_1.scala:34 []
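
    //==========================================
    /*
    The reverse direction (DataFrame -> RDD) is just the .rdd call used
    above: it yields an RDD[Row]. A sketch (nameAgeRDD is an illustrative
    name) that pulls typed values back out of each Row by position, using
    the standard Row accessors getString and getInt:
     */
    val nameAgeRDD = peopleDF.rdd.map(row => (row.getString(0), row.getInt(1)))
    nameAgeRDD.collect().foreach(println)
    //    (Michael,29)
    //    (Andy,30)
    //    ... one tuple per input line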
 
 
    //==========================================
    /*
    Register a temporary table so SQL can be run against the DataFrame
     */
 
    peopleDF.registerTempTable("peopletable")
 
    //==========================================
    /*
    Show the databases visible to this context
     */
 
    sqlContext.sql("show databases").show()
    //    +-------+
    //    | result|
    //    +-------+
    //    |default|
    //    +-------+
 
    //==========================================
    /*
    List all table names in the current database
     */
    sqlContext.tableNames().foreach(println(_))
    // peopletable
 
    //==========================================
    /*
    Query with SQL
     */
    val teenagers = sqlContext.sql("select name from peopletable where age >= 13 and age <= 19")
    teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
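    // Expected output given people.txt above (Justin and Herry are both 19):
    //    Name: Justin
    //    Name: Herry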
 
 
 
    sc.stop()
 
  }
 
}

Approach 2: programmatic schema

package com.dt.spark.main.DataFrameToRDDLearn
 
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types.{StructField, StringType, StructType}
import org.apache.spark.{SparkConf, SparkContext}
 
 
object DataFrameToRDD_2 {
  def main(args: Array[String]) {
 
    val conf = new SparkConf()
    conf.setAppName("test")
    conf.setMaster("local")
 
    val sc = new SparkContext(conf)
 
    // Set the log level
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark.sql").setLevel(Level.WARN)
 
 
 
    val sqlContext = new HiveContext(sc)
 
    //==========================================
    /*
    Convert an RDD to a DataFrame by programmatically specifying the schema
     */
    val people = sc.textFile("./src/com/dt/spark/main/DataFrameToRDDLearn/srcFile/people.txt")
 
    // No toDF implicits are needed here; the DataFrame is built with createDataFrame instead
 
    //==========================================
    /*
    Specify the column names the schema will contain
     */
    val schemaString = "name age"
 
    //==========================================
    /*
    Build the schema: one nullable StringType field per column name
     */
    val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
 
 
    //==========================================
    /*
    Convert each record in the RDD to a Row instance
     */
    val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
 
    val peopleDF = sqlContext.createDataFrame(rowRDD,schema)
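
    //==========================================
    /*
    Note: schemaString types every column as StringType, so age is stored
    as a string (the SQL comparison below still works because Spark SQL
    casts implicitly). A sketch of typing age as an integer instead; the
    typed* names are illustrative:
     */
    import org.apache.spark.sql.types.IntegerType
    val typedSchema = StructType(Seq(
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))
    val typedRowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim.toInt))
    val typedPeopleDF = sqlContext.createDataFrame(typedRowRDD, typedSchema)
    typedPeopleDF.printSchema()
    //    root
    //     |-- name: string (nullable = true)
    //     |-- age: integer (nullable = true)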
 
    peopleDF.registerTempTable("peopletablefromschema")
 
    //==========================================
    /*
    List all table names in the current database
     */
    sqlContext.tableNames().foreach(println(_))
    // peopletablefromschema
 
    //==========================================
    /*
    Query with SQL
     */
    val teenagers = sqlContext.sql("select name from peopletablefromschema where age >= 13 and age <= 19")
    teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
    //    Name: Justin
    //    Name: Herry
 
    sc.stop()
 
  }
 
}

 
