第6章 DataFrame&Dataset

6-1 -课程目录

 

6-2 -DataFrame产生背景

DataFrame并不是Spark SQL首先提出的,早期在R、Pandas等语言/库中就已经存在了。

6-3 -DataFrame概述

 

6-4 -DataFrame和RDD的对比

 

RDD:

java/scala==>jvm

python==>python runtime

DataFrame

java/scala/python==>Logical Plan(逻辑执行计划)

 

6-5 -DataFrame基本API操作

 

参考代码:

https://gitee.com/sag888/big_data/blob/master/%E4%BB%A5%E6%85%95%E8%AF%BE%E7%BD%91%E6%97%A5%E5%BF%97%E5%88%86%E6%9E%90%E4%B8%BA%E4%BE%8B%20%E8%BF%9B%E5%85%A5%E5%A4%A7%E6%95%B0%E6%8D%AE%20Spark%20SQL%20%E7%9A%84%E4%B8%96%E7%95%8C/project/p1867y/ImoocSparkSQLProject/src/main/scala/com/imooc/spark/DataFrameApp.scala

package com.imooc.spark

import org.apache.spark.sql.SparkSession

/**
 * Basic DataFrame API operations: load a JSON file into a DataFrame and
 * exercise printSchema / show / select / filter / groupBy.
 */
object DataFrameApp {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder()
      .appName("DataFrameApp")
      .master("local[2]")
      .getOrCreate()

    try {
      // Load a JSON file into a DataFrame.
      val peopleDF = spark.read.format("json").load("file:///Users/rocky/data/people.json")

      // Print the schema inferred from the JSON data.
      peopleDF.printSchema()

      // Show the first 20 rows (the default).
      peopleDF.show()

      // Select a single column: select name from table
      peopleDF.select("name").show()

      // Select several columns and compute a derived one:
      // select name, age + 10 as age2 from table
      peopleDF.select(peopleDF.col("name"), (peopleDF.col("age") + 10).as("age2")).show()

      // Filter rows by a column value: select * from table where age > 19
      peopleDF.filter(peopleDF.col("age") > 19).show()

      // Group and aggregate: select age, count(1) from table group by age
      peopleDF.groupBy("age").count().show()
    } finally {
      // Guarantee the SparkSession is released even if an action above fails.
      spark.stop()
    }
  }
}

 

 

6-6 -DataFrame与RDD互操作方式一

 

源码地址:

 

https://gitee.com/sag888/big_data/blob/master/%E4%BB%A5%E6%85%95%E8%AF%BE%E7%BD%91%E6%97%A5%E5%BF%97%E5%88%86%E6%9E%90%E4%B8%BA%E4%BE%8B%20%E8%BF%9B%E5%85%A5%E5%A4%A7%E6%95%B0%E6%8D%AE%20Spark%20SQL%20%E7%9A%84%E4%B8%96%E7%95%8C/project/p1867y/ImoocSparkSQLProject/src/main/scala/com/imooc/spark/DataFrameRDDApp.scala

源码:

 

package com.imooc.spark

import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}

import org.apache.spark.sql.{Row, SparkSession}

/**
 * Interoperation between DataFrames and RDDs, shown two ways:
 * programmatically with an explicit StructType schema ([[program]]) and by
 * reflection over a case class ([[inferReflection]]).
 */
object DataFrameRDDApp {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder()
      .appName("DataFrameRDDApp")
      .master("local[2]")
      .getOrCreate()

    try {
      //inferReflection(spark)
      program(spark)
    } finally {
      // Guarantee the SparkSession is released even if a job above fails.
      spark.stop()
    }
  }

  /**
   * RDD ==> DataFrame by building generic Rows and attaching an explicitly
   * constructed schema.
   *
   * @param spark active SparkSession used to read the file and build the DataFrame
   */
  def program(spark: SparkSession): Unit = {

    val rdd = spark.sparkContext.textFile("file:///Users/rocky/data/infos.txt")

    // Each input line is "id,name,age"; map it to an untyped Row.
    val infoRDD = rdd.map(_.split(",")).map(line => Row(line(0).toInt, line(1), line(2).toInt))

    // Schema declared programmatically; every field is nullable.
    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))

    val infoDF = spark.createDataFrame(infoRDD, structType)

    infoDF.printSchema()
    infoDF.show()

    // Query through the DataFrame API.
    infoDF.filter(infoDF.col("age") > 30).show

    // Query through SQL against a temporary view.
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where age > 30").show()
  }

  /**
   * RDD ==> DataFrame with the schema inferred by reflection from the
   * [[Info]] case class.
   *
   * @param spark active SparkSession providing the implicits for .toDF()
   */
  def inferReflection(spark: SparkSession): Unit = {

    val rdd = spark.sparkContext.textFile("file:///Users/rocky/data/infos.txt")

    // Note: the implicit conversions are required for .toDF()
    import spark.implicits._

    val infoDF = rdd.map(_.split(",")).map(line => Info(line(0).toInt, line(1), line(2).toInt)).toDF()

    infoDF.show()
    infoDF.filter(infoDF.col("age") > 30).show

    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where age > 30").show()
  }

  // Schema carrier for reflection-based DataFrame creation.
  case class Info(id: Int, name: String, age: Int)
}

 

6-7 -DataFrame与RDD互操作方式二

 

源码地址

 

https://gitee.com/sag888/big_data/blob/master/%E4%BB%A5%E6%85%95%E8%AF%BE%E7%BD%91%E6%97%A5%E5%BF%97%E5%88%86%E6%9E%90%E4%B8%BA%E4%BE%8B%20%E8%BF%9B%E5%85%A5%E5%A4%A7%E6%95%B0%E6%8D%AE%20Spark%20SQL%20%E7%9A%84%E4%B8%96%E7%95%8C/project/p1867y/ImoocSparkSQLProject/src/main/scala/com/imooc/spark/DataFrameRDDApp.scala

 

源码:

package com.imooc.spark

import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType}

import org.apache.spark.sql.{Row, SparkSession}

/**
 * Demonstrates converting an RDD into a DataFrame, either by supplying the
 * schema programmatically or by reflecting over a case class.
 */
object DataFrameRDDApp {

  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName("DataFrameRDDApp")
      .master("local[2]")
      .getOrCreate()

    //inferReflection(session)
    program(session)

    session.stop()
  }

  /** RDD ==> DataFrame via an explicitly constructed StructType schema. */
  def program(spark: SparkSession): Unit = {
    val lines = spark.sparkContext.textFile("file:///Users/rocky/data/infos.txt")

    // "id,name,age" per input line -> untyped Row
    val rowRDD = lines.map(_.split(",")).map { parts =>
      Row(parts(0).toInt, parts(1), parts(2).toInt)
    }

    // All three fields are declared nullable.
    val fields = Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true))
    val schema = StructType(fields)

    val infoDF = spark.createDataFrame(rowRDD, schema)

    infoDF.printSchema()
    infoDF.show()

    // Same query, DataFrame-API style ...
    infoDF.filter(infoDF.col("age") > 30).show

    // ... and SQL style, through a temporary view.
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where age > 30").show()
  }

  /** RDD ==> DataFrame with the schema inferred from the Info case class. */
  def inferReflection(spark: SparkSession): Unit = {
    val lines = spark.sparkContext.textFile("file:///Users/rocky/data/infos.txt")

    // The session implicits are needed for .toDF()
    import spark.implicits._

    val infoDF = lines
      .map(_.split(","))
      .map(parts => Info(parts(0).toInt, parts(1), parts(2).toInt))
      .toDF()

    infoDF.show()
    infoDF.filter(infoDF.col("age") > 30).show

    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where age > 30").show()
  }

  // Value class whose fields name and type the inferred columns.
  case class Info(id: Int, name: String, age: Int)
}

 

 


6-8 -DataFrame API操作案例实战

 

学生信息统计案例

源文件:student.data

 

源码地址:

https://gitee.com/sag888/big_data/blob/master/%E4%BB%A5%E6%85%95%E8%AF%BE%E7%BD%91%E6%97%A5%E5%BF%97%E5%88%86%E6%9E%90%E4%B8%BA%E4%BE%8B%20%E8%BF%9B%E5%85%A5%E5%A4%A7%E6%95%B0%E6%8D%AE%20Spark%20SQL%20%E7%9A%84%E4%B8%96%E7%95%8C/project/p1867y/ImoocSparkSQLProject/src/main/scala/com/imooc/spark/DataFrameCase.scala

package com.imooc.spark

import org.apache.spark.sql.SparkSession

/**
 * Assorted DataFrame operations over a student data set
 * (pipe-separated id|name|phone|email records in student.data).
 */
object DataFrameCase {

  def main(args: Array[String]): Unit = {

    // Fix: appName previously said "DataFrameRDDApp" (copy-paste error).
    val spark = SparkSession.builder().appName("DataFrameCase").master("local[2]").getOrCreate()

    try {
      // RDD ==> DataFrame
      val rdd = spark.sparkContext.textFile("file:///Users/rocky/data/student.data")

      // Note: the implicit conversions are required for .toDF()
      import spark.implicits._

      val studentDF = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF()

      // show() prints only the first 20 rows by default
      studentDF.show
      studentDF.show(30)
      studentDF.show(30, false) // false = do not truncate long cell values

      studentDF.take(10)
      studentDF.first()
      studentDF.head(3)

      studentDF.select("email").show(30, false)

      // Rows whose name is empty, or empty / the literal string "NULL"
      studentDF.filter("name=''").show
      studentDF.filter("name='' OR name='NULL'").show

      // People whose name starts with 'M'
      studentDF.filter("SUBSTR(name,0,1)='M'").show

      // Sorting variants: column object, descending, by name, mixed directions
      studentDF.sort(studentDF("name")).show
      studentDF.sort(studentDF("name").desc).show
      studentDF.sort("name", "id").show
      studentDF.sort(studentDF("name").asc, studentDF("id").desc).show

      // Column aliasing
      studentDF.select(studentDF("name").as("student_name")).show

      // Self-join on id
      val studentDF2 = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF()
      studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id")).show
    } finally {
      // Guarantee the SparkSession is released even if an action above fails.
      spark.stop()
    }
  }

  // Schema carrier for reflection-based DataFrame creation.
  case class Student(id: Int, name: String, phone: String, email: String)
}

 

 

6-9 -Dataset概述及使用

 

源码地址:

https://gitee.com/sag888/big_data/blob/master/%E4%BB%A5%E6%85%95%E8%AF%BE%E7%BD%91%E6%97%A5%E5%BF%97%E5%88%86%E6%9E%90%E4%B8%BA%E4%BE%8B%20%E8%BF%9B%E5%85%A5%E5%A4%A7%E6%95%B0%E6%8D%AE%20Spark%20SQL%20%E7%9A%84%E4%B8%96%E7%95%8C/project/p1867y/ImoocSparkSQLProject/src/main/scala/com/imooc/spark/DatasetApp.scala

 

package com.imooc.spark

import org.apache.spark.sql.SparkSession

/**
 * Basic Dataset operations: read a CSV file into a DataFrame, convert it to
 * a typed Dataset[Sales], and contrast when SQL / DataFrame / Dataset code
 * detects mistakes (runtime vs compile time).
 */
object DatasetApp {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("DatasetApp")
      .master("local[2]").getOrCreate()

    try {
      // Note: the implicit conversions are required for .as[Sales] and typed map
      import spark.implicits._

      val path = "file:///Users/rocky/data/sales.csv"

      // CSV parsing: treat the first line as a header and infer column types.
      val df = spark.read.option("header", "true").option("inferSchema", "true").csv(path)
      df.show

      // Typed view over the same data.
      val ds = df.as[Sales]
      ds.map(line => line.itemId).show

      // Fix: the following lines were deliberately broken error demos, but as
      // live code they throw at runtime and abort the job before stop():
      //   spark.sql("seletc name from person").show // SQL typo -> fails only at runtime
      //   df.select("nname")                        // unknown column -> fails only at runtime
      //   //df.seletc("name")                       // API typo -> caught at compile time
      // Takeaway: the typed Dataset catches such mistakes at compile time,
      // whereas SQL strings and column-name strings fail only at runtime.
      ds.map(line => line.itemId)
    } finally {
      // Guarantee the SparkSession is released even if an action above fails.
      spark.stop()
    }
  }

  case class Sales(transactionId: Int, customerId: Int, itemId: Int, amountPaid: Double)
}

 

 

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值