Interoperating Between DataFrames and RDDs
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Interoperation between DataFrames and RDDs.
 */
object DataFrameRDDApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate()
    //inferReflection(spark)
    program(spark)
    spark.stop()
  }

  def program(spark: SparkSession): Unit = {
    // RDD ==> DataFrame, programmatically: attach an explicit schema via createDataFrame
    val rdd = spark.sparkContext.textFile("file:///Users/data/infos.txt")
    val infoRDD = rdd.map(_.split(",")).map(line => Row(line(0).toInt, line(1), line(2).toInt))
    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))
    val infoDF = spark.createDataFrame(infoRDD, structType)
    infoDF.printSchema()
    infoDF.show()

    // Query via the DataFrame API
    infoDF.filter(infoDF.col("age") > 30).show

    // Query via SQL
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where age > 30").show()
  }

  def inferReflection(spark: SparkSession): Unit = {
    // RDD ==> DataFrame, by reflection: map each row to a case class, then toDF()
    val rdd = spark.sparkContext.textFile("file:///Users/data/infos.txt")
    // Note: the implicit conversions must be imported for toDF()
    import spark.implicits._
    val infoDF = rdd.map(_.split(",")).map(line => Info(line(0).toInt, line(1), line(2).toInt)).toDF()
    infoDF.show()
    infoDF.filter(infoDF.col("age") > 30).show
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where age > 30").show()
  }

  case class Info(id: Int, name: String, age: Int)
}
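If infos.txt is not at hand, the same shape of DataFrame can be built from an in-memory collection. A minimal sketch, assuming a SparkSession named spark (the sample rows are made up):

// A minimal sketch: build the DataFrame from a Seq instead of a file
// (assumes a SparkSession named spark; rows are made-up sample data)
import spark.implicits._
val testDF = Seq((1, "zhangsan", 28), (2, "lisi", 35)).toDF("id", "name", "age")
testDF.filter($"age" > 30).show()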
DataFrame API Operations
import org.apache.spark.sql.SparkSession

/**
 * Basic DataFrame API operations.
 */
object DataFrameApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameApp").master("local[2]").getOrCreate()

    // Load a JSON file into a DataFrame
    val peopleDF = spark.read.format("json").load("file:///Users/data/people.json")

    // Print the DataFrame's schema
    peopleDF.printSchema()

    // Show the first 20 rows of the dataset
    peopleDF.show()

    // Select all values of one column: select name from table
    peopleDF.select("name").show()

    // Select several columns and compute on one: select name, age+10 as age2 from table
    peopleDF.select(peopleDF.col("name"), (peopleDF.col("age") + 10).as("age2")).show()

    // Filter rows by a column's value: select * from table where age > 19
    peopleDF.filter(peopleDF.col("age") > 19).show()

    // Group by a column, then aggregate: select age, count(1) from table group by age
    peopleDF.groupBy("age").count().show()

    spark.stop()
  }
}
import org.apache.spark.sql.SparkSession

/**
 * Common operations on DataFrames.
 */
object DataFrameCase {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameCase").master("local[2]").getOrCreate()

    // RDD ==> DataFrame
    val rdd = spark.sparkContext.textFile("file:///Users/data/student.data")

    // Note: the implicit conversions must be imported for toDF()
    import spark.implicits._
    val studentDF = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF()

    // show displays only the first 20 rows by default
    studentDF.show
    studentDF.show(30)
    studentDF.show(30, false) // false disables truncation of long values

    studentDF.take(10)
    studentDF.first()
    studentDF.head(3)

    studentDF.select("email").show(30, false)

    studentDF.filter("name=''").show
    studentDF.filter("name='' OR name='NULL'").show

    // Students whose name starts with M
    studentDF.filter("SUBSTR(name,0,1)='M'").show

    studentDF.sort(studentDF("name")).show
    studentDF.sort(studentDF("name").desc).show
    studentDF.sort("name", "id").show
    studentDF.sort(studentDF("name").asc, studentDF("id").desc).show

    studentDF.select(studentDF("name").as("student_name")).show

    // Self-join on id
    val studentDF2 = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF()
    studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id")).show

    spark.stop()
  }

  case class Student(id: Int, name: String, phone: String, email: String)
}
Dataset Operations
import org.apache.spark.sql.SparkSession

/**
 * Dataset operations.
 */
object DatasetApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DatasetApp")
      .master("local[2]").getOrCreate()

    // Note: the implicit conversions must be imported for as[] and its encoders
    import spark.implicits._

    val path = "file:///Users/data/sales.csv"

    // How Spark parses a CSV file:
    // header=true tells Spark the file has a header row,
    // inferSchema=true lets Spark infer the column types
    val df = spark.read.option("header", "true").option("inferSchema", "true").csv(path)
    df.show

    // Convert the DataFrame into a typed Dataset
    val ds = df.as[Sales]
    ds.map(line => line.itemId).show

    // Error-detection timing across the three APIs:
    // SQL: a typo such as "seletc" is caught only at runtime
    // spark.sql("seletc name from person").show
    // DataFrame: a misspelled method fails at compile time...
    // df.seletc("name")
    // ...but a misspelled column name ("nname") is caught only at runtime
    // df.select("nname")
    // Dataset: a misspelled field name is caught at compile time
    ds.map(line => line.itemId)

    spark.stop()
  }

  case class Sales(transactionId: Int, customerId: Int, itemId: Int, amountPaid: Double)
}
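Because a Dataset is typed, transformations can take ordinary Scala functions instead of column names. A minimal sketch, assuming the Sales case class and the ds value from DatasetApp above (the threshold is hypothetical):

// Typed transformations are checked at compile time against the Sales fields
val bigSales = ds.filter(sale => sale.amountPaid > 100.0) // hypothetical threshold
bigSales.map(_.customerId).show()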
Accessing External Data Sources
The external data source API makes it easy to load data from different sources (JSON, Parquet, an RDBMS), process them together (e.g. join JSON data with Parquet data), and write the result back in a chosen format (JSON, Parquet) to a target system (HDFS, S3). See the sketch below.
Built-in data sources: json / parquet / jdbc / csv (csv support ships with 2.0+).
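The pattern is the same for every source: read with one format, transform, write with a possibly different format. A minimal sketch, assuming a SparkSession named spark (the paths and join key are hypothetical):

// Read two sources, join them, write the result in another format
val jsonDF = spark.read.format("json").load("file:///tmp/input.json")
val parquetDF = spark.read.format("parquet").load("file:///tmp/input.parquet")
jsonDF.join(parquetDF, jsonDF.col("id") === parquetDF.col("id"))
  .write.format("parquet").save("file:///tmp/output")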
Working with Parquet Data
import org.apache.spark.sql.SparkSession

/**
 * Operations on Parquet files.
 */
object ParquetApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ParquetApp")
      .master("local[2]").getOrCreate()

    // spark.read.format("parquet").load(...) is the standard way to read
    val userDF = spark.read.format("parquet").load("file:///home/hadoop/app/spark-2.1.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet")

    userDF.printSchema()
    userDF.show()

    userDF.select("name", "favorite_color").show

    // Write the selection back out as JSON
    userDF.select("name", "favorite_color").write.format("json").save("file:///home/hadoop/tmp/jsonout")

    // Without an explicit format, load() assumes parquet
    spark.read.load("file:///home/hadoop/app/spark-2.1.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet").show

    // This fails: Spark SQL's default format is parquet, not json
    // spark.read.load("file:///home/hadoop/app/spark-2.1.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/people.json").show

    // The path can also be passed as an option
    spark.read.format("parquet").option("path", "file:///home/hadoop/app/spark-2.1.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet").load().show

    spark.stop()
  }
}
Using Hive and MySQL Data Sources
import org.apache.spark.sql.SparkSession

/**
 * Query Hive and MySQL table data together via the external data source API.
 */
object HiveMySQLApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("HiveMySQLApp")
      .master("local[2]")
      .enableHiveSupport() // required so spark.table can read Hive tables
      .getOrCreate()

    // Load the Hive table
    val hiveDF = spark.table("dept")

    // Load the MySQL table
    val mysqlDF = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://192.168.7.2:3306")
      .option("dbtable", "sparksql.dept")
      .option("user", "root")
      .option("password", "root")
      .option("driver", "com.mysql.jdbc.Driver")
      .load()

    // JOIN the two sources on the department number
    val resultDF = hiveDF.join(mysqlDF, hiveDF.col("deptno") === mysqlDF.col("DEPTNO"))
    resultDF.show

    resultDF.select(hiveDF.col("deptno"), hiveDF.col("dname"),
      mysqlDF.col("DEPTNO"), mysqlDF.col("DNAME")).show

    spark.stop()
  }
}
## Output
+------+------------+----------+------+----------+--------+
|deptno| dname| loc|DEPTNO| DNAME| LOC|
+------+------------+----------+------+----------+--------+
| 20| 'RESEARCH'| 'DALLAS'| 20| RESEARCH| DALLAS|
| 40|'OPERATIONS'| 'BOSTON'| 40|OPERATIONS| BOSTON|
| 10|'ACCOUNTING'|'NEW YORK'| 10|ACCOUNTING|NEW YORK|
| 30| 'SALES'| 'CHICAGO'| 30| SALES| CHICAGO|
+------+------------+----------+------+----------+--------+
Writing Files
The DataFrame.write.save method (for files) or saveAsTable (for tables) writes results back out.
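A minimal sketch of both paths, reusing the userDF from ParquetApp above (the output path and table name are hypothetical); mode controls what happens when the target already exists:

// Write to a file in an explicit format, overwriting any previous output
userDF.write.format("parquet").mode("overwrite").save("file:///home/hadoop/tmp/parquetout")
// saveAsTable persists the DataFrame as a managed table in the metastore
userDF.write.mode("overwrite").saveAsTable("users_backup")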