1.本实例主要假设有两个数据源,一个是使用了SparkSQL的API读取JSON文件,另一个数据源来自于自己构造的数据信息(实例中的peopleInformation)。
2.实例演示了JSON文件的读取,和RDD的JOIN操作,以及Schema,RDD和ROW之间的有趣关系;
3.具体实例如下:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.DataTypes
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.RowFactory
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
object SparkSQLWithJoin {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("SparkSQLWithJoin");
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
//1.从json文件中读取json并生成DataFrame
val df = sqlContext.read.json("xxxxxx\\spark-1.6.0-bin-hadoop2.6\\examples\\src\\main\\resources\\peoples.json")
//2.将DataFrame注册成临时表
df.registerTempTable("peopleScores")
//3.执行SparkSQL查找出成绩>90的人的姓名和成绩
val execellentStudentDF = sqlContext.sql("select name,score from peopleScores where score > 90")
//4.将名字提取数组
val execellentStudent = execellentStudentDF.rdd.map(row => row(0)).collect()
//5.模拟jsons文件生成
val peopleInformation = Array("{\"name\":\"Michael\", \"age\":20}","{\"name\":\"Andy\", \"age\":17}","{\"name\":\"Justin\", \"age\":19}","{\"name\":\"zhangsan\", \"age\":25}")
for(item <- peopleInformation) println(item)
//6.将json读进来的Array转换成RDD
val peopleInformationRDD = sc.parallelize(peopleInformation)
//7.将RDD转换成DataFrame
val peopleInformationDF = sqlContext.read.json(peopleInformationRDD)
//8.注册临时表
peopleInformationDF.registerTempTable("peopleInformations")
//9.拼接sql
var sqltext = "select name,age from peopleInformations where name in ("
for(i <- 0 until execellentStudent.size) {
sqltext += "'" + execellentStudent(i) + "'"
if(i < execellentStudent.size - 1) {
sqltext += ","
}
}
sqltext += ")"
println(sqltext)
//10.执行sql提取DataFrame
val execellentNameAgeDF = sqlContext.sql(sqltext)
//11.将execellentNameAgeDF转换成RDD并执行join
val execellentNameAgeRDD = execellentNameAgeDF.rdd.map(row => (row.getString(0),row.getLong(1).toInt))
val execellentNameScoresRDD = execellentStudentDF.rdd.map(row => (row.getString(0),row.getLong(1).toInt))
val resultRDD = execellentNameAgeRDD join execellentNameScoresRDD
val resultRowRDD = resultRDD.map(item => Row(item._1,item._2._1,item._2._1))
import scala.collection.mutable.ArrayBuffer
//12.构建schema
val schema = StructType(StructField("name",StringType,true)::StructField("age",IntegerType,true)::StructField("score",IntegerType,true):: Nil)
val personInfoDF = sqlContext.createDataFrame(resultRowRDD, schema)
personInfoDF.show
val resultList = personInfoDF.rdd.collect()
for(item <- resultList) println(item)
}
}