package com.spark.sparksql.datasource.scala
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}
import org.apache.spark.sql.{SaveMode, Row, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}
/**
* Created by root on 2017/8/9.
*/
object JSONDataSource {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("JSONDataSource").setMaster("local")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
val studentScoresDF = sqlContext.read.json("students.json")
studentScoresDF.registerTempTable("student_scores")
val goodStudentNamesDF = sqlContext.sql("select name, score from student_scores where score >= 80")
val goodStudentNames = goodStudentNamesDF.map(x => x(0)).collect()
var studentInfoJSONs = List[String]()
studentInfoJSONs = studentInfoJSONs.::("{\"name\":\"Yasaka\",\"age\":18}")
studentInfoJSONs = studentInfoJSONs.::("{\"name\":\"Xuruyun\",\"age\":17}")
studentInfoJSONs = studentInfoJSONs.::("{\"name\":\"Liangyongqi\",\"age\":19}")
val studentInfosRDD = sc.parallelize(studentInfoJSONs)
val studentInfosDF = sqlContext.read.json(studentInfosRDD)
studentInfosDF.registerTempTable("student_infos")
var sql = "select name, age from student_infos where name in ("
var i=0
for(name <- goodStudentNames){
sql += "'" + name + "'"
if (i < goodStudentNames.length - 1){
sql += ","
}
i += 1
}
sql = sql + ")"
val goodStudentInfosDF = sqlContext.sql(sql)
val goodStudentsRDD = studentScoresDF.map(x => (x(0),x(1))).join(goodStudentInfosDF.map(x => (x(0),x(1))))
val goodStudentsRDDRow = goodStudentsRDD.map(x=> Row(x._1.toString, x._2._1.toString.toInt, x._2._2.toString.toInt))
var arr = Array(StructField("name",StringType,true)
,StructField("score",IntegerType,true)
,StructField("age",IntegerType,true))
val structType = StructType(arr)
val goodStudentDF = sqlContext.createDataFrame(goodStudentsRDDRow, structType)
goodStudentDF.write.format("json").mode(SaveMode.Overwrite).save("goodStudentJson")
}
}
Spark---Datasource(JSON)---Scala
最新推荐文章于 2024-10-13 16:12:28 发布