json 文件如下:
一.读取json文件加载DataFrame
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
class dataframetest {
}

object dataframetest {

  /** Entry point: loads a JSON file into a DataFrame and prints its contents.
    *
    * @param args optional; args(0) overrides the default JSON file path
    */
  def main(args: Array[String]): Unit = {
    // Since Spark 2.0, SparkSession supersedes SparkConf/SparkContext/SQLContext.
    // Building the session directly avoids creating a redundant SparkContext
    // first (the original also used two different, inconsistent app names).
    val spark = SparkSession.builder()
      .appName("Spark SQL DataFrame Operations")
      .master("local")
      .getOrCreate()

    // Generalized: the input path may be supplied on the command line;
    // the original hard-coded path remains the default for compatibility.
    val path = args.headOption.getOrElse("/disk4/workspaceidea/examplespark/people.json")

    val dataFrame = spark.read.json(path)
    dataFrame.show()

    // Release the session's resources when the job is done.
    spark.stop()
  }
}
结果如下:
二.写sql来加载DataFrame
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
class dataframetest {
}

object dataframetest {

  /** Entry point: loads a JSON file into a DataFrame, registers it as a
    * temporary view, and runs a SQL query against it.
    *
    * @param args optional; args(0) overrides the default JSON file path
    */
  def main(args: Array[String]): Unit = {
    // Since Spark 2.0, SparkSession supersedes SparkConf/SparkContext/SQLContext,
    // so the separate SparkContext the original created is unnecessary.
    val spark = SparkSession.builder()
      .appName("Spark SQL DataFrame Operations")
      .master("local")
      .getOrCreate()

    val path = args.headOption.getOrElse("/disk4/workspaceidea/examplespark/people.json")
    val dataFrame1 = spark.read.json(path)

    // Register the DataFrame as a temporary view.
    // registerTempTable is deprecated since Spark 2.0; use
    // createOrReplaceTempView, which also avoids failing if "t1" exists.
    dataFrame1.createOrReplaceTempView("t1")

    val dataFrame2 = spark.sql("select csid,manufacturer,bid,brand from t1 where bid>2")
    dataFrame2.show()

    // Release the session's resources when the job is done.
    spark.stop()
  }
}
结果如下:
一二区别:用 sqlContext 读取 json 文件加载 DataFrame 时,DataFrame 的列会按照 ASCII 码排序。
三.读取json格式的RDD加载DataFrame
两者可以互相转换。
DataFrame 转成 RDD:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
class dataframetest {
}

object dataframetest {

  /** Entry point: loads a JSON file into a DataFrame, converts it to an RDD
    * of Rows, and prints the first three rows.
    *
    * @param args optional; args(0) overrides the default JSON file path
    */
  def main(args: Array[String]): Unit = {
    // Since Spark 2.0, SparkSession supersedes SparkConf/SparkContext/SQLContext,
    // so the separate SparkContext the original created is unnecessary.
    val spark = SparkSession.builder()
      .appName("Spark SQL DataFrame Operations")
      .master("local")
      .getOrCreate()

    val path = args.headOption.getOrElse("/disk4/workspaceidea/examplespark/people.json")
    val dataFrame1 = spark.read.json(path)

    // DataFrame -> RDD[Row] via the .rdd accessor.
    val rdd = dataFrame1.rdd
    rdd.take(3).foreach(println)

    // Release the session's resources when the job is done.
    spark.stop()
  }
}
结果如下:
RDD 转成 DataFrame:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
class dataframetest {
}

object dataframetest {

  /** Entry point: builds a DataFrame from an in-memory collection of JSON
    * strings and prints its contents.
    */
  def main(args: Array[String]): Unit = {
    // Since Spark 2.0, SparkSession supersedes SparkConf/SparkContext/SQLContext;
    // its underlying SparkContext is available as spark.sparkContext.
    val spark = SparkSession.builder()
      .appName("Spark SQL DataFrame Operations")
      .master("local")
      .getOrCreate()
    import spark.implicits._ // Encoder[String] needed by createDataset

    val jsonRdd = spark.sparkContext.parallelize(Seq(
      """{"name":"zhangsan","age":18}""",
      """{"name":"lisi","age":19}""",
      """{"name":"wangwu","age":20}"""))

    // DataFrameReader.json(RDD[String]) is deprecated since Spark 2.0;
    // wrap the RDD in a Dataset[String] instead.
    val dataFrame = spark.read.json(spark.createDataset(jsonRdd))
    dataFrame.show()
    //dataFrame.select("age").show(2)

    // Release the session's resources when the job is done.
    spark.stop()
  }
}
结果如下:
先写这些