加载外部数据源 users.txt:
anne 22 NY
joe 39 CO
alison 35 NY
mike 69 VA
marie 27 OR
jim 21 OR
bob 71 CA
mary 53 NY
dave 36 VA
dude 50 CA
通过自定义方式创建DataFrame:
1.从原来的RDD创建一个Row格式的RDD
2.创建与RDD中Rows结构匹配的StructType,通过该StructType创建表示RDD的Schema
3.通过SparkSession提供的createDataFrame方法创建DataFrame,方法参数为Row格式的RDD及与其匹配的Schema
案例说明:
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
object sparkSqlTest {
  // Entry point: builds a DataFrame by pairing a Row RDD with an explicit
  // StructType schema, registers it as a temp view, and queries it with SQL.
  def main(args: Array[String]): Unit = {
    val sparksession = SparkSession.builder().appName("sparkSQL").master("local").getOrCreate()
    // Raw lines of the form "name age address" (space-separated).
    val rdd = sparksession.sparkContext.textFile("file:///d:/测试数据/users.txt")

    //---------- Official-documentation variant (all-String schema) ----------
    // Kept as a commented-out reference only: defining these vals alongside
    // the typed variant below would be duplicate `val` definitions in the
    // same scope and the file would not compile.
    //val rdd_row = rdd.map(x => x.split(" ")).map(x => Row(x(0), x(1), x(2)))
    //val schemaString = "name age address"
    //val fields = schemaString.split(" ")
    //  .map(fieldName => StructField(fieldName, StringType, nullable = true))
    //val schema = StructType(fields)

    //---------- Typed-schema variant (age as Int) ----------
    // step 1: convert each raw line into a Row whose field types match the
    // schema below (age parsed to Int; assumes well-formed input lines).
    val rdd_row = rdd.map(x => x.split(" ")).map(x => Row(x(0), x(1).toInt, x(2)))
    // step 2: a StructType matching the Rows (name/address String, age Int)
    val fields = List(
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true),
      StructField("address", StringType, nullable = true)
    )
    val schema = StructType(fields)
    // step 3: create the DataFrame from the Row RDD plus its schema
    val rdd_df = sparksession.createDataFrame(rdd_row, schema)
    // Show the table contents.
    rdd_df.show()

    // Register a temporary view so the DataFrame can be queried via SQL.
    // NOTE: in the original this call was accidentally buried inside a
    // comment, so the query below failed with "Table or view not found: people".
    rdd_df.createOrReplaceTempView("people")
    // SQL can be run over a temporary view created using DataFrames.
    val results = sparksession.sql("SELECT name,age,address FROM people")

    // The SQL result is a DataFrame supporting normal RDD-style operations;
    // columns of a result row are accessible by index or by field name.
    import sparksession.implicits._
    results.map { attributes =>
      "datas: " + (attributes(0), attributes(1), attributes.getAs[String]("address"))
    }.show()
  }
}
------加载外部数据源:
1.Json格式
测试代码:数据说明,数据来自高德地图关于小学附近的Pois(数据),数据源为sample1.json
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("sparkSQL").master("local").getOrCreate()
import spark.implicits._
val df_json = spark.read.json("file:///d:/测试数据/sample1.json")
//加载Json格式后&#