http://spark.apache.org/docs/latest/sql-getting-started.html
Converting an RDD to a DataFrame/Dataset
This is essential when processing text-format data.
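For reference, both examples below assume a data.txt whose lines hold four comma-separated fields (id, subType, sub, score); a hypothetical sample:

1,100,math,90
2,101,english,85
3,100,math,76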
Approach 1: reflection (inferring the schema from a case class)
// RDD to DataFrame, approach 1: reflection
import org.apache.spark.sql.SparkSession

object DataFrameRDDAPP {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local")
      .appName("DataFrameRDDAPP")
      .getOrCreate()

    // Implicit conversions (e.g. toDF()) required to turn an RDD into a DataFrame
    import spark.implicits._

    val rdd = spark.sparkContext.textFile("file:///D:\\bigdata\\ruozedata-spark\\ruozedata-spark-sql\\data\\data.txt")
    val infoDf = rdd.map(x => {
      val strings = x.split(",")
      val id = strings(0).trim.toInt
      val subType = strings(1).trim.toInt
      val sub = strings(2)
      val score = strings(3).trim.toInt
      info(id, subType, sub, score)
    }).toDF()

    infoDf.printSchema()
    infoDf.show()

    spark.stop()
  }

  // The case class fields define the DataFrame schema via reflection
  case class info(id: Int, subType: Int, sub: String, score: Int)
}
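The title also covers Dataset: with spark.implicits._ in scope, the same DataFrame converts to a strongly typed Dataset. A minimal sketch, reusing infoDf and the info case class above:

// Hypothetical follow-up inside main: DataFrame ==> Dataset[info]
val infoDs = infoDf.as[info]
infoDs.filter(_.score > 80).show()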
Approach 2: programmatically specifying the schema. When integrating external data sources, this is definitely the approach to choose, because the schema is usually only known at runtime (see the sketch after the steps below). The steps are:
- Create an RDD of Rows from the original RDD;
- Create the schema represented by a StructType matching the structure of Rows in the RDD created in Step 1;
- Apply the schema to the RDD of Rows via the createDataFrame method provided by SparkSession.
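The strength of the programmatic approach is that the schema does not have to be hard-coded. A minimal sketch, following the pattern in the Spark documentation, that derives a StructType from a runtime string (the schema string here is a made-up example):

// Hypothetical: build the schema from a string available only at runtime
val schemaString = "id subType sub score"
val fields = schemaString.split(" ")
  .map(name => StructField(name, StringType, nullable = true))
val schema = StructType(fields)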
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

object DataFrameRDDAPP {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local")
      .appName("DataFrameRDDAPP")
      .getOrCreate()

    programmatically(spark)

    spark.stop()
  }

  def programmatically(spark: SparkSession): Unit = {
    // STEP 1: create the RDD of Rows: RDD[String] ==> RDD[Row]
    val rdd = spark.sparkContext.textFile("file:///D:\\bigdata\\ruozedata-spark\\ruozedata-spark-sql\\data\\data.txt")
    val rowRDD: RDD[Row] = rdd.map(x => {
      val strings = x.split(",")
      val id = strings(0).trim.toInt
      val subType = strings(1).trim.toInt
      val sub = strings(2)
      val score = strings(3).trim.toInt
      Row(id, subType, sub, score)
    })

    // STEP 2: define the schema as a StructType matching the Rows
    val struct = StructType(
      StructField("id", IntegerType, true) ::
        StructField("subType", IntegerType, false) ::
        StructField("sub", StringType, false) ::
        StructField("score", IntegerType, false) :: Nil)

    // STEP 3: apply the schema to the RDD of Rows to create the DataFrame
    val df = spark.createDataFrame(rowRDD, struct)
    df.show()
  }
}
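Once the DataFrame exists, it can also be queried with SQL through a temporary view; a minimal sketch that could be appended inside programmatically (view and column names assumed from the example above):

// Hypothetical: expose the DataFrame to SQL via a temp view
df.createOrReplaceTempView("info")
spark.sql("SELECT sub, avg(score) AS avg_score FROM info GROUP BY sub").show()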
Joining data across data sources
--------------
Spark 1.x had two entry points, SQLContext and HiveContext; since 2.x there is a single entry point, SparkSession.
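With SparkSession as the single entry point, reading different sources and joining them is uniform. A minimal sketch with hypothetical file paths, joining a JSON source to a Parquet source on an assumed id column:

// Hypothetical paths and join key; illustrates a cross-source join
val jsonDF = spark.read.json("file:///path/to/people.json")
val parquetDF = spark.read.parquet("file:///path/to/scores.parquet")
jsonDF.join(parquetDF, "id").show()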
How to build an external data source