First, specify the field names and types via a StructType, then map the RDD onto that schema to build a DataFrame.
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/**
 * Example: build a DataFrame from an RDD of rows by explicitly declaring a schema
 * with StructType, then query it with Spark SQL.
 *
 * Expects a whitespace-separated resource file `person.txt` on the classpath with
 * three columns per line: id, name, age.
 */
object StructTypeTest01 {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession (entry point for DataFrame/SQL APIs).
    val spark: SparkSession = SparkSession.builder().appName("StructTypeTest01").master("local[*]").getOrCreate()
    // Ensure the session is always released, even if processing fails —
    // the original code leaked it on any exception before spark.stop().
    try {
      // 2. Obtain the underlying SparkContext for RDD operations.
      val sc: SparkContext = spark.sparkContext
      sc.setLogLevel("WARN") // Log4j level names are conventionally uppercase.

      // 3. Read the resource file and split each line on single spaces.
      //    NOTE(review): getResource(...).getPath breaks for paths inside jars
      //    or containing spaces — acceptable for a local demo, verify for prod.
      val data: RDD[Array[String]] =
        sc.textFile(this.getClass.getClassLoader.getResource("person.txt").getPath).map(x => x.split(" "))

      // 4. Map each split line to a Row matching the schema below.
      //    x(2).toInt will throw NumberFormatException on malformed input.
      val rowRDD: RDD[Row] = data.map(x => Row(x(0), x(1), x(2).toInt))

      // 5. Declare the schema explicitly; nullable flags spelled out for clarity
      //    (true is also StructField's default).
      val schema: StructType = StructType(
        StructField("id", StringType, nullable = true) ::
        StructField("name", StringType, nullable = true) ::
        StructField("age", IntegerType, nullable = true) :: Nil
      )

      // 6. Combine rows and schema into a DataFrame.
      val personDF: DataFrame = spark.createDataFrame(rowRDD, schema)
      println("******************************************")
      personDF.printSchema()
      personDF.show()

      // Register a session-scoped view and query it with SQL.
      personDF.createTempView("person")
      spark.sql("select * from person order by age").show()
      println("******************************************")
    } finally {
      spark.stop()
    }
  }
}
–The End–