Spark XML:明确地指定 schema 读取嵌套结构的 XML

package com.vivo.study.xml
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
object ReadBooksXMLWithNestedArrayStruct {

  /**
   * Reads `data/books_complex.xml` with an explicitly supplied schema via the
   * spark-xml data source and prints selected top-level, nested-struct and
   * nested-array fields of each `<book>` row.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("langjian")
      .getOrCreate()

    // Explicit schema instead of inference: `_id` maps to the `id` attribute
    // of <book>; `otherInfo` and `stores` are nested structs, and
    // `stores.store` is an array of {location, name} structs.
    val customSchema = StructType(Array(
      StructField("_id", StringType, nullable = true),
      StructField("author", StringType, nullable = true),
      StructField("description", StringType, nullable = true),
      StructField("genre", StringType, nullable = true),
      StructField("price", DoubleType, nullable = true),
      StructField("publish_date", StringType, nullable = true),
      StructField("title", StringType, nullable = true),
      StructField("otherInfo", StructType(Array(
        StructField("pagesCount", StringType, nullable = true),
        StructField("language", StringType, nullable = true),
        StructField("country", StringType, nullable = true),
        StructField("address", StructType(Array(
          StructField("addressline1", StringType, nullable = true),
          StructField("city", StringType, nullable = true),
          StructField("state", StringType, nullable = true)
        )))
      ))),
      StructField("stores", StructType(Array(
        StructField("store", ArrayType(
          StructType(Array(
            StructField("location", StringType, nullable = true),
            StructField("name", StringType, nullable = true)
          ))
        ))
      )))
    ))

    try {
      // `spark.read` is the supported entry point; `spark.sqlContext.read`
      // is a legacy pre-2.0 accessor.
      val df = spark.read
        .format("com.databricks.spark.xml")
        .option("rowTag", "book")
        .schema(customSchema)
        .load("data/books_complex.xml")

      df.printSchema()
      df.show()

      df.foreach { row =>
        println(s"${row.getAs[String]("author")},${row.getAs[String]("_id")}")
        // Access nested structs through the public Row API rather than the
        // internal Catalyst class GenericRowWithSchema.
        println(row.getAs[Row]("otherInfo").getAs[String]("country"))
        println(row.getStruct(7).getClass) // field 7 = otherInfo
        // field 8 = stores; its first (only) field is the `store` array.
        val stores = row.getStruct(8).getList[Row](0)
        for (i <- 0 until stores.size) {
          val store = stores.get(i)
          println(s"${store.getAs[String]("name")},${store.getAs[String]("location")}")
        }
      }
    } finally {
      // Always release the local SparkContext, even if the job fails.
      spark.stop()
    }
  }
}

程序打印出的对应 schema 如下:

root
 |-- _id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- price: double (nullable = true)
 |-- publish_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- otherInfo: struct (nullable = true)
 |    |-- pagesCount: string (nullable = true)
 |    |-- language: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- addressline1: string (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |-- stores: struct (nullable = true)
 |    |-- store: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- location: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
<dependency>
     <groupId>com.databricks</groupId>
     <artifactId>spark-xml_2.11</artifactId>
     <version>0.9.0</version>
 </dependency>

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值