Spark XML:明确地指定 schema 读取嵌套结构的 XML

package com.vivo.study.xml
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
object ReadBooksXMLWithNestedArrayStruct {

  /**
   * Reads `data/books_complex.xml` with an explicitly supplied schema via the
   * spark-xml data source and prints selected top-level, nested-struct and
   * nested-array fields of each `<book>` row.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("langjian")
      .getOrCreate()

    // Explicit schema instead of inference: `_id` maps to the `id` attribute
    // of <book>; `otherInfo` and `stores` are nested structs, and
    // `stores.store` is an array of {location, name} structs.
    val customSchema = StructType(Array(
      StructField("_id", StringType, nullable = true),
      StructField("author", StringType, nullable = true),
      StructField("description", StringType, nullable = true),
      StructField("genre", StringType, nullable = true),
      StructField("price", DoubleType, nullable = true),
      StructField("publish_date", StringType, nullable = true),
      StructField("title", StringType, nullable = true),
      StructField("otherInfo", StructType(Array(
        StructField("pagesCount", StringType, nullable = true),
        StructField("language", StringType, nullable = true),
        StructField("country", StringType, nullable = true),
        StructField("address", StructType(Array(
          StructField("addressline1", StringType, nullable = true),
          StructField("city", StringType, nullable = true),
          StructField("state", StringType, nullable = true)
        )))
      ))),
      StructField("stores", StructType(Array(
        StructField("store", ArrayType(
          StructType(Array(
            StructField("location", StringType, nullable = true),
            StructField("name", StringType, nullable = true)
          ))
        ))
      )))
    ))

    try {
      // `spark.read` is the supported entry point; `spark.sqlContext.read`
      // is a legacy pre-2.0 accessor.
      val df = spark.read
        .format("com.databricks.spark.xml")
        .option("rowTag", "book")
        .schema(customSchema)
        .load("data/books_complex.xml")

      df.printSchema()
      df.show()

      df.foreach { row =>
        println(s"${row.getAs[String]("author")},${row.getAs[String]("_id")}")
        // Access nested structs through the public Row API rather than the
        // internal Catalyst class GenericRowWithSchema.
        println(row.getAs[Row]("otherInfo").getAs[String]("country"))
        println(row.getStruct(7).getClass) // field 7 = otherInfo
        // field 8 = stores; its first (only) field is the `store` array.
        val stores = row.getStruct(8).getList[Row](0)
        for (i <- 0 until stores.size) {
          val store = stores.get(i)
          println(s"${store.getAs[String]("name")},${store.getAs[String]("location")}")
        }
      }
    } finally {
      // Always release the local SparkContext, even if the job fails.
      spark.stop()
    }
  }
}

程序打印出的对应 schema 如下:

root
 |-- _id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- price: double (nullable = true)
 |-- publish_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- otherInfo: struct (nullable = true)
 |    |-- pagesCount: string (nullable = true)
 |    |-- language: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- addressline1: string (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |-- stores: struct (nullable = true)
 |    |-- store: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- location: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
<dependency>
     <groupId>com.databricks</groupId>
     <artifactId>spark-xml_2.11</artifactId>
     <version>0.9.0</version>
 </dependency>

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值