我有一个如下所示的架构(schema)。我的目标是解析其中的嵌套对象,并将结果加载到 Hive 表中。到目前为止我已经写了下面这段代码,运行环境是 Spark 2.2.0.2.6.4.0-91。我需要有关这段代码的帮助;如果有人可以帮忙,我已在下面附上了初始代码。
root
|-- CustData: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- TimeStamp: double (nullable = true)
| | |-- Value_x: double (nullable = true)
| | |-- Value_y: double (nullable = true)
| | |-- Value_z: double (nullable = true)
|-- Cust_ID: string (nullable = true)
|-- Deprt_ID: string (nullable = true)
|-- EndTime: double (nullable = true)
|-- EndTimeZone: string (nullable = true)
|-- Salesd: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- Salesd_Value1: long (nullable = true)
| | |-- Salesd_Value2: long (nullable = true)
| | |-- Salesd_Value3: double (nullable = true)
| | |-- Salesd_Value4: double (nullable = true)
|-- Cust_RespData: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- TimeStamp: double (nullable = true)
| | |-- Cust_RespData_val1: double (nullable = true)
| | |-- Cust_RespData_val1: double (nullable = true)
| | |-- Cust_RespData_val1: double (nullable = true)
| | |-- Cust_RespData_val1: double (nullable = true)
|-- Cust_RespData_ID: string (nullable = true)
#!/bin/python2
from pyspark import SparkContext
from pyspark.sql import SparkSession
##from pyspark.sql.functions import get_json_object
from pyspark.sql.functions import *
from pyspark.sql.types import StringType
import json
# Flatten the nested "Salesd" array of the customer-sales JSON file and
# persist the flattened rows as a Hive table.
#
# Fixes relative to the first draft:
#   * select(...) was missing its opening parenthesis (SyntaxError).
#   * .alias("Salesd_ROW") was applied to the column *inside* explode(), so
#     the exploded column kept its default name and the later
#     "Salesd_ROW.*" references could not be resolved.
#   * The session was created without Hive support and nothing was written.

# Target Hive table; qualify with a database ("db.table") if needed.
HIVE_TABLE = "cust_salesd"

# define context
# enableHiveSupport() connects the session to the Hive metastore, which
# saveAsTable() below relies on to register the table in Hive.
spark = (
    SparkSession.builder
    .appName("cust_salesd_to_hive")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext  # kept so that code expecting `sc` still works

try:
    # load sources
    # NOTE(review): "loaclhost" looks like a typo for "localhost" -- confirm
    # before running; the path is intentionally left unchanged here.
    jsonFile = "hdfs://loaclhost/data/cust_salesd.json"
    sd = spark.read.json(jsonFile)
    sd.printSchema()

    # Top-level columns carried through unchanged. Selecting a column that is
    # absent from the inferred schema raises AnalysisException, so only the
    # ones actually present are kept (the posted schema shows no "DriverID").
    wanted = ["CustData", "Cust_ID", "Deprt_ID", "DriverID", "EndTime", "EndTimeZone"]
    present = [name for name in wanted if name in sd.columns]
    missing = [name for name in wanted if name not in sd.columns]
    if missing:
        print("Skipping columns not found in input: %s" % ", ".join(missing))

    # One output row per element of the "Salesd" array. The alias must be on
    # the result of explode(), not on its argument.
    sdf = sd.select(*(present + [explode(col("Salesd")).alias("Salesd_ROW")]))
    sdf.show()

    # Promote the struct fields of each exploded element to top-level columns.
    sales_fields = [
        "Salesd_ROW.Salesd_Value1",
        "Salesd_ROW.Salesd_Value2",
        "Salesd_ROW.Salesd_Value3",
        "Salesd_ROW.Salesd_Value4",
    ]
    sdf1 = sdf.select(*(present + sales_fields))
    sdf1.show()

    # Load into Hive. "overwrite" replaces any existing table contents; use
    # "append" instead to add rows to an existing table.
    sdf1.write.mode("overwrite").saveAsTable(HIVE_TABLE)
finally:
    # Always release cluster resources, even if a step above failed.
    spark.stop()