解析json数组的一些操作~做个笔记,备查
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *
spark = SparkSession.builder.appName("pysaprk").getOrCreate()
构造测试数据
import re
import json
import pandas as pd
json_str = '{"class":"三班","nums":40,"data":[{"name":"小明","age":10},{"name":"小红","age":9}],"data1":{"jack":10,"leo":9}}'
spark_df = spark.createDataFrame(pd.DataFrame({'x': [json_str]}))
spark_df.printSchema()
spark_df.show(truncate=False)
spark_df.createOrReplaceTempView('test')
root
|-- x: string (nullable = true)
+----------------------------------------------------------------------------------------------------------------+
|x |
+----------------------------------------------------------------------------------------------------------------+
|{"class":"三班","nums":40,"data":[{"name":"小明","age":10},{"name":"小红","age":9}],"data1":{"jack":10,"leo":9}}|
+----------------------------------------------------------------------------------------------------------------+
get_json_object和json_tuple的使用
简单k-v格式的提取,废话不多说,看码:
sql = """
SELECT
get_json_object(x,'$.class') as class,
get_json_object(x,'$.nums') as nums,
get_json_object(x,'$.data1.jack') as jack_age,
json_tuple(x,'class','nums','data1','data1.leo') as (class1,nums1,data1,jack_age1)
FROM test
"""
ddd = spark.sql(sql)
ddd.printSchema()
ddd.show(truncate=True)
root
|-- class: string (nullable = true)
|-- nums: string (nullable = true)
|-- jack_age: string (nullable = true)
|-- class1: string (nullable = true)
|-- nums1: string (nullable = true)
|-- data1: string (nullable = true)
|-- jack_age1: string (nullable = true)
+-----+----+--------+------+-----+-------------------+---------+
|class|nums|jack_age|class1|nums1| data1|jack_age1|
+-----+----+--------+------+-----+-------------------+---------+
| 三班| 40| 10| 三班| 40|{"jack":10,"leo":9}| null|
+-----+----+--------+------+-----+-------------------+---------+
- json_tuple只能解析第一层key,但能同时解析多个字段,少打点字儿
- get_json_object支持嵌套、取多层的操作
解析json数组
pd.DataFrame(json.loads(json_str)['data'])
name | age | |
---|---|---|
0 | 小明 | 10 |
1 | 小红 | 9 |
json数组一行转多行
json.loads(json_str)
{'class': '三班',
'nums': 40,
'data': [{'name': '小明', 'age': 10}, {'name': '小红', 'age': 9}],
'data1': {'jack': 10, 'leo': 9}}
json.loads(json_str)['data']
[{'name': '小明', 'age': 10}, {'name': '小红', 'age': 9}]
- 获取data元素,是个大字符串
- 正则替换,先把[]替换掉
- 再把
},{
替换成}_ff_{
,为下步按照_ff_
split做准备 - split之后是数组,explode一行变多行,每一行按照json解析处理
sql = """
SELECT
data,
j_column,
get_json_object(j_column,'$.name') as name,
get_json_object(j_column,'$.age') as age
FROM
(
SELECT
split(regexp_replace(regexp_replace(get_json_object(x,'$.data'), '\\\\[|\\\\]',''),'\\\\},\\\\{','}_ff_{'),'_ff_')AS data
FROM test
) as a
lateral view explode(data) b as j_column
"""
ddd = spark.sql(sql)
ddd.printSchema()
ddd.show(truncate=False)
root
|-- data: array (nullable = true)
| |-- element: string (containsNull = true)
|-- j_column: string (nullable = true)
|-- name: string (nullable = true)
|-- age: string (nullable = true)
+---------------------------------------------------+------------------------+----+---+
|data |j_column |name|age|
+---------------------------------------------------+------------------------+----+---+
|[{"name":"小明","age":10}, {"name":"小红","age":9}]|{"name":"小明","age":10}|小明|10 |
|[{"name":"小明","age":10}, {"name":"小红","age":9}]|{"name":"小红","age":9} |小红|9 |
+---------------------------------------------------+------------------------+----+---+
一组json的行转列
{'jack': 10, 'leo': 9}
行转列,且增加列名
sql = """
SELECT
j_column,
split(j_column,':')[0] as name,
split(j_column,':')[1] as age
FROM
(
SELECT
split(regexp_replace(get_json_object(x,'$.data1'), '\\\\{|\\\\}',''),',')AS data
FROM test
) as a
lateral view explode(data) b as j_column
"""
ddd = spark.sql(sql)
ddd.printSchema()
ddd.show(truncate=False)
root
|-- j_column: string (nullable = true)
|-- name: string (nullable = true)
|-- age: string (nullable = true)
+---------+------+---+
|j_column |name |age|
+---------+------+---+
|"jack":10|"jack"|10 |
|"leo":9 |"leo" |9 |
+---------+------+---+
按序解析json
json数组有多个元素,每组元素按序排列,解析之后顺序需一一对应,和数组元素顺序保持一致
- posexplode获取数组内元素顺序
- 根据顺序id关联即可
json_str_sort = '{"name": ["小明", "小红", "小白"], "age": [15, 16, 17]}'
spark_df_sort = spark.createDataFrame(pd.DataFrame({'x': [json_str_sort]}))
spark_df_sort.show(truncate=False)
+-------------------------------------------------------+
|x |
+-------------------------------------------------------+
|{"name": ["小明", "小红", "小白"], "age": [15, 16, 17]}|
+-------------------------------------------------------+
spark_df_sort.createOrReplaceTempView('test_sort')
sql = """
with json_table as (
SELECT
get_json_object(x,'$.name') AS name,
get_json_object(x,'$.age') AS age
FROM test_sort
)
select t1.name,t1.age,t1.pos1,t1.name1,t2.pos2,t2.age1
from
(SELECT *
FROM json_table
LATERAL VIEW posexplode(split(regexp_replace(name,'\\]|\\[|\\"',''),',')) tt1 AS pos1,name1
) as t1
left join
(SELECT *
FROM json_table
LATERAL VIEW posexplode(split(regexp_replace(age,'\\]|\\[|\\"',''),',')) tt2 AS pos2,age1
) as t2 ON t1.pos1=t2.pos2
order by t1.pos1
"""
sql = sql.replace("\\", '\\\\')
ddd1 = spark.sql(sql)
ddd1.printSchema()
ddd1.show(truncate=False)
root
|-- name: string (nullable = true)
|-- age: string (nullable = true)
|-- pos1: integer (nullable = false)
|-- name1: string (nullable = true)
|-- pos2: integer (nullable = true)
|-- age1: string (nullable = true)
+----------------------+----------+----+-----+----+----+
|name |age |pos1|name1|pos2|age1|
+----------------------+----------+----+-----+----+----+
|["小明","小红","小白"]|[15,16,17]|0 |小明 |0 |15 |
|["小明","小红","小白"]|[15,16,17]|1 |小红 |1 |16 |
|["小明","小红","小白"]|[15,16,17]|2 |小白 |2 |17 |
+----------------------+----------+----+-----+----+----+
常用的解析json场景,后续有新场景,再更
2022-11-26 阴 于南京市江宁区
update 2023-07-25 暴雨 于南京市江宁区