# PySpark: read a JSON file, clean/filter it, and write the result back out as JSON.
from pyspark.sql import SparkSession
def getSqlAndSpark():
    """Build and return a SparkSession.

    (A SQL-context getter was also planned but never written; unused for now.)

    :return: the active :class:`SparkSession`
    """
    builder = SparkSession.builder.appName('microsoft')
    return builder.getOrCreate()
def guolv(x):
    """Keep only rows whose ``key`` field is exactly 'Radio astronomy'.

    :param x: row-like object supporting ``x['key']`` item access
    :return: ``x`` when it matches, otherwise ``None``
    """
    return x if x['key'] == 'Radio astronomy' else None
def read_json(spark):
    """Read ``data.json``, keep only rows whose ``key`` is 'Radio astronomy',
    and write the surviving rows to ``filter_data.json``.

    On the cluster the paths were ``/data2/data.json`` and
    ``/data2/filter_data.json``; local relative paths are used here.

    :param spark: an active SparkSession
    """
    data = spark.read.json('data.json')
    # NOTE(review): removed debug `data.foreach(lambda x: print(x))` — on a
    # cluster that prints on the executors, not the driver, and it forces a
    # full extra pass over the data.
    # guolv returns None for non-matching rows, so drop those before
    # rebuilding a DataFrame (is-not-None, per identity-comparison idiom).
    kept = data.rdd.map(guolv).filter(lambda x: x is not None)
    schem_data = spark.createDataFrame(kept)
    schem_data.write.json('filter_data.json')
def main():
    """Entry point: create the Spark session and run the JSON cleaning job."""
    session = getSqlAndSpark()
    read_json(session)
# Run the pipeline only when executed as a script (not on import).
if __name__ == '__main__':
    main()