Read a CSV file, merge province/city/district into a full address, geocode it to longitude/latitude, convert the date, and save the result as a single CSV file (on a Linux server)
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql.functions import concat
from pyspark.sql.types import DoubleType
# used to call the AMap (Gaode) geocoding API
import requests
import json
if __name__ == '__main__':
    # 1) Create the SparkSession (entry point for the DataFrame API;
    # the commented lines show the older SparkContext-style setup)
    # conf = SparkConf().setMaster("local[*]").setAppName("merge")
    # sc = SparkContext(conf=conf)
    ss = SparkSession \
        .builder \
        .appName("merge") \
        .getOrCreate()

    # 2) Read the raw risk data from HDFS (all columns come in as strings)
    df = ss.read.options(header='True').csv("hdfs://hadoop102:8020/covid-19-data/riskdata.csv")
    # df.printSchema()
    df.show()
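    # An explicit schema is an alternative to header-only inference. The column
    # names below are partly assumptions: province/city/county/street/timestamp
    # are referenced later in this script, while "risk_level" is a hypothetical
    # name for the remaining column in riskdata.csv.
    # from pyspark.sql.types import StructType, StructField, StringType
    # schema = StructType([StructField(c, StringType(), True) for c in
    #                      ["province", "city", "county", "street", "risk_level", "timestamp"]])
    # df = ss.read.option("header", True).schema(schema) \
    #     .csv("hdfs://hadoop102:8020/covid-19-data/riskdata.csv")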
    # 3) Merge province/city/county/street into one full address column
    df_concat = df.withColumn("address", concat(df['province'], df['city'],
                                                df['county'], df['street']))
    # df_concat = df_concat.withColumn("longitude", F.lit(0))
    # df_concat = df_concat.withColumn("latitude", F.lit(0))
    longitude = []
    latitude = []
    df_concat.show()
    # collect() pulls every row to the driver; fine for a small risk-area
    # table, but it does not scale to large datasets
    rows = df_concat.collect()
    # 4) Geocode each full address to a longitude/latitude pair via the
    # AMap (Gaode) geocoding API; the part of the URL before the query
    # string is fixed
    url = 'https://restapi.amap.com/v3/geocode/geo'
    for row in rows:
        # Put the two request parameters into a dict
        params = {'key': 'gaodekey',  # apply for your own AMap API key
                  'address': row['address']}
        res = requests.get(url, params=params)
        # print(res.text)
        # The response body is JSON; parse it into a dict
        jd = json.loads(res.text)
        jingweidu = jd['geocodes'][0].get('location')  # "lng,lat" string
        jingdu, weidu = jingweidu.split(",")           # jingdu = longitude, weidu = latitude
        longitude.append(jingdu)
        latitude.append(weidu)
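    # The loop above assumes every request succeeds and every address resolves.
    # A more defensive lookup is sketched below; the helper name
    # geocode_address and the 5-second timeout are assumptions, and the AMap
    # API reports success with status == '1':
    # def geocode_address(address, key):
    #     res = requests.get('https://restapi.amap.com/v3/geocode/geo',
    #                        params={'key': key, 'address': address}, timeout=5)
    #     res.raise_for_status()
    #     jd = res.json()
    #     if jd.get('status') == '1' and jd.get('geocodes'):
    #         return jd['geocodes'][0]['location'].split(',')
    #     return None, None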
    # 5) Attach the coordinates to the DataFrame. Zipping against the rows
    # collected above keeps each coordinate aligned with its address.
    # (Row field order may vary across Spark versions, so columns are
    # referred to by name from here on.)
    rows_with_coords = [Row(**row.asDict(), longitude=lon, latitude=lat)
                        for row, lon, lat in zip(rows, longitude, latitude)]
    df_concat = ss.createDataFrame(rows_with_coords)
    df_concat.printSchema()

    # 6) Convert data types: coordinates to double, the timestamp string to a
    # normalized "yyyy-MM-dd HH:mm:ss" date string
    df1 = df_concat.withColumn("longitude", df_concat['longitude'].cast('double')) \
        .withColumn('latitude', df_concat['latitude'].cast(DoubleType())) \
        .withColumn("date",
                    F.from_unixtime(F.unix_timestamp("timestamp", "yyyy-M-d HH:mm"), "yyyy-MM-dd HH:mm:ss"))
    # .withColumn("date", F.to_date(F.from_unixtime(F.unix_timestamp("timestamp", "yyyy-M-d HH:mm"), "yyyy-MM-dd HH:mm"), "yyyy-MM-dd HH:mm"))
    df1.show()
    df1.printSchema()
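    # Alternative: parse straight to a real timestamp column instead of a
    # formatted string, which keeps the column typed as timestamp (assumes
    # Spark >= 2.2, where to_timestamp accepts a format pattern):
    # df1 = df1.withColumn("date", F.to_timestamp("timestamp", "yyyy-M-d HH:mm"))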
    # 7) Keep every column except the raw timestamp string (replaced by "date")
    df2 = df1.drop("timestamp")
    df2.show()
    df2.printSchema()

    # 8) Save as a single local CSV file via pandas
    res_pdf = df2.toPandas()
    res_pdf.to_csv('foo1.csv', index=None)
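    # toPandas() copies the whole result into driver memory; for data that
    # should stay on HDFS, a single CSV part file can be written directly
    # (the output path here is a hypothetical example; coalesce(1) avoids the
    # full shuffle that repartition(1) performs, but both funnel all data
    # through one task):
    # df2.coalesce(1).write.mode('overwrite').option('header', True) \
    #     .csv('hdfs://hadoop102:8020/covid-19-data/output')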
    # df2.repartition(1).write.mode('append').format('csv').option('header', True).save('hdfs://hadoop102:8020/output1')
    # properties = {'driver': 'ru.yandex.clickhouse.ClickHouseDriver',
    #               "socket_timeout": "300000",
    #               "rewriteBatchedStatements": "true",
    #               "batchsize": "1000000",
    #               "numPartitions": "8",
    #               'user': 'zny',
    #               'password': '',
    #               'isolationLevel': 'NONE'}
    #
    # df2.write.jdbc(url='jdbc:clickhouse://hadoop001:8123', table='covid-19', mode='append', properties=properties)
    # Release resources
    ss.stop()