import pandas as pd

# Convert the .csv source data into a tab-separated .txt file
data = pd.read_csv('/home/hadoop/us-counties.csv')
with open('/home/hadoop/us-counties.txt', 'a+', encoding='utf-8') as f:
    for line in data.values:
        f.write((str(line[0]) + '\t' + str(line[1]) + '\t'
                 + str(line[2]) + '\t' + str(line[3]) + '\t' + str(line[4]) + '\n'))
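
# The resulting us-counties.txt holds five tab-separated fields per line:
# date, county, state, cases, deaths (matching the Spark schema defined below).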
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import date, datetime
from pyecharts import options as opts
from pyecharts.charts import Bar, Line, Grid, WordCloud, PictorialBar, Pie, Funnel, Scatter
from pyecharts.components import Table
from pyecharts.options import ComponentTitleOpts
from pyecharts.globals import SymbolType
def toDate(inputStr):
    # Normalize a date string with a single-digit month (e.g. "2020-4-6" or
    # "2020-4-16") into zero-padded "YYYY-MM-DD" and parse it into a datetime.
    newStr = ""
    if len(inputStr) == 8:
        s1 = inputStr[0:4]
        s2 = inputStr[5:6]
        s3 = inputStr[7]
        newStr = s1 + "-" + "0" + s2 + "-" + "0" + s3
    else:
        s1 = inputStr[0:4]
        s2 = inputStr[5:6]
        s3 = inputStr[7:]
        newStr = s1 + "-" + "0" + s2 + "-" + s3
    return datetime.strptime(newStr, "%Y-%m-%d")
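# Illustrative examples, assuming input strings with a single-digit month as the
# parser above expects (hypothetical values, not taken from the dataset):
#   toDate("2020-4-6")  -> datetime(2020, 4, 6, 0, 0)
#   toDate("2020-4-16") -> datetime(2020, 4, 16, 0, 0)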
spark = SparkSession.builder.config(conf = SparkConf()).getOrCreate()
fields = [StructField("date", DateType(),False),StructField("county", StringType(),False),StructField("state", StringType(),False),
StructField("cases", IntegerType(),False),StructField("deaths", IntegerType(),False),]
schema = StructType(fields)
rdd0 = spark.sparkContext.textFile("file:home/hadoop/us-counties.txt")
rdd1 = rdd0.map(lambda x:x.split("\t")).map(lambda p: Row(toDate(p[0]),p[1],p[2],int(p[3]),int(p[4])))
df = spark.createDataFrame(rdd1,schema)
df.createOrReplaceTempView("usInfo")
'''#1. Compute the daily cumulative confirmed cases and deaths
df1 = df.groupBy("date")\
.agg(F.sum("cases").alias("cases"),F.sum("deaths").alias("deaths"))\
.sort(df["date"].asc())
df1.repartition(1).createOrReplaceTempView("ustotal")
date_row=df1.select('date').collect()
date_list=[r.date for r in date_row]
cases_row=df1.select('cases').collect()
cases_list=[r.cases for r in cases_row]
deaths_row=df1.select('deaths').collect()
deaths_list=[r.deaths for r in deaths_row]
d=(
Bar()
.add_xaxis(date_list)
.add_yaxis("累计确诊人数", cases_list,stack='stack1')
.add_yaxis("累计死亡人数", deaths_list,stack='stack1')
.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
.set_global_opts(title_opts=opts.TitleOpts(title="美国每日累计确诊和死亡人数"))
.render("/home/hadoop/result/美国每日的累计确诊病例数和死亡数.html")
)
'''
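# Daily new confirmed cases and deaths, computed by self-joining the daily
# totals in ustotal on consecutive dates.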
""""df2 = spark.sql("select t1.date,t1.cases-t2.cases as caseIncrease,\
t1.deaths-t2.deaths as deathIncrease \
from ustotal t1,ustotal t2 \
where t1.date = date_add(t2.date,1)")
df2=df2.orderBy(df2["date"]).repartition(1)
date_row=df2.select('date').collect()
date_list=[r.date for r in date_row]
caseIncrease_row=df2.select('caseIncrease').collect()
caseIncrease_list=[r.caseIncrease for r in caseIncrease_row]
deathIncrease_row=df2.select('deathIncrease').collect()
deathIncrease_list=[r.deathIncrease for r in deathIncrease_row]"""
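# Line chart of daily new confirmed cases (uses date_list and caseIncrease_list
# computed in the block above).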
"""(
Line(init_opts=opts.InitOpts(width="1600px", height="800px"))
.add_xaxis(xaxis_data=date_list)
.add_yaxis(
series_name="新增确诊",
y_axis=caseIncrease_list,
)
.set_global_opts(
title_opts=opts.TitleOpts(title="美国每日新增确诊折线图", subtitle=""),
)
.render("/home/hadoop/result/美国每日新增确诊折线图.html")
)"""
""""(
Line(init_opts=opts.InitOpts(width="1600px", height="800px"))
.add_xaxis(xaxis_data=date_list)
.add_yaxis("新增死亡",
y_axis=deathIncrease_list,
)
.set_global_opts(
title_opts=opts.TitleOpts(title="美国每日新增死亡病例折线图"),
)
.render("/home/hadoop/result/美国每日新增死亡病例折线图.html")
)"""
df3 = df.groupBy(df.state)\
    .agg(F.sum('cases').alias('totalCases'),
         F.sum('deaths').alias('totalDeaths'))
df3 = df3.withColumn('deathRate', df3['totalDeaths'] / df3['totalCases'])
df3 = df3.sort(df3["totalCases"].desc()).repartition(1)
df3.createOrReplaceTempView("eachStateInfo")
# df3.limit(10).show()
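# Render an HTML table listing each state's total cases, total deaths and death rate.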
'''state_row=df3.select('state','totalCases','totalDeaths','deathRate').collect()
arow = [list(r) for r in state_row]
table = Table()
headers = ["State name", "Total cases", "Total deaths", "Death rate"]
table.add(headers, arow)
table.set_global_opts(
title_opts=ComponentTitleOpts(title="美国各州疫情一览")
)
table.render("/home/hadoop/result/美国各州疫情一览图.html")'''
"""df5=df3.select(df3.state,df3.totalDeaths)
df5=df5.orderBy(df5["totalDeaths"].desc()).limit(10)
#df5.show()
d_state_row=df5.select('state','totalDeaths').collect()
d_state_list=[r.state for r in d_state_row]
d_total_list=[r.totalDeaths for r in d_state_row]
# d_total_row=df5.select('totalDeaths').collect()
# d_total_list=[r.totalDeaths for r in d_total_row]
c = (
PictorialBar()
.add_xaxis(d_state_list)
.add_yaxis(
"",
d_total_list,
label_opts=opts.LabelOpts(is_show=False),
symbol_size=18,
symbol_repeat="fixed",
symbol_offset=[0, 0],
is_symbol_clip=True,
symbol=SymbolType.ROUND_RECT,
)
.reversal_axis()
.set_global_opts(
title_opts=opts.TitleOpts(title="PictorialBar-美国各州死亡人数Top10"),
)
.render("/home/hadoop/result/美国各州死亡人数Top10.html")
)"""
#4. Find the 10 US states with the most confirmed cases
"""df4=df3.select(df3.state,df3.totalCases)
df4=df4.orderBy(df4["totalCases"].desc()).limit(10)
state_row=df4.select('state').collect()
state_list=[r.state for r in state_row]
total_row=df4.select('totalCases').collect()
total_list=[r.totalCases for r in total_row]
m=list(zip(state_list,total_list))
wordcloud = WordCloud()
wordcloud.add("", m,word_size_range=[20, 100])
wordcloud.render("/home/hadoop/result/wordcount.html")
"""
""""df6 =df3.select(df3.state,df3.totalCases)
df6=df6.orderBy(df6["totalCases"].asc()).limit(10)
shao_state_row=df6.select('state').collect()
shao_state_list=[r.state for r in shao_state_row]
shao_total_row=df6.select('totalCases').collect()
shao_total_list=[r.totalCases for r in shao_total_row]
minm=list(zip(shao_state_list,shao_total_list))
wordcloud = WordCloud()
wordcloud.add("", minm,word_size_range=[20, 100])
wordcloud.render("/home/hadoop/result/shao_wordcount.html")"""
""""df7=df3.select(df3.state,df3.totalDeaths)
df7=df7.orderBy(df7["totalDeaths"].asc()).limit(10)
shaod_state_row=df7.select('state').collect()
shaod_state_list=[r.state for r in shaod_state_row]
shaod_total_row=df7.select('totalDeaths').collect()
shaod_total_list=[r.totalDeaths for r in shaod_total_row]
dmin=list(zip(shaod_state_list,shaod_total_list))
c = (
Funnel()
.add(
"State",
dmin,
sort_="ascending",
label_opts=opts.LabelOpts(position="inside"),
)
.set_global_opts(title_opts=opts.TitleOpts(title=""))
.render("/home/hadoop/result/漏斗图.html")
)"""
#8. Compute the case-fatality rate for the whole US and for each state as of May 19
df8 = spark.sql("select 1 as sign,'USA' as state,\
round(sum(totalDeaths)/sum(totalCases),4) \
as deathRate from eachStateInfo \
union select 2 as sign,state,deathRate \
from eachStateInfo").cache()
df8=df8.sort(df8["sign"].asc(),df8["deathRate"].desc()).repartition(1)
values=[]
usa_dr=df8.select("deathRate").where(df8.state=='USA').collect()
usa_list=[r.deathRate for r in usa_dr]
death=usa_list[0]*100
nodeath=100-death
values.append(["Death(%)",round(float(death),2)])
values.append(["No-Death(%)",round(float(nodeath),2)])
c = (
Pie()
.add("", values)
.set_colors(["blcak","orange"])
.set_global_opts(title_opts=opts.TitleOpts(title="全美的病死率"))
.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
.render("/home/hadoop/result/全美病死率的饼状图.html")
)
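
# Optionally release Spark resources when the analysis is finished:
# spark.stop()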