Plotting with Pyecharts

import pandas as pd
 
# Convert the .csv file into a tab-separated .txt file
data = pd.read_csv('/home/hadoop/us-counties.csv')
# open with 'w' so re-running the script does not append duplicate rows
with open('/home/hadoop/us-counties.txt','w',encoding='utf-8') as f:
    for line in data.values:
        f.write(str(line[0])+'\t'+str(line[1])+'\t'
                +str(line[2])+'\t'+str(line[3])+'\t'+str(line[4])+'\n')
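
A quick optional check (not part of the original script) to confirm that the conversion produced tab-separated rows:

# print the first converted row from the file written above
with open('/home/hadoop/us-counties.txt', encoding='utf-8') as f:
    print(f.readline())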

from pyspark import SparkConf,SparkContext
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from datetime import date, datetime
from pyspark.sql import functions as F

from pyecharts.charts import Bar,Line,Grid,WordCloud
from pyecharts import options as opts
from pyecharts.components import Table
from pyecharts.options import ComponentTitleOpts
from pyecharts.charts import PictorialBar
from pyecharts.globals import SymbolType
from pyecharts.charts import Pie
from pyecharts.charts import Funnel
from pyecharts.charts import Scatter

def toDate(inputStr):
    # Zero-pad single-digit month and day (e.g. "2020-1-1" -> "2020-01-01"),
    # then parse the result into a datetime object
    newStr = ""
    if len(inputStr) == 8:
        s1 = inputStr[0:4]
        s2 = inputStr[5:6]
        s3 = inputStr[7]
        newStr = s1+"-"+"0"+s2+"-"+"0"+s3
    else:
        s1 = inputStr[0:4]
        s2 = inputStr[5:6]
        s3 = inputStr[7:]
        newStr = s1+"-"+"0"+s2+"-"+s3
    # use a name other than `date` to avoid shadowing the datetime.date import
    parsed = datetime.strptime(newStr, "%Y-%m-%d")
    return parsed
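
For reference, a minimal usage sketch of toDate, assuming the date strings use single-digit months as in this dataset (e.g. "2020-1-1", "2020-1-15"):

print(toDate("2020-1-1"))    # 2020-01-01 00:00:00
print(toDate("2020-1-15"))   # 2020-01-15 00:00:00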
 

spark = SparkSession.builder.config(conf = SparkConf()).getOrCreate()
fields = [StructField("date", DateType(),False),StructField("county", StringType(),False),StructField("state", StringType(),False),
                    StructField("cases", IntegerType(),False),StructField("deaths", IntegerType(),False),]
schema = StructType(fields)
 
rdd0 = spark.sparkContext.textFile("file:///home/hadoop/us-counties.txt")
rdd1 = rdd0.map(lambda x:x.split("\t")).map(lambda p: Row(toDate(p[0]),p[1],p[2],int(p[3]),int(p[4])))

df = spark.createDataFrame(rdd1,schema)
df.createOrReplaceTempView("usInfo")
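
An optional sanity check (not in the original script) to confirm the schema and a few rows before querying:

df.printSchema()
df.show(3)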

'''#1. Cumulative confirmed cases and deaths per day
df1 = df.groupBy("date")\
        .agg(F.sum("cases").alias("cases"),F.sum("deaths").alias("deaths"))\
        .sort(df["date"].asc())

df1.repartition(1).createOrReplaceTempView("ustotal")
date_row=df1.select('date').collect()
date_list=[r.date for r in date_row]

cases_row=df1.select('cases').collect()
cases_list=[r.cases for r in cases_row]

deaths_row=df1.select('deaths').collect()
deaths_list=[r.deaths for r in deaths_row]

d=(
    Bar()
    .add_xaxis(date_list)
    .add_yaxis("累计确诊人数", cases_list,stack='stack1')
    .add_yaxis("累计死亡人数", deaths_list,stack='stack1')    
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(title_opts=opts.TitleOpts(title="美国每日累计确诊和死亡人数"))
    .render("/home/hadoop/result/美国每日的累计确诊病例数和死亡数.html")
)
'''
""""df2 = spark.sql("select t1.date,t1.cases-t2.cases as caseIncrease,\
                        t1.deaths-t2.deaths as deathIncrease \
                            from ustotal t1,ustotal t2 \
                                where t1.date = date_add(t2.date,1)")

df2=df2.orderBy(df2["date"]).repartition(1)

date_row=df2.select('date').collect()
date_list=[r.date for r in date_row]

caseIncrease_row=df2.select('caseIncrease').collect()
caseIncrease_list=[r.caseIncrease for r in caseIncrease_row]

deathIncrease_row=df2.select('deathIncrease').collect()
deathIncrease_list=[r.deathIncrease for r in deathIncrease_row]"""

"""(
    Line(init_opts=opts.InitOpts(width="1600px", height="800px"))
    .add_xaxis(xaxis_data=date_list)
    .add_yaxis(
        series_name="新增确诊",
        y_axis=caseIncrease_list,
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="美国每日新增确诊折线图", subtitle=""),
    )
    .render("/home/hadoop/result/美国每日新增确诊折线图.html")
)"""
""""(
    Line(init_opts=opts.InitOpts(width="1600px", height="800px"))
    .add_xaxis(xaxis_data=date_list)
    .add_yaxis("新增死亡",
        y_axis=deathIncrease_list,
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="美国每日新增死亡病例折线图"),
    )
    .render("/home/hadoop/result/美国每日新增死亡病例折线图.html")
)"""

# Cumulative cases, deaths and death rate for each state
df3 = df.groupBy(df.state)\
        .agg(F.sum('cases').alias('totalCases'),\
             F.sum('deaths').alias('totalDeaths'))
# death rate = total deaths / total cases
df3 = df3.withColumn('deathRate', df3['totalDeaths'] / df3['totalCases'])
df3 = df3.sort(df3["totalCases"].desc()).repartition(1)
df3.createOrReplaceTempView("eachStateInfo")

# df3.limit(10).show()

# df9=df.groupBy('date','state').agg(F.sum('cases').alias('totalCases'))
# df9.createOrReplaceTempView("eachStateInfo")
# df9.limit(10).show()
'''# Overview table of the per-state statistics
state_row = df3.select('state','totalCases','totalDeaths','deathRate').collect()
arow = [list(r) for r in state_row]   # convert Row objects to plain lists
table = Table()
headers = ["State name", "Total cases", "Total deaths", "Death rate"]
table.add(headers, arow)
table.set_global_opts(
        title_opts=ComponentTitleOpts(title="美国各州疫情一览")
)
table.render("/home/hadoop/result/美国各州疫情一览图.html")'''

"""df5=df3.select(df3.state,df3.totalDeaths)
df5=df5.orderBy(df5["totalDeaths"].desc()).limit(10)
#df5.show()
d_state_row=df5.select('state','totalDeaths').collect()
d_state_list=[r.state for r in d_state_row]
d_total_list=[r.totalDeaths for r in d_state_row]
c = (
    PictorialBar()
    .add_xaxis(d_state_list)
    .add_yaxis(
        "",
        d_total_list,
        label_opts=opts.LabelOpts(is_show=False),
        symbol_size=18,
        symbol_repeat="fixed",
        symbol_offset=[0, 0],
        is_symbol_clip=True,
        symbol=SymbolType.ROUND_RECT,
    )
    .reversal_axis()
    .set_global_opts(
        title_opts=opts.TitleOpts(title="PictorialBar-美国各州死亡人数Top10"),
    )
    .render("/home/hadoop/result/美国各州死亡人数Top10.html")
)"""

#4. Find the 10 US states with the most confirmed cases (word cloud)
"""df4=df3.select(df3.state,df3.totalCases)
df4=df4.orderBy(df4["totalCases"].desc()).limit(10)

state_row=df4.select('state').collect()
state_list=[r.state for r in state_row]

total_row=df4.select('totalCases').collect()
total_list=[r.totalCases for r in total_row]
m=list(zip(state_list,total_list))

wordcloud = WordCloud()
wordcloud.add("", m,word_size_range=[20, 100])
wordcloud.render("/home/hadoop/result/wordcount.html")
"""

""""df6 =df3.select(df3.state,df3.totalCases)
df6=df6.orderBy(df6["totalCases"].asc()).limit(10)

shao_state_row=df6.select('state').collect()
shao_state_list=[r.state for r in shao_state_row]

shao_total_row=df6.select('totalCases').collect()
shao_total_list=[r.totalCases for r in shao_total_row]

minm=list(zip(shao_state_list,shao_total_list))

wordcloud = WordCloud()
wordcloud.add("", minm,word_size_range=[20, 100])
wordcloud.render("/home/hadoop/result/shao_wordcount.html")"""

""""df7=df3.select(df3.state,df3.totalDeaths)
df7=df7.orderBy(df7["totalDeaths"].asc()).limit(10)

shaod_state_row=df7.select('state').collect()
shaod_state_list=[r.state for r in shaod_state_row]
shaod_total_row=df7.select('totalDeaths').collect()
shaod_total_list=[r.totalDeaths for r in shaod_total_row]

dmin=list(zip(shaod_state_list,shaod_total_list))
c = (
    Funnel()
    .add(
        "State",
        dmin,
        sort_="ascending",
        label_opts=opts.LabelOpts(position="inside"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title=""))
    .render("/home/hadoop/result/漏斗图.html")
)"""

#8. Death rate for the whole US and for each state as of May 19
df8 = spark.sql("select 1 as sign,'USA' as state,\
                    round(sum(totalDeaths)/sum(totalCases),4) \
                         as deathRate from eachStateInfo \
                            union select 2 as sign,state,deathRate \
                                from eachStateInfo").cache()
df8=df8.sort(df8["sign"].asc(),df8["deathRate"].desc()).repartition(1)
values=[]
usa_dr=df8.select("deathRate").where(df8.state=='USA').collect()
usa_list=[r.deathRate for r in usa_dr]
death=usa_list[0]*100
nodeath=100-death
values.append(["Death(%)",round(float(death),2)])
values.append(["No-Death(%)",round(float(nodeath),2)])
c = (
    Pie()
    .add("", values)
    .set_colors(["blcak","orange"])
    .set_global_opts(title_opts=opts.TitleOpts(title="全美的病死率"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    .render("/home/hadoop/result/全美病死率的饼状图.html")
    )
