写在前面:所谓进阶,就是代码多几条,图像更花里胡哨的意思。但仍有这篇,是希望免去查帮助文档的步骤。
文章目录
内容介绍:
'VIN (1-10)': The 1st 10 characters of each vehicle's Vehicle Identification Number (VIN).
'County': The county in which the registered owner resides.
'City': The city in which the registered owner resides.
'State': The state in which the registered owner resides.
'Postal Code': The 5 digit zip code in which the registered owner resides.
'Model Year': The model year of the vehicle, determined by decoding the Vehicle Identification Number (VIN)
'Make': The manufacturer of the vehicle, determined by decoding the Vehicle Identification Number (VIN)
'Model': The model of the vehicle, determined by decoding the Vehicle Identification Number (VIN).
'Electric Vehicle Type': This distinguishes the vehicle as all electric or a plug-in hybrid.
'Clean Alternative Fuel Vehicle (CAFV) Eligibility': This categorizes vehicle as Clean Alternative Fuel Vehicles (CAFVs) based on the fuel requirement and electric-only range requirement in House Bill 2042 as passed in the 2019 legislative session.
'Electric Range': Describes how far a vehicle can travel purely on its electric charge.
'Base MSRP': This is the lowest Manufacturer's Suggested Retail Price (MSRP) for any trim level of the model in question.
'Legislative District': The specific section of Washington State that the vehicle's owner resides in, as represented in the state legislature.
'DOL Vehicle ID': Unique number assigned to each vehicle by Department of Licensing for identification purposes.
'Vehicle Location': The center of the ZIP Code for the registered vehicle.
'Electric Utility': This is the electric power retail service territories serving the address of the registered vehicle.
准备工作:
写两个常用计数的自定义函数:
(自定义后,自动补全更容易实现)
def vcounts(a):
    """Return how often each distinct value occurs in ``a``.

    Short alias for ``a.value_counts()``; a named helper is easier to
    auto-complete than the method call.

    Args:
        a: Object exposing ``value_counts()``, e.g. a pandas Series.

    Returns:
        The result of ``a.value_counts()``.
    """
    return a.value_counts()
def group_mean(a, b, c):
    """Return the mean of column(s) ``c`` within each group of ``a`` keyed by ``b``.

    Args:
        a: Object exposing ``groupby()``, e.g. a pandas DataFrame.
        b: Grouping key(s), passed straight to ``a.groupby``.
        c: Column label(s) selected from the grouped object before averaging.

    Returns:
        The result of ``a.groupby(b)[c].mean()``.
    """
    return a.groupby(b)[c].mean()
一、清洗数据
1.简化信息
(1)删除无用信息
# Keep only vehicles registered in Washington State. df.State.value_counts()
# reports 181060 rows for WA; the out-of-state registrations are set aside
# for now.
df = df.loc[df['State'].eq('WA')]
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 181060 entries, 0 to 181457
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 VIN (1-10) 181060 non-null object
1 County 181060 non-null object
2 City 181060 non-null object
3 State 181060 non-null object
4 Postal Code 181060 non-null float64
5 Model Year 181060 non-null int64
6 Make 181060 non-null object
7 Model 181060 non-null object
8 Electric Vehicle Type 181060 non-null object
9 Clean Alternative Fuel Vehicle (CAFV) Eligibility 181060 non-null object
10 Electric Range 181060 non-null int64
11 Base MSRP 181060 non-null int64
12 Legislative District 181060 non-null float64
13 DOL Vehicle ID 181060 non-null int64
14 Vehicle Location 181055 non-null object
15 Electric Utility 181060 non-null object
16 2020 Census Tract 181060 non-null float64
dtypes: float64(3), int64(4), object(10)
memory usage: 24.9+ MB
# Columns that play no part in the analysis below.
nouse = ['VIN (1-10)', 'Postal Code', '2020 Census Tract', 'Legislative District', 'Base MSRP']
# Drop them by label. Handing the label list to `columns=` is the direct form;
# the previous df.drop(df[nouse], axis=1) built a throw-away sub-frame and
# relied on pandas iterating it to recover the very same column names.
dta = df.drop(columns=nouse)
(2)删除缺失值
# Remove incomplete rows from `dta`, the frame every later step works on.
# Cleaning `df` at this point would have no effect on `dta`, which was already
# derived from it. The value counts printed further down add up to 181055,
# the non-null count of 'Vehicle Location', i.e. they were computed on the
# cleaned data.
dta = dta.dropna()
2.重命名简化内容
先看复杂列名里有什么:
# Look at the distinct values of the long-named CAFV eligibility column before simplifying it.
dta['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].value_counts()
结果:
Clean Alternative Fuel Vehicle (CAFV) Eligibility
Eligibility unknown as battery range has not been researched 94566
Clean Alternative Fuel Vehicle Eligible 66646
Not eligible due to low battery range 19843
# Shorten the unwieldy column name (dict maps old name -> new name).
dta.rename(columns={'Clean Alternative Fuel Vehicle (CAFV) Eligibility': 'CAFV'}, inplace=True)

# Short labels for the two eligibility sentences that get their own category;
# every other value is labelled 'NOT'.
_cafv_labels = {
    'Eligibility unknown as battery range has not been researched': 'unknown',
    'Clean Alternative Fuel Vehicle Eligible': 'CAFV',
}
dta['isCAFV'] = dta['CAFV'].map(lambda status: _cafv_labels.get(status, 'NOT'))
dta['isCAFV'].value_counts()  # sanity check of the new labels
另一列 dta['Electric Vehicle Type'] 同理(简化结果存入 dta['PowerType'],后文“动力与清洁模式”一节会用到)。
datetime
# Datetime practice (this step is optional): parse 'Model Year' as a date and
# read the year back out. The values carry nothing but a year, so the format
# has to be declared explicitly.
dta['year']=pd.to_datetime(dta['Model Year'],format='%Y').dt.year
二、可视化
1.电车拥有量前十的县
希望在柱状图里嵌套个饼状图(后期)
from pyecharts.charts import Bar

# The ten counties with the most registered EVs.
county_top = vcounts(dta['County'])[0:10]
county_top
# (county, count) pairs, the shape the pie chart's data argument takes.
county_top_pair = list(county_top.items())
county_top_pair

county_bar = Bar(
    init_opts=opts.InitOpts(
        bg_color='rgb(255,255,255)',
        # Author's note: the default canvas is 900*500px; 50px wider so the
        # mark-line label is shown in full.
        width='950px',
        animation_opts=opts.AnimationOpts(animation=False),
    )
)
county_bar.add_xaxis(county_top.index.tolist())
county_bar.add_yaxis('', county_top.values.tolist())
county_bar.set_global_opts(
    # Title text style is nested: TitleOpts -> title_textstyle_opts -> TextStyleOpts.
    title_opts=opts.TitleOpts(
        title='电车拥有量前十的县',
        pos_left='center',
        title_textstyle_opts=opts.TextStyleOpts(font_size=24),
    ),
    # Axis label options are nested: AxisOpts -> axislabel_opts -> LabelOpts.
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=15)),
)
county_bar.set_series_opts(
    # Where the value labels are drawn.
    label_opts=opts.LabelOpts(position='top', font_size=14),
    # Giving the mark line a name and a formatter makes its meaning visible.
    markline_opts=opts.MarkLineOpts(
        data=[opts.MarkLineItem(name='平均值:', type_='average')],
        label_opts=opts.LabelOpts(formatter='{b}:{c}'),
    ),
)
county_bar.render('county_bar.html')
from pyecharts.charts import Pie

# No white background: this pie is meant to be pasted on top of the bar chart.
county_pie = Pie(
    init_opts=opts.InitOpts(animation_opts=opts.AnimationOpts(animation=False))
)
# start_angle is set so the label lines do not sit too close to the legend.
county_pie.add('', county_top_pair, start_angle=110, radius=['20%', '50%'])
# Legend placement was tried (pos_left='80%', pos_top="50%", orient='vertical')
# and abandoned; the legend is simply switched off.
county_pie.set_global_opts(legend_opts=opts.LegendOpts(is_show=False))
# Label display format.
county_pie.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{d}%'))
2.三大电车城市
# The three cities with the most registered EVs, as (city, count) pairs.
city = vcounts(dta['City'])[0:3]
city_pair = list(city.items())
city_pair

city_pie = Pie(
    init_opts=opts.InitOpts(
        bg_color='rgb(255,255,255)',
        animation_opts=opts.AnimationOpts(animation=False),
    )
)
city_pie.add(
    '',
    data_pair=city_pair,
    radius=['20%', '60%'],
    # Author's note: with 'radius', both area and radius reflect the size relation.
    rosetype='radius',
    # `length` adjusts the label-line segment next to the pie.
    label_line_opts=opts.PieLabelLineOpts(length=10),
)
city_pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title='华盛顿三大电车城市',
        pos_left='center',
        title_textstyle_opts=opts.TextStyleOpts(font_size=24),
    ),
    legend_opts=opts.LegendOpts(is_show=False),
)
# Label shows name, value and percentage; the '%' sign is added by hand.
city_pie.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c},{d}%'))
3.历年电车数量
from pyecharts.charts import Line

# Vehicle count per model year; sort_index() puts the years in chronological order.
model = vcounts(dta['year']).sort_index()
model

# The original snippet ended with one closing parenthesis too many, which is a
# SyntaxError; the expression below is balanced.
year_line = (
    Line(
        init_opts=opts.InitOpts(
            bg_color='rgb(255,255,255)',
            width='1000px',
            animation_opts=opts.AnimationOpts(animation=False),
        )
    )
    # astype(str) makes sure the x axis receives strings.
    .add_xaxis(model.index.astype(str).tolist())
    .add_yaxis(
        '',
        model.values.tolist(),
        markpoint_opts=opts.MarkPointOpts(
            data=[opts.MarkPointItem(type_='min'), opts.MarkPointItem(type_='max')],
            # Author's note: a marker size of 80 covers the value label and stays legible.
            symbol_size=80,
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='1997-2024华盛顿州电车持有量',
            pos_left='center',
            title_textstyle_opts=opts.TextStyleOpts(font_size=24),
        ),
        xaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=False)),
    )
)
4.电车市场品牌分析
# The ten makes with the most registered vehicles.
band = vcounts(dta['Make'])[0:10]
band

band_bar = Bar(
    init_opts=opts.InitOpts(
        bg_color='#fff',
        width='1500px',
        animation_opts=opts.AnimationOpts(animation=False),
    )
)
band_bar.add_xaxis(band.index.tolist())
band_bar.add_yaxis(
    '',
    band.values.tolist(),
    color='limegreen',
    markline_opts=opts.MarkLineOpts(
        data=[opts.MarkLineItem(name='平均值', type_='average')],
        # This label_opts belongs to the mark line, not to the series.
        label_opts=opts.LabelOpts(formatter='{b}:{c}'),
    ),
)
band_bar.set_global_opts(
    title_opts=opts.TitleOpts(
        title='十大最受欢迎电车品牌',
        pos_left='center',
        title_textstyle_opts=opts.TextStyleOpts(font_size=24),
    ),
    # Author's note: width together with max_ (= n - 1) lets every x label
    # show in full without rotating them.
    xaxis_opts=opts.AxisOpts(
        splitline_opts=opts.SplitLineOpts(is_show=False),
        max_=9,
        # Arrow head at the far end of the axis line.
        axisline_opts=opts.AxisLineOpts(symbol=['none', 'arrow']),
    ),
    yaxis_opts=opts.AxisOpts(
        axisline_opts=opts.AxisLineOpts(symbol=['none', 'arrow']),
    ),
)
band_bar.set_series_opts(label_opts=opts.LabelOpts(position='top', font_size=14))
5.特斯拉的发展(使用overlap)
# Boolean filter: Tesla rows only.
masike = dta.loc[dta['Make'] == 'TESLA']
# Vehicles per model year, chronological; only the first 12 years are kept
# (author's note: data is missing after 2021).
masike_che = vcounts(masike['year']).sort_index().iloc[:12]
# Mean electric range per model year for the same 12 leading years, rounded to whole numbers.
masike_range = group_mean(masike, 'year', 'Electric Range').iloc[:12].round(0)
# Year labels as strings for the shared x axis.
x = masike_che.index.astype(str).tolist()
x
from pyecharts.commons.utils import JsCode
from pyecharts.charts import Line

masike_line = Line(
    init_opts=opts.InitOpts(
        bg_color='rgb(255,255,255)',
        animation_opts=opts.AnimationOpts(animation=False),
        width='1000px',
    )
)
masike_line.add_xaxis(x)
masike_line.add_yaxis(
    '平均电池里程',
    masike_range.values.tolist(),
    markpoint_opts=opts.MarkPointOpts(
        data=[opts.MarkPointItem(type_='max', symbol_size=80)]
    ),
)
# extend_axis adds a second y axis and names it.
masike_line.extend_axis(yaxis=opts.AxisOpts(name='销量:辆', type_='value'))
masike_line.set_global_opts(
    title_opts=opts.TitleOpts(
        title='2008-2020年特斯拉销量与平均电池里程',
        pos_left='center',
        title_textstyle_opts=opts.TextStyleOpts(font_size=24),
    ),
    xaxis_opts=opts.AxisOpts(name='时间:年', name_location='middle', name_gap=18),
    # max_ leaves some headroom so the mark point does not cover other elements.
    # NOTE(review): the axis is labelled km, but the data dictionary does not
    # state the unit of 'Electric Range' - confirm against the dataset.
    yaxis_opts=opts.AxisOpts(
        name='里程:km',
        splitline_opts=opts.SplitLineOpts(is_show=False),
        max_=350,
    ),
    # Move the legend down so it does not overlap the title and keeps some
    # distance from the labels.
    legend_opts=opts.LegendOpts(pos_top='8%'),
)
# JavaScript callback evaluated by the chart for every bar: the bar whose
# category name equals 2018 is drawn crimson, all others lime green.
color_func='''
function (params) {
if (params.name==2018){
return 'crimson';
}
return 'limegreen';
}
'''

masike_bar = Bar(
    init_opts=opts.InitOpts(animation_opts=opts.AnimationOpts(animation=False))
)
masike_bar.add_xaxis(x)
masike_bar.add_yaxis(
    '销量',
    masike_che.values.tolist(),
    # Index 1 is passed so the bars use the second y axis.
    yaxis_index=1,
    itemstyle_opts=opts.ItemStyleOpts(color=JsCode(color_func)),
)
masike_bar.set_series_opts(label_opts=opts.LabelOpts(position='top', font_size=14))

# masike_line.overlap(masike_bar).render('maike_bar_line.html') # [overlap can only be called once!]
# Two conclusions from comparing the variants:
# 1. the line is always drawn on the layer below the bars
# 2. when the two y ranges differ greatly, use the [smaller] one as the primary axis
6.动力与清洁模式
# (label, count) pairs for the two categorical columns.
counts_power = vcounts(dta.PowerType)
counts_power_pair = list(counts_power.items())
counts_power_pair
counts_clean = vcounts(dta.isCAFV)
counts_clean_pair = list(counts_clean.items())
counts_clean_pair

power_clean = Pie(
    init_opts=opts.InitOpts(
        animation_opts=opts.AnimationOpts(animation=False),
        # SHINE: a bright theme (author's note).
        theme=ThemeType.SHINE,
        bg_color="#fff",
    )
)
# Two rings side by side: power type on the left, CAFV status on the right.
power_clean.add(
    '',
    counts_power_pair,
    radius=['10%', '40%'],
    center=['30%', '40%'],
    itemstyle_opts=opts.ItemStyleOpts(border_color='#fff'),
)
power_clean.add(
    '',
    counts_clean_pair,
    radius=['10%', '40%'],
    center=['70%', '40%'],
    itemstyle_opts=opts.ItemStyleOpts(border_color='#fff'),
)
power_clean.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    title_opts=opts.TitleOpts(
        title='动力vs清洁能源',
        pos_left='center',
        # font_weight controls how heavy the title text is.
        title_textstyle_opts=opts.TextStyleOpts(font_weight='bold', font_size=24),
    ),
)
# The line break in the formatter is there for looks.
power_clean.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:\n {c}, {d}%"))