1 引入
主题:美国2020年各县总统选举数据。
来源:https://www.kaggle.com/datasets/unanimad/us-election-2020
目的:分别分析县与州级别的选举结果。
2 分析
2.1 库引入
import numpy as np
import pandas as pd
import datetime as dt
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
2.2 总票数
"""读取县级别选取文件"""
# Load the county-level candidate results.
president_county_candidate = pd.read_csv("D:/Data/test/us_election/president_county_candidate.csv")

# Total votes per candidate: group by candidate, sum the "total_votes" column,
# then reset the index so "candidate" becomes an ordinary column again.
total_votes = (
    president_county_candidate
    .groupby("candidate")["total_votes"]
    .sum()
    .reset_index()
)

# Keep both orderings of the totals; the ascending one is the one plotted.
total_votes_ascend = total_votes.sort_values(by="total_votes", ascending=True)
total_votes_descend = total_votes.sort_values(by="total_votes", ascending=False)

# Bar chart of total votes, candidates on the y axis.
fig = px.bar(total_votes_ascend, y="candidate", x="total_votes")
fig.update_layout(height=800, template="simple_white")
fig.show()
输出如下:
当然,由于各候选人之间的得票数差异巨大,条形图中得票少的候选人几乎看不出来,所以换一种显示方式(横轴取对数的散点图):
"""换一种方式"""
def _vote_label(row):
    """Return '<candidate> <votes>' with thousands separators in the vote count."""
    return row["candidate"] + " " + format(row["total_votes"], ",")


# Attach a text label to every row of the ascending totals table.
total_votes_ascend["text"] = total_votes_ascend.apply(_vote_label, axis=1)
print(total_votes_ascend["text"])

# Scatter plot with a logarithmic x axis; each point carries its own label,
# so the y-axis tick labels are hidden.
fig = px.scatter(
    total_votes_ascend,
    x="total_votes",
    y="candidate",
    text="text",
    log_x=True,
)
fig.update_traces(textposition="middle right")
fig.update_layout(
    template="simple_white",
    height=800,
    yaxis={"showticklabels": False},
)
fig.show()
输出如下:
2.3 县级选举获胜数量
# Boolean mask marking the winning candidate row of each county.
filter_won = president_county_candidate["won"]
# Keep only the winners' rows.
county_won = president_county_candidate[filter_won]
# Number of counties won by each candidate, as a two-column table.
candidate_county_won = (
    county_won
    .groupby("candidate")["county"]
    .count()
    .reset_index()
)
print(candidate_county_won)
输出如下:
candidate county
0 Write-ins 3
1 Donald Trump 3219
2 Jo Jorgensen 1
3 Joe Biden 1410
川普赢得了一大半县的支持,却没有赢得普选😂
2.4 拜登川普得票数差异
拜登的:
# Keep only the rows for Joe Biden and Donald Trump.
biden_trump = president_county_candidate[president_county_candidate["candidate"].isin(["Joe Biden", "Donald Trump"])]
# Vote counts of the two candidates for every (state, county) pair,
# one column per candidate.
biden_trump = biden_trump.pivot_table(index=["state", "county"], columns="candidate", values="total_votes", aggfunc="sum")
# Each candidate's share of the combined Biden + Trump vote
# (the shared denominator is computed once).
two_candidate_votes = biden_trump["Donald Trump"] + biden_trump["Joe Biden"]
biden_trump["Donald Trump %"] = biden_trump["Donald Trump"] / two_candidate_votes
biden_trump["Joe Biden %"] = biden_trump["Joe Biden"] / two_candidate_votes
# Margin is Biden's share minus Trump's share: positive means Biden leads.
biden_trump["margin"] = biden_trump["Joe Biden %"] - biden_trump["Donald Trump %"]
# Biden's largest leads are the largest positive margins, so sort descending
# and print the sorted table (the unsorted one was printed before).
biden_margin = biden_trump.sort_values(by="margin", ascending=False)
print(biden_margin.head(10))
输出如下:
candidate Donald Trump Joe Biden ... Joe Biden % margin
state county ...
Alabama Autauga County 19838 7503 ... 0.274423 -0.451154
Baldwin County 83544 24578 ... 0.227317 -0.545365
Barbour County 5622 4816 ... 0.461391 -0.077218
Bibb County 7525 1986 ... 0.208811 -0.582378
Blount County 24711 2640 ... 0.096523 -0.806954
Bullock County 1146 3446 ... 0.750436 0.500871
Butler County 5458 3965 ... 0.420779 -0.158442
Calhoun County 35101 15216 ... 0.302403 -0.395194
Chambers County 8753 6365 ... 0.421021 -0.157957
Cherokee County 10583 1624 ... 0.133038 -0.733923
[10 rows x 5 columns]
川普的:
# "margin" is Biden's share minus Trump's share, so Trump's largest leads are
# the most negative values: sort ascending to list them first.
trump_margin = biden_trump.sort_values(by="margin", ascending=True)
print(trump_margin.head(10))
输出如下:
candidate Donald Trump ... margin
state county ...
Maine Hersey 0 ... 1.000000
New Hampshire Dixville 0 ... 1.000000
District of Columbia Ward 7 1134 ... 0.939546
Ward 8 1085 ... 0.932149
Ward 5 1769 ... 0.921533
District of Columbia 1725 ... 0.915371
Ward 4 1913 ... 0.913833
Massachusetts Cambridge 3519 ... 0.869065
Provincetown 182 ... 0.859133
District of Columbia Ward 6 4337 ... 0.857934
[10 rows x 5 columns]
2.5 绘制县级选举结果
首先需要再去下载一个美国县级信息数据:
https://www.kaggle.com/code/jjmewtw/us-elections-study-with-enriched-data-estimation?select=us_county.csv
# Load the county reference data (per the original note it includes
# latitude/longitude etc.); FIPS codes are read as text, not numbers.
county_fips = pd.read_csv("D:/Data/test/us_election/us_county.csv", dtype={"fips": "str"})
# Inner join on (state, county): only counties present in both tables remain.
president_county_fips = pd.merge(
    president_county_candidate,
    county_fips,
    how="inner",
    left_on=["state", "county"],
    right_on=["state", "county"],
)
# Colour label per row: "blue" for a winning DEM row, "red" for everything else.
# Vectorised instead of a row-wise apply.
dem_win = (president_county_fips["party"] == "DEM") & president_county_fips["won"]
president_county_fips["color"] = np.where(dem_win, "blue", "red")
# One row per county: the winner's row. Take an explicit copy so that adding a
# column below does not write into a slice of president_county_fips.
county_fips_color = president_county_fips[president_county_fips["won"]].copy()
# Optional: persist the winners table.
# county_fips_color.to_csv("D:/Data/test/us_election/president_counties.csv")
# The county GeoJSON is keyed by FIPS ids; left-pad the codes with zeros to
# five characters (missing values stay missing instead of raising).
county_fips_color["fips_2"] = county_fips_color["fips"].str.zfill(5)
# Download the county boundaries used by the map.
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
# Draw the map. The explicit colour map pins the "blue"/"red" labels to those
# actual colours; without it Plotly Express assigns its default palette to the
# labels in order of appearance, so the colours would depend on row order.
fig = px.choropleth(
    county_fips_color,
    geojson=counties,
    locations="fips_2",
    color="color",
    color_discrete_map={"blue": "blue", "red": "red"},
    scope="usa",
    hover_data=["state", "county", "candidate"],
)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0}, showlegend=False)
fig.show()
输出如下:
引用
【1】选举分析:https://www.kaggle.com/code/pauldesalvo/2020-election-analysis