阿里天池龙珠训练营python数据分析项目
用Pandas揭秘美国选民的总统喜好
链接https://tianchi.aliyun.com/competition/entrance/531837/introduction?spm=5176.19700039.J_9059755190.8.31b83ff4xsaTH1
import pandas as pd
#读取并命名 names为每列数据命名
candidates = pd.read_csv("./president_data/weball20.txt",sep = '|',names=['CAND_ID','CAND_NAME','CAND_ICI','PTY_CD','CAND_PTY_AFFILIATION','TTL_RECEIPTS',
'TRANS_FROM_AUTH','TTL_DISB','TRANS_TO_AUTH','COH_BOP','COH_COP','CAND_CONTRIB',
'CAND_LOANS','OTHER_LOANS','CAND_LOAN_REPAY','OTHER_LOAN_REPAY','DEBTS_OWED_BY',
'TTL_INDIV_CONTRIB','CAND_OFFICE_ST','CAND_OFFICE_DISTRICT','SPEC_ELECTION','PRIM_ELECTION','RUN_ELECTION'
,'GEN_ELECTION','GEN_ELECTION_PRECENT','OTHER_POL_CMTE_CONTRIB','POL_PTY_CONTRIB',
'CVG_END_DT','INDIV_REFUNDS','CMTE_REFUNDS'])
ccl = pd.read_csv("./president_data/ccl.txt",sep='|',names=['CAND_ID','CAND_ELECTION_YR','FEC_ELECTION_YR','CMTE_ID','CMTE_TP','CMTE_DSGN','LINKAGE_ID'])
ccl = pd.merge(ccl,candidates)
#提取需要的列使用pandas中的DataFrame结构
#委员会ID 候选人ID 姓名 党派
ccl = pd.DataFrame(ccl,columns=['CMTE_ID','CAND_ID', 'CAND_NAME','CAND_PTY_AFFILIATION'])
ccl.head(10)
| CMTE_ID | CAND_ID | CAND_NAME | CAND_PTY_AFFILIATION |
---|
0 | C00697789 | H0AL01055 | CARL, JERRY LEE, JR | REP |
---|
1 | C00701557 | H0AL01063 | LAMBERT, DOUGLAS WESTLEY III | REP |
---|
2 | C00701409 | H0AL01071 | PRINGLE, CHRISTOPHER PAUL | REP |
---|
3 | C00703066 | H0AL01089 | HIGHTOWER, BILL | REP |
---|
4 | C00708867 | H0AL01097 | AVERHART, JAMES | DEM |
---|
5 | C00710947 | H0AL01105 | GARDNER, KIANI A | DEM |
---|
6 | C00722512 | H0AL01121 | CASTORANI, JOHN | REP |
---|
7 | C00725069 | H0AL01139 | COLLINS, FREDERICK G. RICK' | DEM |
---|
8 | C00462143 | H0AL02087 | ROBY, MARTHA | REP |
---|
9 | C00493783 | H0AL02087 | ROBY, MARTHA | REP |
---|
itcont = pd.read_csv("./president_data/itcont_2020_20200722_20200820.txt",sep='|',names=['CMTE_ID','AMNDT_IND','RPT_TP','TRANSACTION_PGI',
'IMAGE_NUM','TRANSACTION_TP','ENTITY_TP','NAME','CITY',
'STATE','ZIP_CODE','EMPLOYER','OCCUPATION','TRANSACTION_DT',
'TRANSACTION_AMT','OTHER_ID','TRAN_ID','FILE_NUM','MEMO_CD',
'MEMO_TEXT','SUB_ID'])
citcont = pd.merge(ccl,itcont)
#接受捐赠的候选人姓名 捐赠人姓名 所在州 所在公司 职业 数额(美元) 收到捐款的日期 候选人党派
citcont = pd.DataFrame(citcont,columns=['CAND_NAME','NAME', 'STATE','EMPLOYER','OCCUPATION',
'TRANSACTION_AMT', 'TRANSACTION_DT','CAND_PTY_AFFILIATION'])
citcont.head(10)
| CAND_NAME | NAME | STATE | EMPLOYER | OCCUPATION | TRANSACTION_AMT | TRANSACTION_DT | CAND_PTY_AFFILIATION |
---|
0 | MORGAN, JOSEPH DAVID | MARTIN, WILLIAM II | AZ | RETIRED | RETIRED | 100 | 7242020 | REP |
---|
1 | MORGAN, JOSEPH DAVID | RODRIGUEZ, GERARDO | AZ | VA HOSPITAL | LAB TECH | 40 | 7242020 | REP |
---|
2 | MORGAN, JOSEPH DAVID | RODRIGUEZ, GERARDO | AZ | VA HOSPITAL | LAB TECH | 40 | 7312020 | REP |
---|
3 | WOOD, DANIEL | HOPKINS, RICHARD | AZ | POWERS-LEAVITT | INSURANCE AGENT | 300 | 8102020 | REP |
---|
4 | WOOD, DANIEL | PENDLETON, DIANE | AZ | UNEMPLOYED | NaN | 500 | 8072020 | REP |
---|
5 | WOOD, DANIEL | PREVATT, WILLIAM | AZ | SELF-EMPLOYED | DVM | 500 | 7312020 | REP |
---|
6 | WOOD, DANIEL | HARDING, DOUG | AZ | MICROSURE | OPERATIONS MANAGER | 2800 | 8102020 | REP |
---|
7 | WOOD, DANIEL | HARDING, MARI | AZ | NaN | NaN | 1400 | 8152020 | REP |
---|
8 | WOOD, DANIEL | HEDGER, CYNTHIA | TX | NaN | NaN | 200 | 7312020 | REP |
---|
9 | HUANG, PEGGY | HUANG - PERSONAL FUNDS, PEGGY | CA | OFFICE OF THE ATTORNEY GENERAL | DEPUTY ATTORNEY GENERAL | 2600 | 7252020 | REP |
---|
citcont.shape
citcont.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 756205 entries, 0 to 756204
Data columns (total 8 columns):
CAND_NAME 756205 non-null object
NAME 756205 non-null object
STATE 756160 non-null object
EMPLOYER 737413 non-null object
OCCUPATION 741294 non-null object
TRANSACTION_AMT 756205 non-null int64
TRANSACTION_DT 756205 non-null int64
CAND_PTY_AFFILIATION 756205 non-null object
dtypes: int64(2), object(6)
memory usage: 51.9+ MB
#inplace True表示直接修改源对象
citcont['EMPLOYER'].fillna('NO PROVIDED',inplace=True)
citcont['OCCUPATION'].fillna('NO PROVIDED',inplace=True)
citcont['TRANSACTION_AMT'].fillna('NO PROVIDED',inplace=True)
#将日期7202020调整为2020720 要记得最后加[]再次形成列表
citcont['TRANSACTION_DT'] = citcont['TRANSACTION_DT'].astype(str)
citcont['TRANSACTION_DT'] =[ i[3:7]+i[0]+i[1:3] for i in citcont['TRANSACTION_DT'] ]
citcont.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 756205 entries, 0 to 756204
Data columns (total 8 columns):
CAND_NAME 756205 non-null object
NAME 756205 non-null object
STATE 756160 non-null object
EMPLOYER 756205 non-null object
OCCUPATION 756205 non-null object
TRANSACTION_AMT 756205 non-null int64
TRANSACTION_DT 756205 non-null object
CAND_PTY_AFFILIATION 756205 non-null object
dtypes: int64(1), object(7)
memory usage: 51.9+ MB
citcont.describe()
citcont['CAND_NAME'].describe()
| TRANSACTION_AMT |
---|
count | 7.562050e+05 |
---|
mean | 1.504307e+02 |
---|
std | 2.320452e+03 |
---|
min | -5.600000e+03 |
---|
25% | 2.000000e+01 |
---|
50% | 3.500000e+01 |
---|
75% | 1.000000e+02 |
---|
max | 1.500000e+06 |
---|
#按照领导人姓名、受赠团体等分组观察数据
citcont.groupby("CAND_PTY_AFFILIATION").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
| TRANSACTION_AMT |
---|
CAND_PTY_AFFILIATION | |
---|
DEM | 75961730 |
---|
REP | 37170653 |
---|
IND | 328802 |
---|
LIB | 169202 |
---|
DFL | 76825 |
---|
GRE | 18607 |
---|
NON | 11256 |
---|
UNK | 10195 |
---|
CON | 4117 |
---|
BDY | 3250 |
---|
citcont.groupby("CAND_NAME").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
| TRANSACTION_AMT |
---|
CAND_NAME | |
---|
BIDEN, JOSEPH R JR | 68111142 |
---|
TRUMP, DONALD J. | 16594982 |
---|
SULLIVAN, DAN | 9912465 |
---|
JACOBS, CHRISTOPHER L. | 6939209 |
---|
BLOOMBERG, MICHAEL R. | 3451916 |
---|
MARKEY, EDWARD J. SEN. | 606832 |
---|
SHAHEEN, JEANNE | 505446 |
---|
KENNEDY, JOSEPH P III | 467738 |
---|
CORNYN, JOHN SEN | 345959 |
---|
FIGLESTHALER, WILLIAM MATTHEW MD | 258221 |
---|
citcont.groupby("OCCUPATION").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
| TRANSACTION_AMT |
---|
OCCUPATION | |
---|
NOT EMPLOYED | 24436214 |
---|
RETIRED | 18669950 |
---|
NO PROVIDED | 5086555 |
---|
ATTORNEY | 4443569 |
---|
FOUNDER | 3519109 |
---|
PHYSICIAN | 3295595 |
---|
CONSULTANT | 1647033 |
---|
LAWYER | 1565976 |
---|
PROFESSOR | 1481260 |
---|
EXECUTIVE | 1467865 |
---|
#查看每个州捐款人的数量
citcont["STATE"].value_counts().head(5)
CA 127895
TX 54457
FL 54343
NY 49453
MA 29314
Name: STATE, dtype: int64
# 导入matplotlib中的pyplot
import matplotlib.pyplot as plt
# 为了使matplotlib图形能够内联显示
%matplotlib inline
# 导入词云库
from wordcloud import WordCloud,ImageColorGenerator
#各州的捐赠数排名
res = citcont.groupby("STATE").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
res = pd.DataFrame(res,columns=["TRANSACTION_AMT"])
res.plot(kind='bar')
#各个州的捐赠次数排名
res1 = citcont.groupby("STATE").size().sort_values(ascending=False).head(10)
res1.plot(kind='bar')
#拜登获得的捐赠在各个州的占比
biden = citcont[citcont["CAND_NAME"] == 'BIDEN, JOSEPH R JR']
biden_state = biden.groupby("STATE").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
biden_state.plot.pie(figsize=(10,10),autopct='%0.2f%%',subplots=True)
# 将所有捐赠者姓名连接成一个字符串
data = ' '.join(biden["NAME"].tolist())
# 读取图片文件
bg = plt.imread("././././././././president_data/biden.jpg")
# 生成
wc = WordCloud(# FFFAE3
background_color="white", # 设置背景为白色,默认为黑色
width=890, # 设置图片的宽度
height=600, # 设置图片的高度
mask=bg, # 画布
margin=10, # 设置图片的边缘
max_font_size=100, # 显示的最大的字体大小
random_state=20, # 为每个单词返回一个PIL颜色
).generate_from_text(data)
# 图片背景
bg_color = ImageColorGenerator(bg)
# 开始画图
plt.imshow(wc.recolor(color_func=bg_color))
# 为云图去掉坐标轴
plt.axis("off")
# 画云图,显示
# 保存云图
#wc.to_file("biden_wordcloud.png")
import seaborn as sns
res = citcont.groupby("STATE").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
sns.heatmap(res,cmap='Reds')
plt.show()
#收到捐赠额最多的两位候选人的总捐赠额变化趋势
biden = citcont[citcont["CAND_NAME"] == 'BIDEN, JOSEPH R JR']
trump = citcont[citcont["CAND_NAME"] == 'TRUMP, DONALD J.']
biden_tot = biden.groupby("TRANSACTION_DT").sum().sort_values("TRANSACTION_AMT",ascending=False)
trump_tot = trump.groupby("TRANSACTION_DT").sum().sort_values("TRANSACTION_AMT",ascending=False)
biden_tot.plot(kind='bar')
trump_tot.plot(kind='bar')