龙珠训练营python-Pandas揭秘美国选民的总统喜好

阿里天池龙珠训练营python数据分析项目

用Pandas揭秘美国选民的总统喜好

链接:https://tianchi.aliyun.com/competition/entrance/531837/introduction?spm=5176.19700039.J_9059755190.8.31b83ff4xsaTH1

import pandas as pd

# Column names for the pipe-delimited candidate summary file; passing them
# via `names` labels every column of the file.
candidate_columns = [
    'CAND_ID', 'CAND_NAME', 'CAND_ICI', 'PTY_CD', 'CAND_PTY_AFFILIATION',
    'TTL_RECEIPTS', 'TRANS_FROM_AUTH', 'TTL_DISB', 'TRANS_TO_AUTH', 'COH_BOP',
    'COH_COP', 'CAND_CONTRIB', 'CAND_LOANS', 'OTHER_LOANS', 'CAND_LOAN_REPAY',
    'OTHER_LOAN_REPAY', 'DEBTS_OWED_BY', 'TTL_INDIV_CONTRIB', 'CAND_OFFICE_ST',
    'CAND_OFFICE_DISTRICT', 'SPEC_ELECTION', 'PRIM_ELECTION', 'RUN_ELECTION',
    'GEN_ELECTION', 'GEN_ELECTION_PRECENT', 'OTHER_POL_CMTE_CONTRIB',
    'POL_PTY_CONTRIB', 'CVG_END_DT', 'INDIV_REFUNDS', 'CMTE_REFUNDS',
]
candidates = pd.read_csv(
    "./president_data/weball20.txt",
    sep='|',
    names=candidate_columns,
)

# Column names for the candidate-committee linkage file.
linkage_columns = [
    'CAND_ID', 'CAND_ELECTION_YR', 'FEC_ELECTION_YR', 'CMTE_ID',
    'CMTE_TP', 'CMTE_DSGN', 'LINKAGE_ID',
]
ccl = pd.read_csv("./president_data/ccl.txt", sep='|', names=linkage_columns)

# CAND_ID is the only column the two tables share, so it is the join key.
ccl = ccl.merge(candidates, on='CAND_ID')

# Keep only what later steps use:
# committee ID, candidate ID, candidate name, party affiliation.
ccl = ccl.reindex(
    columns=['CMTE_ID', 'CAND_ID', 'CAND_NAME', 'CAND_PTY_AFFILIATION']
)
ccl.head(10)
  | CMTE_ID | CAND_ID | CAND_NAME | CAND_PTY_AFFILIATION
0 | C00697789 | H0AL01055 | CARL, JERRY LEE, JR | REP
1 | C00701557 | H0AL01063 | LAMBERT, DOUGLAS WESTLEY III | REP
2 | C00701409 | H0AL01071 | PRINGLE, CHRISTOPHER PAUL | REP
3 | C00703066 | H0AL01089 | HIGHTOWER, BILL | REP
4 | C00708867 | H0AL01097 | AVERHART, JAMES | DEM
5 | C00710947 | H0AL01105 | GARDNER, KIANI A | DEM
6 | C00722512 | H0AL01121 | CASTORANI, JOHN | REP
7 | C00725069 | H0AL01139 | COLLINS, FREDERICK G. RICK' | DEM
8 | C00462143 | H0AL02087 | ROBY, MARTHA | REP
9 | C00493783 | H0AL02087 | ROBY, MARTHA | REP
# Column names for the pipe-delimited individual-contribution file.
contribution_columns = [
    'CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM',
    'TRANSACTION_TP', 'ENTITY_TP', 'NAME', 'CITY', 'STATE', 'ZIP_CODE',
    'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT', 'TRANSACTION_AMT',
    'OTHER_ID', 'TRAN_ID', 'FILE_NUM', 'MEMO_CD', 'MEMO_TEXT', 'SUB_ID',
]
itcont = pd.read_csv(
    "./president_data/itcont_2020_20200722_20200820.txt",
    sep='|',
    names=contribution_columns,
)

# Attach candidate details to each contribution; CMTE_ID is the only column
# the linkage table and the contribution table have in common.
citcont = ccl.merge(itcont, on='CMTE_ID')

# Keep: recipient candidate, donor name, donor state, employer, occupation,
# amount (USD), date the donation was received, candidate party.
citcont = citcont.reindex(
    columns=[
        'CAND_NAME', 'NAME', 'STATE', 'EMPLOYER', 'OCCUPATION',
        'TRANSACTION_AMT', 'TRANSACTION_DT', 'CAND_PTY_AFFILIATION',
    ]
)
citcont.head(10)
  | CAND_NAME | NAME | STATE | EMPLOYER | OCCUPATION | TRANSACTION_AMT | TRANSACTION_DT | CAND_PTY_AFFILIATION
0 | MORGAN, JOSEPH DAVID | MARTIN, WILLIAM II | AZ | RETIRED | RETIRED | 100 | 7242020 | REP
1 | MORGAN, JOSEPH DAVID | RODRIGUEZ, GERARDO | AZ | VA HOSPITAL | LAB TECH | 40 | 7242020 | REP
2 | MORGAN, JOSEPH DAVID | RODRIGUEZ, GERARDO | AZ | VA HOSPITAL | LAB TECH | 40 | 7312020 | REP
3 | WOOD, DANIEL | HOPKINS, RICHARD | AZ | POWERS-LEAVITT | INSURANCE AGENT | 300 | 8102020 | REP
4 | WOOD, DANIEL | PENDLETON, DIANE | AZ | UNEMPLOYED | NaN | 500 | 8072020 | REP
5 | WOOD, DANIEL | PREVATT, WILLIAM | AZ | SELF-EMPLOYED | DVM | 500 | 7312020 | REP
6 | WOOD, DANIEL | HARDING, DOUG | AZ | MICROSURE | OPERATIONS MANAGER | 2800 | 8102020 | REP
7 | WOOD, DANIEL | HARDING, MARI | AZ | NaN | NaN | 1400 | 8152020 | REP
8 | WOOD, DANIEL | HEDGER, CYNTHIA | TX | NaN | NaN | 200 | 7312020 | REP
9 | HUANG, PEGGY | HUANG - PERSONAL FUNDS, PEGGY | CA | OFFICE OF THE ATTORNEY GENERAL | DEPUTY ATTORNEY GENERAL | 2600 | 7252020 | REP
# Inspect the merged donation table: its (rows, columns) shape, then the
# per-column non-null counts and dtypes printed by info().
citcont.shape
citcont.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 756205 entries, 0 to 756204
Data columns (total 8 columns):
CAND_NAME               756205 non-null object
NAME                    756205 non-null object
STATE                   756160 non-null object
EMPLOYER                737413 non-null object
OCCUPATION              741294 non-null object
TRANSACTION_AMT         756205 non-null int64
TRANSACTION_DT          756205 non-null int64
CAND_PTY_AFFILIATION    756205 non-null object
dtypes: int64(2), object(6)
memory usage: 51.9+ MB

# Replace missing employer/occupation values with a text placeholder.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection, which newer pandas versions
# warn about and which may not modify the parent frame.
citcont['EMPLOYER'] = citcont['EMPLOYER'].fillna('NO PROVIDED')
citcont['OCCUPATION'] = citcont['OCCUPATION'].fillna('NO PROVIDED')
# TRANSACTION_AMT is numeric: a missing amount is filled with 0 instead of
# the text placeholder so the column keeps a numeric dtype and can be summed.
citcont['TRANSACTION_AMT'] = citcont['TRANSACTION_AMT'].fillna(0)

# Convert the date from month-day-year digits to year-month-day digits.
# The column is read as an integer, so a leading zero of the month is lost
# (e.g. 7242020); zero-padding back to 8 characters before slicing keeps
# one- and two-digit months correct and yields a fixed-width YYYYMMDD string
# (7242020 -> "20200724") that sorts chronologically.
padded_dt = citcont['TRANSACTION_DT'].astype(str).str.zfill(8)
citcont['TRANSACTION_DT'] = padded_dt.str[4:] + padded_dt.str[:4]
citcont.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 756205 entries, 0 to 756204
Data columns (total 8 columns):
CAND_NAME               756205 non-null object
NAME                    756205 non-null object
STATE                   756160 non-null object
EMPLOYER                756205 non-null object
OCCUPATION              756205 non-null object
TRANSACTION_AMT         756205 non-null int64
TRANSACTION_DT          756205 non-null object
CAND_PTY_AFFILIATION    756205 non-null object
dtypes: int64(1), object(7)
memory usage: 51.9+ MB
# Summary statistics: describe() on the whole frame, then on the
# candidate-name column alone.
citcont.describe()
citcont['CAND_NAME'].describe()
      | TRANSACTION_AMT
count | 7.562050e+05
mean  | 1.504307e+02
std   | 2.320452e+03
min   | -5.600000e+03
25%   | 2.000000e+01
50%   | 3.500000e+01
75%   | 1.000000e+02
max   | 1.500000e+06
# Group the data by candidate party and show the ten parties with the
# largest total donation amount.
# TRANSACTION_AMT is selected before summing so the text columns are never
# aggregated; a bare groupby(...).sum() depends on pandas silently dropping
# non-numeric columns, which recent versions no longer do.
citcont.groupby("CAND_PTY_AFFILIATION")[["TRANSACTION_AMT"]].sum().sort_values("TRANSACTION_AMT", ascending=False).head(10)
CAND_PTY_AFFILIATION | TRANSACTION_AMT
DEM | 75961730
REP | 37170653
IND | 328802
LIB | 169202
DFL | 76825
GRE | 18607
NON | 11256
UNK | 10195
CON | 4117
BDY | 3250
# Ten candidates with the largest total donation amount.
# TRANSACTION_AMT is selected before summing so only the numeric amount is
# aggregated, independent of how pandas treats text columns in sum().
citcont.groupby("CAND_NAME")[["TRANSACTION_AMT"]].sum().sort_values("TRANSACTION_AMT", ascending=False).head(10)
CAND_NAME | TRANSACTION_AMT
BIDEN, JOSEPH R JR | 68111142
TRUMP, DONALD J. | 16594982
SULLIVAN, DAN | 9912465
JACOBS, CHRISTOPHER L. | 6939209
BLOOMBERG, MICHAEL R. | 3451916
MARKEY, EDWARD J. SEN. | 606832
SHAHEEN, JEANNE | 505446
KENNEDY, JOSEPH P III | 467738
CORNYN, JOHN SEN | 345959
FIGLESTHALER, WILLIAM MATTHEW MD | 258221
# Ten donor occupations with the largest total donation amount.
# TRANSACTION_AMT is selected before summing so only the numeric amount is
# aggregated, independent of how pandas treats text columns in sum().
citcont.groupby("OCCUPATION")[["TRANSACTION_AMT"]].sum().sort_values("TRANSACTION_AMT", ascending=False).head(10)

OCCUPATION | TRANSACTION_AMT
NOT EMPLOYED | 24436214
RETIRED | 18669950
NO PROVIDED | 5086555
ATTORNEY | 4443569
FOUNDER | 3519109
PHYSICIAN | 3295595
CONSULTANT | 1647033
LAWYER | 1565976
PROFESSOR | 1481260
EXECUTIVE | 1467865
# Count donation records per donor state and show the five most frequent.
# Each row is one donation, so a donor who gave several times is counted
# once per donation.
state_counts = citcont["STATE"].value_counts()
state_counts.iloc[:5]
CA    127895
TX     54457
FL     54343
NY     49453
MA     29314
Name: STATE, dtype: int64


# 导入matplotlib中的pyplot
import matplotlib.pyplot as plt
# 为了使matplotlib图形能够内联显示
%matplotlib inline
# 导入词云库
from wordcloud import WordCloud,ImageColorGenerator

# Bar chart of the ten states with the largest total donation amount.
# Only TRANSACTION_AMT is aggregated; summing every column and discarding
# the others afterwards is wasted work and depends on how pandas handles
# text columns in sum().
res = (
    citcont.groupby("STATE")[["TRANSACTION_AMT"]]
    .sum()
    .sort_values("TRANSACTION_AMT", ascending=False)
    .head(10)
)
res.plot(kind='bar')

 
# Bar chart of the ten states with the most donation records.
donations_per_state = citcont.groupby("STATE").size()
res1 = donations_per_state.sort_values(ascending=False).iloc[:10]
res1.plot.bar()

 
# Pie chart of Biden's donation totals by state.
# Only the ten largest states are kept, so the percentages are shares of
# that top-ten total, not of all of his donations.
biden = citcont[citcont["CAND_NAME"] == 'BIDEN, JOSEPH R JR']
# Aggregate just TRANSACTION_AMT: with subplots=True one pie is drawn per
# column, so any non-numeric column left in the frame would break the plot.
biden_state = (
    biden.groupby("STATE")[["TRANSACTION_AMT"]]
    .sum()
    .sort_values("TRANSACTION_AMT", ascending=False)
    .head(10)
)
biden_state.plot.pie(figsize=(10,10),autopct='%0.2f%%',subplots=True)
 

 
# Join every donor name in the Biden subset into one space-separated string.
data = ' '.join(biden["NAME"].tolist())
# Load the picture that serves both as the cloud's mask and as its colour
# source (same file as before; the redundant "./" repetitions are removed).
bg = plt.imread("./president_data/biden.jpg")
# Build the word cloud.
wc = WordCloud(
    background_color="white",  # white background (the default is black)
    # NOTE(review): per the wordcloud docs, width and height are ignored when
    # a mask is supplied and the mask's shape is used instead; kept as-is.
    width=890,
    height=600,
    mask=bg,  # draw words inside the shape given by the image
    margin=10,  # spacing around words
    max_font_size=100,  # largest font size used
    random_state=20,  # fixed seed so the layout is reproducible
).generate_from_text(data)
# Colour each word from the corresponding region of the image.
bg_color = ImageColorGenerator(bg)
plt.imshow(wc.recolor(color_func=bg_color))
# Hide the axes around the cloud.
plt.axis("off")
# Display the figure.
plt.show()
# To save the cloud to disk: wc.to_file("biden_wordcloud.png")

 

 
import seaborn as sns
# Heatmap of total donation amount for the ten highest-donating states.
# Only TRANSACTION_AMT is aggregated so the frame passed to the heatmap is
# purely numeric regardless of how pandas treats text columns in sum().
res = (
    citcont.groupby("STATE")[["TRANSACTION_AMT"]]
    .sum()
    .sort_values("TRANSACTION_AMT", ascending=False)
    .head(10)
)
sns.heatmap(res,cmap='Reds')
plt.show()
 
# Daily donation totals over time for the two candidates who received the
# most money.
biden = citcont[citcont["CAND_NAME"] == 'BIDEN, JOSEPH R JR']
trump = citcont[citcont["CAND_NAME"] == 'TRUMP, DONALD J.']
# Order the daily totals by date, not by amount, so the bars read as a
# trend. Sorting is on the date strings, so it is chronological as long as
# they all have the same width (true for this one-digit-month file).
# Only TRANSACTION_AMT is aggregated so text columns never reach the plot.
biden_tot = biden.groupby("TRANSACTION_DT")[["TRANSACTION_AMT"]].sum().sort_index()
trump_tot = trump.groupby("TRANSACTION_DT")[["TRANSACTION_AMT"]].sum().sort_index()
biden_tot.plot(kind='bar')
trump_tot.plot(kind='bar')

 
 
 
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值