2012联邦选举委员会数据库中赞助人和赞助模式的分析统计

本例为《利用Python进行数据分析》中第九章最后一节实例,数据来源Github
https://github.com/wesm/pydata-book/tree/2nd-edition/datasets/fec

#2012联邦选举委员会数据库中赞助人和赞助模式的分析统计
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
from numpy.random import randn,rand
import matplotlib.pyplot as plt
fec=pd.read_csv('P00000001-ALL.csv')
fec.head()
D:\application_setup\Anaconda\lib\site-packages\IPython\core\interactiveshell.py:3063: DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_num
0C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL3.6601e+08RETIREDRETIRED250.020-JUN-11NaNNaNNaNSA17A736166
1C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL3.6601e+08RETIREDRETIRED50.023-JUN-11NaNNaNNaNSA17A736166
2C00410118P20002978Bachmann, MichelleSMITH, LANIERLANETTAL3.68633e+08INFORMATION REQUESTEDINFORMATION REQUESTED250.005-JUL-11NaNNaNNaNSA17A749073
3C00410118P20002978Bachmann, MichelleBLEVINS, DARONDAPIGGOTTAR7.24548e+08NONERETIRED250.001-AUG-11NaNNaNNaNSA17A749073
4C00410118P20002978Bachmann, MichelleWARDENBURG, HAROLDHOT SPRINGS NATIONAR7.19016e+08NONERETIRED300.020-JUN-11NaNNaNNaNSA17A736166
fec.columns
Index(['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city',
       'contbr_st', 'contbr_zip', 'contbr_employer', 'contbr_occupation',
       'contb_receipt_amt', 'contb_receipt_dt', 'receipt_desc', 'memo_cd',
       'memo_text', 'form_tp', 'file_num'],
      dtype='object')
fec.iloc[123456]
cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object
unique_cands=fec.cand_nm.unique()
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
       'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
       'Huntsman, Jon', 'Perry, Rick'], dtype=object)
unique_cands[2]
'Obama, Barack'
# Party affiliation of each candidate, keyed by the exact `cand_nm` string.
# Every candidate appears once: a repeated key in a dict literal is silently
# collapsed by Python, which makes typing mistakes hard to spot.
# NOTE: 'Cain, Herman' is not part of this literal -- the steps further down
# detect the missing name and add it.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Romney, Mitt': 'Republican',
    'Gingrich, Newt': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Johnson, Gary Earl': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Obama, Barack': 'Democrat',
    'Paul, Ron': 'Republican',
    'Pawlenty, Timothy': 'Republican',
    'Perry, Rick': 'Republican',
    "Roemer, Charles E. 'Buddy' III": 'Republican',
    'Santorum, Rick': 'Republican',
}
#这里出现了一个问题:我辛辛苦苦把竞选人匹配党派敲出来以后,一检查,发现
#parties字典长度跟unique_cands的长度不匹配,这不是坑爹么,应该是少敲了一个
len(parties)
12
len(unique_cands)
13
# Collect the keys of `parties` (the candidate names) into a NumPy array.
cands = np.array(list(parties))

cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Gingrich, Newt',
       'Huntsman, Jon', 'Johnson, Gary Earl', 'McCotter, Thaddeus G',
       'Obama, Barack', 'Paul, Ron', 'Pawlenty, Timothy', 'Perry, Rick',
       "Roemer, Charles E. 'Buddy' III", 'Santorum, Rick'], dtype='<U30')
# np.isin() tests, element by element, whether each entry of the first array
# occurs in the second. It replaces np.in1d(), which NumPy 2.0 deprecates.
np.isin(unique_cands, cands)
array([ True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True])
# np.setdiff1d() returns the sorted unique values of the first array that are
# absent from the second, so the candidates missing from `cands` can be read
# off directly without building a membership mask first.
np.setdiff1d(unique_cands, cands)
array(['Cain, Herman'], dtype=object)
#把漏掉的加到parties字典上
parties['Cain, Herman']='Republican'
parties
{'Bachmann, Michelle': 'Republican',
 'Romney, Mitt': 'Republican',
 'Gingrich, Newt': 'Republican',
 'Huntsman, Jon': 'Republican',
 'Johnson, Gary Earl': 'Republican',
 'McCotter, Thaddeus G': 'Republican',
 'Obama, Barack': 'Democrat',
 'Paul, Ron': 'Republican',
 'Pawlenty, Timothy': 'Republican',
 'Perry, Rick': 'Republican',
 "Roemer, Charles E. 'Buddy' III": 'Republican',
 'Santorum, Rick': 'Republican',
 'Cain, Herman': 'Republican'}
#在原数据集上新增党派关系列
fec['party']=fec.cand_nm.map(parties)
fec.head()
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
0C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL3.6601e+08RETIREDRETIRED250.020-JUN-11NaNNaNNaNSA17A736166Republican
1C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL3.6601e+08RETIREDRETIRED50.023-JUN-11NaNNaNNaNSA17A736166Republican
2C00410118P20002978Bachmann, MichelleSMITH, LANIERLANETTAL3.68633e+08INFORMATION REQUESTEDINFORMATION REQUESTED250.005-JUL-11NaNNaNNaNSA17A749073Republican
3C00410118P20002978Bachmann, MichelleBLEVINS, DARONDAPIGGOTTAR7.24548e+08NONERETIRED250.001-AUG-11NaNNaNNaNSA17A749073Republican
4C00410118P20002978Bachmann, MichelleWARDENBURG, HAROLDHOT SPRINGS NATIONAR7.19016e+08NONERETIRED300.020-JUN-11NaNNaNNaNSA17A736166Republican
#该数据既包括赞助也包括退款(即存在负数出资额)
(fec.contb_receipt_amt>0).value_counts()
True     991475
False     10256
Name: contb_receipt_amt, dtype: int64
fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])
0          False
1          False
2          False
3          False
4          False
           ...  
1001726    False
1001727    False
1001728    False
1001729    False
1001730    False
Name: cand_nm, Length: 1001731, dtype: bool
#筛选出只包含指定两位竞选人的赞助信息
fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
fec_mrbo.head()
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
411C00431171P80003353Romney, MittELDERBAUM, WILLIAMDPOAA3.4023e+08US GOVERNMENTFOREIGN SERVICE OFFICER25.001-FEB-12NaNNaNNaNSA17A780124Republican
412C00431171P80003353Romney, MittELDERBAUM, WILLIAMDPOAA3.4023e+08US GOVERNMENTFOREIGN SERVICE OFFICER110.001-FEB-12NaNNaNNaNSA17A780124Republican
413C00431171P80003353Romney, MittCARLSEN, RICHARDAPOAE9.128e+07DEFENSE INTELLIGENCE AGENCYINTELLIGENCE ANALYST250.013-APR-12NaNNaNNaNSA17A785689Republican
414C00431171P80003353Romney, MittDELUCA, PIERREAPOAE9.128e+07CISCOENGINEER30.021-AUG-11NaNNaNNaNSA17A760261Republican
415C00431171P80003353Romney, MittSARGENT, MICHAELAPOAE9.01201e+07RAYTHEON TECHNICAL SERVICES CORPCOMPUTER SYSTEMS ENGINEER100.007-MAR-12NaNNaNNaNSA17A780128Republican
#根据职业和雇主统计赞助信息
occs=fec['contbr_occupation'].unique()
occs
array(['RETIRED', 'INFORMATION REQUESTED', 'RN', ...,
       'SAFETY SALES MANAGER', 'PRESIDENT & GENERAL MANAGER',
       'LONGWALL MAINTENANCE FOREMAN'], dtype=object)
len(fec['contbr_occupation'].unique())
45074
# Many occupation strings describe the same basic job type or are spelling
# variants of one another; this table folds a few of them together.
occ_mapping = {
    **{
        variant: 'NOT PROVIDED'
        for variant in (
            'INFORMATION REQUESTED',
            'INFORMATION REQUESTED PER BEST EFFORTS',
            'INFORMATION REQUESTED(BEST EFFORTS)',
        )
    },
    'C.E.O.': 'CEO',
}
fec.groupby('contbr_occupation')['party'].count()

contbr_occupation
   MIXED-MEDIA ARTIST / STORYTELLER    1
 AREA VICE PRESIDENT                   1
 RESEARCH ASSOCIATE                    1
 TEACHER                               1
 THERAPIST                             3
                                      ..
ZOOKEEPER                              1
ZOOLOGIST                              3
ZOOLOGY EDUCATION                      1
\NONE\                                 1
~                                      1
Name: party, Length: 45073, dtype: int64

# Fall back to the original value when no mapping is defined for it.
f = lambda occupation: occ_mapping.get(occupation, occupation)
fec['contbr_occupation'] = fec['contbr_occupation'].map(f)

# Apply the same kind of clean-up to the employer strings.
emp_mapping = {
    **dict.fromkeys(
        ('INFORMATION REQUESTED PER BEST EFFORTS', 'INFORMATION REQUESTED'),
        'NOT PROVIDED',
    ),
    **dict.fromkeys(('SELF', 'SELF EMPLOYED'), 'SELF-EMPLOYED'),
}
# Fall back to the original value when no mapping is defined for it.
f = lambda employer: emp_mapping.get(employer, employer)
fec['contbr_employer'] = fec['contbr_employer'].map(f)
#根据党派和职业对数据进行聚合(下一步再滤除总出资额不足200万美元的数据)
by_occupation=fec.pivot_table(values='contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')
by_occupation
partyDemocratRepublican
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER100.0NaN
AREA VICE PRESIDENT250.0NaN
RESEARCH ASSOCIATE100.0NaN
TEACHER500.0NaN
THERAPIST3900.0NaN
.........
ZOOKEEPER35.0NaN
ZOOLOGIST400.0NaN
ZOOLOGY EDUCATION25.0NaN
\NONE\NaN250.0
~NaN75.0

45071 rows × 2 columns

#过滤掉总出资额不足200万美金的数据
over_2mm=by_occupation[by_occupation.sum(1)>2000000]
over_2mm
partyDemocratRepublican
contbr_occupation
ATTORNEY11126932.977.343540e+06
CEO2071474.793.932686e+06
CONSULTANT2459812.712.472815e+06
ENGINEER950425.551.780102e+06
EXECUTIVE1355161.053.918557e+06
HOMEMAKER4243394.301.324141e+07
INVESTOR884133.002.320349e+06
LAWYER3159391.873.785903e+05
MANAGER762693.221.404878e+06
NOT PROVIDED4849801.961.975207e+07
OWNER998867.362.279621e+06
PHYSICIAN3732387.443.491657e+06
PRESIDENT1878009.954.469834e+06
PROFESSOR2163571.082.944627e+05
REAL ESTATE528752.091.581747e+06
RETIRED25270507.232.290594e+07
SELF-EMPLOYED667393.401.591758e+06
#用水平条形图来展现更为清晰
over_2mm.plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x1700e31efc8>

在这里插入图片描述

# Find the occupations and employers that gave the most in total to
# Romney, Mitt and Obama, Barack.
# The two-candidate subset is rebuilt here because boolean indexing returns a
# copy: a subset taken before the occupation/employer clean-up still holds
# the raw, unmapped strings.
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
fec_mrbo
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
411C00431171P80003353Romney, MittELDERBAUM, WILLIAMDPOAA3.4023e+08US GOVERNMENTFOREIGN SERVICE OFFICER25.001-FEB-12NaNNaNNaNSA17A780124Republican
412C00431171P80003353Romney, MittELDERBAUM, WILLIAMDPOAA3.4023e+08US GOVERNMENTFOREIGN SERVICE OFFICER110.001-FEB-12NaNNaNNaNSA17A780124Republican
413C00431171P80003353Romney, MittCARLSEN, RICHARDAPOAE9.128e+07DEFENSE INTELLIGENCE AGENCYINTELLIGENCE ANALYST250.013-APR-12NaNNaNNaNSA17A785689Republican
414C00431171P80003353Romney, MittDELUCA, PIERREAPOAE9.128e+07CISCOENGINEER30.021-AUG-11NaNNaNNaNSA17A760261Republican
415C00431171P80003353Romney, MittSARGENT, MICHAELAPOAE9.01201e+07RAYTHEON TECHNICAL SERVICES CORPCOMPUTER SYSTEMS ENGINEER100.007-MAR-12NaNNaNNaNSA17A780128Republican
......................................................
701381C00431445P80003338Obama, BarackMOUNTS, ROBERTAPOZZ962043077HQ USFK (FKDC-SA)GS-15 INTERNATIONAL RELATIONS OFFICER25.026-FEB-12NaNNaNNaNSA17A787803Democrat
701382C00431445P80003338Obama, BarackTAITANO, TYRONEHAGATNAZZ969323373NOT EMPLOYEDRETIRED250.020-JAN-12NaNNaNNaNSA17A775668Democrat
701383C00431445P80003338Obama, BarackTUCKER, PAMELAAPOZZ963190030DODEAEDUCATOR3.020-JAN-12NaNNaNNaNSA17A775668Democrat
701384C00431445P80003338Obama, BarackMOUNTS, ROBERTAPOZZ962043077HQ USFK (FKDC-SA)GS-15 INTERNATIONAL RELATIONS OFFICER25.026-APR-12NaNNaNNaNSA17A785239Democrat
701385C00431445P80003338Obama, BarackNEAL, AMBERAPOZZ091022065THE DEPARTMENT OF DEFENSE EDUCATION ACTEACHER135.004-SEP-11NaNX* OBAMA VICTORY FUND 2012SA18756218Democrat

700975 rows × 17 columns

grouped=fec_mrbo.groupby('cand_nm')
def get_top_amounts(group, key, n=5):
    """Return the ``n`` largest contribution totals in ``group``, by ``key``.

    Args:
        group: DataFrame with a ``contb_receipt_amt`` column and the
            column(s) named by ``key``.
        key: Column label (or list of labels) to group by.
        n: Maximum number of entries to return.

    Returns:
        Series of summed ``contb_receipt_amt`` indexed by ``key``, sorted
        in descending order and truncated to the first ``n`` entries.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # head() is always positional, whereas a bare ``[:n]`` slice can be
    # interpreted as a label slice when the index holds floats.
    return totals.sort_values(ascending=False).head(n)
    
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
cand_nm        contbr_occupation
Obama, Barack  RETIRED              25270507.23
               ATTORNEY             11126932.97
               NOT PROVIDED          4849801.96
               HOMEMAKER             4243394.30
               PHYSICIAN             3732387.44
               LAWYER                3159391.87
               CONSULTANT            2459812.71
Romney, Mitt   RETIRED              11266949.23
               NOT PROVIDED         11173374.84
               HOMEMAKER             8037250.86
               ATTORNEY              5302578.82
               PRESIDENT             2403439.77
               CEO                   2247242.03
               EXECUTIVE             2230653.79
Name: contb_receipt_amt, dtype: float64
grouped.apply(get_top_amounts,'contbr_employer',n=10)
cand_nm        contbr_employer   
Obama, Barack  RETIRED               22665902.20
               SELF-EMPLOYED         18584277.16
               NOT EMPLOYED           8584118.70
               NOT PROVIDED           5036178.37
               HOMEMAKER              2599987.04
               STUDENT                 318831.45
               VOLUNTEER               257104.00
               MICROSOFT               215585.36
               SIDLEY AUSTIN LLP       168254.00
               REFUSED                 149516.07
Romney, Mitt   NOT PROVIDED          11827237.12
               RETIRED               11264701.35
               HOMEMAKER              8037000.86
               SELF-EMPLOYED          7265136.53
               STUDENT                 488642.82
               CREDIT SUISSE           265650.00
               MORGAN STANLEY          262266.00
               GOLDMAN SACH & CO.      233250.00
               BARCLAYS CAPITAL        162750.00
               H.I.G. CAPITAL          139500.00
Name: contb_receipt_amt, dtype: float64
#对出资额分组
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411         (10, 100]
412       (100, 1000]
413       (100, 1000]
414         (10, 100]
415         (10, 100]
             ...     
701381      (10, 100]
701382    (100, 1000]
701383        (1, 10]
701384      (10, 100]
701385    (100, 1000]
Name: contb_receipt_amt, Length: 700975, dtype: category
Categories (8, interval[int64]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size()
cand_nm        contb_receipt_amt  
Obama, Barack  (0, 1]                    493
               (1, 10]                 40070
               (10, 100]              372280
               (100, 1000]            153991
               (1000, 10000]           22284
               (10000, 100000]             2
               (100000, 1000000]           3
               (1000000, 10000000]         4
Romney, Mitt   (0, 1]                     77
               (1, 10]                  3681
               (10, 100]               31853
               (100, 1000]             43357
               (1000, 10000]           26186
               (10000, 100000]             1
               (100000, 1000000]           0
               (1000000, 10000000]         0
dtype: int64
grouped.size().unstack(0)
cand_nmObama, BarackRomney, Mitt
contb_receipt_amt
(0, 1]49377
(1, 10]400703681
(10, 100]37228031853
(100, 1000]15399143357
(1000, 10000]2228426186
(10000, 100000]21
(100000, 1000000]30
(1000000, 10000000]40
bucket_sums=grouped.contb_receipt_amt.sum()#对出资额求和
bucket_sums
cand_nm        contb_receipt_amt  
Obama, Barack  (0, 1]                      318.24
               (1, 10]                  337267.62
               (10, 100]              20288981.41
               (100, 1000]            54798531.46
               (1000, 10000]          51753705.67
               (10000, 100000]           59100.00
               (100000, 1000000]       1490683.08
               (1000000, 10000000]     7148839.76
Romney, Mitt   (0, 1]                       77.00
               (1, 10]                   29819.66
               (10, 100]               1987783.76
               (100, 1000]            22363381.69
               (1000, 10000]          63942145.42
               (10000, 100000]           12700.00
               (100000, 1000000]              NaN
               (1000000, 10000000]            NaN
Name: contb_receipt_amt, dtype: float64
bucket_sums=bucket_sums.unstack(0)
bucket_sums
cand_nmObama, BarackRomney, Mitt
contb_receipt_amt
(0, 1]318.2477.00
(1, 10]337267.6229819.66
(10, 100]20288981.411987783.76
(100, 1000]54798531.4622363381.69
(1000, 10000]51753705.6763942145.42
(10000, 100000]59100.0012700.00
(100000, 1000000]1490683.08NaN
(1000000, 10000000]7148839.76NaN
normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)#在面元内进行规格化
normed_sums
cand_nmObama, BarackRomney, Mitt
contb_receipt_amt
(0, 1]0.8051820.194818
(1, 10]0.9187670.081233
(10, 100]0.9107690.089231
(100, 1000]0.7101760.289824
(1000, 10000]0.4473260.552674
(10000, 100000]0.8231200.176880
(100000, 1000000]1.000000NaN
(1000000, 10000000]1.000000NaN
normed_sums[:-2].plot(kind='barh',stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1700ecfd988>

在这里插入图片描述

#根据州统计赞助信息
grouped=fec_mrbo.groupby(['cand_nm','contbr_st'])
totals=grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals=totals[totals.sum(1)>100000]
totals[:10]
cand_nmObama, BarackRomney, Mitt
contbr_st
AK275353.1586204.24
AL537835.48504882.08
AR324802.28105351.50
AZ1484241.841850484.23
CA23370680.8410908232.46
CO2104551.381477810.82
CT2037216.663377421.85
DC4317865.85999740.50
DE325394.1481404.00
FL7138932.528008067.40
percent=totals.div(totals.sum(1),axis=0)
percent
cand_nmObama, BarackRomney, Mitt
contbr_st
AK0.7615750.238425
AL0.5158020.484198
AR0.7550840.244916
AZ0.4450870.554913
CA0.6817800.318220
CO0.5874760.412524
CT0.3762420.623758
DC0.8119940.188006
DE0.7998910.200109
FL0.4713100.528690
GA0.5874900.412510
HI0.8763340.123666
IA0.7413760.258624
ID0.2004240.799576
IL0.8276490.172351
IN0.6225490.377451
KS0.5751670.424833
KY0.5165020.483498
LA0.3578430.642157
MA0.5912560.408744
MD0.7506310.249369
ME0.9105300.089470
MI0.5227050.477295
MN0.8552630.144737
MO0.4951270.504873
MS0.3745930.625407
MT0.6530630.346937
NC0.6558790.344121
NE0.5874310.412569
NH0.5862260.413774
NJ0.4906870.509313
NM0.8441330.155867
NV0.5403620.459638
NY0.5953930.404607
OH0.4894120.510588
OK0.4135490.586451
OR0.6966950.303305
PA0.6618560.338144
PR0.8897720.110228
RI0.7078510.292149
SC0.6105310.389469
SD0.3459480.654052
TN0.4248300.575170
TX0.5185050.481495
UT0.1229480.877052
VA0.5554020.444598
VT0.9461760.053824
WA0.7656340.234366
WI0.8053430.194657
WV0.5738910.426109
WY0.4408230.559177
XX0.0000001.000000

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值