第九章 数据分组与聚合(下)



import pandas as pd
from pandas import Series
fec=pd.read_csv("e:/P00000001-ALL.csv")
fec[:2]
     cmte_id    cand_id             cand_nm        contbr_nm contbr_city  \
0  C00410118  P20002978  Bachmann, Michelle  HARVEY, WILLIAM      MOBILE   
1  C00410118  P20002978  Bachmann, Michelle  HARVEY, WILLIAM      MOBILE   

  contbr_st  contbr_zip contbr_employer contbr_occupation  contb_receipt_amt  \
0        AL  3.6601e+08         RETIRED           RETIRED              250.0   
1        AL  3.6601e+08         RETIRED           RETIRED               50.0   

  contb_receipt_dt receipt_desc memo_cd memo_text form_tp  file_num  
0        20-JUN-11          NaN     NaN       NaN   SA17A    736166  
1        23-JUN-11          N
fec.ix[123456]

unique_cands
cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object
unique_cands=fec.cand_nm.unique()
patries={'Bachmann, Michelle':'Republican','Cain, Herman':'Republican',
         'Gingrich, Newt':'Republican','Huntsman, Jon':'Republican',
         'Johnson, Gary Earl':'Republican','McCotter, Thaddeus G':'Republican',
         'Obama, Barack':'Democrat','Paul, Ron':'Republican',
         'Pawlenty, Timothy':'Republican','Perry, Rick':'Republican',
         "Roemer, Charles E. 'Buddy' III":'Republican',
         'Romney, Mitt':'Republican',
         'Santorum, Rick':'Republican'
         }

unique_cands[2]
'Obama, Barack'fec.cand_nm[123456:123461]
123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(patries)#通过这个映射以及Series对象的map方法,根据候选人姓名得到党派信息123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object
#添加一个新列fec['party']=fec.cand_nm.map(patries)fec['party'].value_counts()
Democrat      593746
Republican    407985
Name: party, dtype: int64
#这里需要注意:该数字既包括赞助也包括退款(负的出资额),限定该数据集只能有正的出资额

fec=fec[fec.contb_receipt_amt>0]

#由于Barack Obama和Mitt Romney是最主要的两名候选人,建立一个子集只包含针对他们两人的竞选活动的赞助信息

fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]






根据职业和雇主统计信息

#首先根据职业计算总出资额

fec.contbr_occupation.value_counts()[:10]
Out[30]: 
RETIRED                                   233990
INFORMATION REQUESTED                      35107
ATTORNEY                                   34286
HOMEMAKER                                  29931
PHYSICIAN                                  23432
INFORMATION REQUESTED PER BEST EFFORTS     21138
ENGINEER                                   14334
TEACHER                                    13990
CONSULTANT                                 13273
PROFESSOR                                  12555
Name: contbr_occupation, dtype: int64

#巧妙运用dict.get,允许没有映射关系的职业也能通过,处理职业信息
occ_mapping={
             'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
             'INFORMATION REQUESTED':'NOT PROVIDED',
             'INFORMATION REQUESTED(BEST EFFORTS)':'NOT PROVIDED',
             'C.E.O.':'CEO'
             }
#如果没有提供相关映射,则返回x
f=lambda x:occ_mapping.get(x,x)
fec.contbr_employer=fec.contbr_employer.map(f)

#通过pivot_table根据党派和职业对数据进行聚合,过滤掉总出资额不足200万美元的数据

by_occupation=fec.pivot_table('contb_receipt_amt',index='contbr_occupation',
                              columns='party',aggfunc='sum')

over_2mm=by_occupation[by_occupation.sum(1)>2000000]

over_2mm
Out[42]: 
party                                      Democrat    Republican
contbr_occupation                                                
ATTORNEY                                11141982.97  7.477194e+06
C.E.O.                                      1690.00  2.592983e+06
CEO                                      2073284.79  1.618057e+06
CONSULTANT                               2459912.71  2.544725e+06
ENGINEER                                  951525.55  1.818374e+06
EXECUTIVE                                1355161.05  4.138850e+06
HOMEMAKER                                4248875.80  1.363428e+07
INFORMATION REQUESTED                    4866973.96  3.896616e+06
INFORMATION REQUESTED PER BEST EFFORTS          NaN  1.634053e+07
INVESTOR                                  884133.00  2.431769e+06
LAWYER                                   3160478.87  3.912243e+05
MANAGER                                   762883.22  1.444532e+06
OWNER                                    1001567.36  2.408287e+06
PHYSICIAN                                3735124.94  3.594320e+06
PRESIDENT                                1878509.95  4.720924e+06
PROFESSOR                                2165071.08  2.967027e+05
REAL ESTATE                               528902.09  1.625902e+06
RETIRED                                 25305116.38  2.356124e+07
SELF-EMPLOYED                             672393.40  1.640253e+06
import matplotlib
%matplotlib inline
over_2mm.plot(kind='barh')




def get_top_amounts(group,key,n=5):
    totals=group.groupby(key)['contb_receipt_amt'].sum()

#    根据key对totals进行降序排列
    return totals.sort_values(ascending=False)[n:]
#根据职业和雇主进行聚合

grouped=fec_mrbo.groupby('cand_nm')

grouped.apply(get_top_amounts,'contbr_occupation',n=7)
Out[56]: 
cand_nm        contbr_occupation                     
Obama, Barack  PROFESSOR                                 2165071.08
               CEO                                       2073284.79
               PRESIDENT                                 1878509.95
               NOT EMPLOYED                              1709188.20
               EXECUTIVE                                 1355161.05
               TEACHER                                   1250969.15
               WRITER                                    1084188.88
               OWNER                                     1001567.36
               ENGINEER                                   951525.55
               INVESTOR                                   884133.00
               ARTIST                                     763125.00
               MANAGER                                    762883.22
               SELF-EMPLOYED                              672393.40
               STUDENT                                    628099.75
               REAL ESTATE                                528902.09
               CHAIRMAN                                   496547.00
               ARCHITECT                                  483859.89
               DIRECTOR                                   471741.73
               BUSINESS OWNER                             449979.30
               EDUCATOR                                   436600.89
               PSYCHOLOGIST                               427299.92
               SOFTWARE ENGINEER                          396985.65
               PARTNER                                    395759.50
               SALES                                      392886.91
               EXECUTIVE DIRECTOR                         348180.94
               MANAGING DIRECTOR                          329688.25
               SOCIAL WORKER                              326844.43
               VICE PRESIDENT                             325647.15
               ADMINISTRATOR                              323079.26
               SCIENTIST                                  319227.88
   
Romney, Mitt   NON-PROFIT VETERANS ORG. CHAIR/ANNUITA         10.00
               PARAPLANNER                                    10.00
               APPRAISAL                                      10.00
               SIGN CONTRACTOR                                10.00
               POLITICAL OPERATIVE                            10.00
               PORT MGT                                       10.00
               PRESIDENT EMERITUS                             10.00
               CONTRACTS SPECIALIST                            9.00
               TEACHER & FREE-LANCE JOURNALIST                 9.00
               FOUNDATION CONSULTANT                           6.00
               MAIL HANDLER                                    6.00
               TREASURER & DIRECTOR OF FINANCE                 6.00
               SECRETARY/BOOKKEPPER                            6.00
               ELAYNE WELLS HARMER                             6.00
               CHICKEN GRADER                                  5.00
               DIRECTOR REISCHAUER CENTER FOR EAST A           5.00
               SCOTT GREENBAUM                                 5.00
               EDUCATION ADMIN                                 5.00
               ENGINEER/RISK EXPERT                            5.00
               PLANNING AND OPERATIONS ANALYST                 5.00
               VILLA NOVA                                      5.00
               FINANCIAL INSTITUTION - CEO                     5.00
               HORTICULTURIST                                  5.00
               MD - UROLOGIST                                  5.00
               DISTRICT REPRESENTATIVE                         5.00
               INDEPENDENT PROFESSIONAL                        3.00
               REMODELER & SEMI RETIRED                        3.00
               AFFORDABLE REAL ESTATE DEVELOPER                3.00
               IFC CONTRACTING SOLUTIONS                       3.00
               3RD GENERATION FAMILY BUSINESS OWNER            3.00
Name: contb_receipt_amt, dtype: float64
grouped.apply(get_top_amounts,'contbr_occupation',n=10)
Out[57]: 
cand_nm        contbr_occupation                     
Obama, Barack  NOT EMPLOYED                              1709188.20
               EXECUTIVE                                 1355161.05
               TEACHER                                   1250969.15
               WRITER                                    1084188.88
               OWNER                                     1001567.36
               ENGINEER                                   951525.55
               INVESTOR                                   884133.00
               ARTIST                                     763125.00
               MANAGER                                    762883.22
               SELF-EMPLOYED                              672393.40
               STUDENT                                    628099.75
               REAL ESTATE                                528902.09
               CHAIRMAN                                   496547.00
               ARCHITECT                                  483859.89
               DIRECTOR                                   471741.73
               BUSINESS OWNER                             449979.30
               EDUCATOR                                   436600.89
               PSYCHOLOGIST                               427299.92
               SOFTWARE ENGINEER                          396985.65
               PARTNER                                    395759.50
               SALES                                      392886.91
               EXECUTIVE DIRECTOR                         348180.94
               MANAGING DIRECTOR                          329688.25
               SOCIAL WORKER                              326844.43
               VICE PRESIDENT                             325647.15
               ADMINISTRATOR                              323079.26
               SCIENTIST                                  319227.88
               VOLUNTEER                                  305233.64
               FINANCE                                    296031.40
               MARKETING                                  263610.68
   
Romney, Mitt   NON-PROFIT VETERANS ORG. CHAIR/ANNUITA         10.00
               PARAPLANNER                                    10.00
               APPRAISAL                                      10.00
               SIGN CONTRACTOR                                10.00
               POLITICAL OPERATIVE                            10.00
               PORT MGT                                       10.00
               PRESIDENT EMERITUS                             10.00
               CONTRACTS SPECIALIST                            9.00
               TEACHER & FREE-LANCE JOURNALIST                 9.00
               FOUNDATION CONSULTANT                           6.00
               MAIL HANDLER                                    6.00
               TREASURER & DIRECTOR OF FINANCE                 6.00
               SECRETARY/BOOKKEPPER                            6.00
               ELAYNE WELLS HARMER                             6.00
               CHICKEN GRADER                                  5.00
               DIRECTOR REISCHAUER CENTER FOR EAST A           5.00
               SCOTT GREENBAUM                                 5.00
               EDUCATION ADMIN                                 5.00
               ENGINEER/RISK EXPERT                            5.00
               PLANNING AND OPERATIONS ANALYST                 5.00
               VILLA NOVA                                      5.00
               FINANCIAL INSTITUTION - CEO                     5.00
               HORTICULTURIST                                  5.00
               MD - UROLOGIST                                  5.00
               DISTRICT REPRESENTATIVE                         5.00
               INDEPENDENT PROFESSIONAL                        3.00
               REMODELER & SEMI RETIRED                        3.00
               AFFORDABLE REAL ESTATE DEVELOPER                3.00
               IFC CONTRACTING SOLUTIONS                       3.00
               3RD GENERATION FAMILY BUSINESS OWNER            3.00
Name: contb_receipt_amt, dtype: float64
对出资额分组

import numpy as np
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411           (10, 100]
412         (100, 1000]
413         (100, 1000]
414           (10, 100]
415           (10, 100]
416           (10, 100]
417         (100, 1000]
418           (10, 100]
419         (100, 1000]
420           (10, 100]
421           (10, 100]
422         (100, 1000]
423         (100, 1000]
424         (100, 1000]
425         (100, 1000]
426         (100, 1000]
427       (1000, 10000]
428         (100, 1000]
429         (100, 1000]
430           (10, 100]
431       (1000, 10000]
432         (100, 1000]
433         (100, 1000]
434         (100, 1000]
435         (100, 1000]
436         (100, 1000]
437           (10, 100]
438         (100, 1000]
439         (100, 1000]
440           (10, 100]
     
701356        (10, 100]
701357          (1, 10]
701358        (10, 100]
701359        (10, 100]
701360        (10, 100]
701361        (10, 100]
701362      (100, 1000]
701363        (10, 100]
701364        (10, 100]
701365        (10, 100]
701366        (10, 100]
701367        (10, 100]
701368      (100, 1000]
701369        (10, 100]
701370        (10, 100]
701371        (10, 100]
701372        (10, 100]
701373        (10, 100]
701374        (10, 100]
701375        (10, 100]
701376    (1000, 10000]
701377        (10, 100]
701378        (10, 100]
701379      (100, 1000]
701380    (1000, 10000]
701381        (10, 100]
701382      (100, 1000]
701383          (1, 10]
701384        (10, 100]
701385      (100, 1000]
Name: contb_receipt_amt, dtype: category
Categories (8, object): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
#根据候选人姓名以及面元标签对数据进行分组
grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
Out[67]: 
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                       493.0          77.0
(1, 10]                    40070.0        3681.0
(10, 100]                 372280.0       31853.0
(100, 1000]               153991.0       43357.0
(1000, 10000]              22284.0       26186.0
(10000, 100000]                2.0           1.0
(100000, 1000000]              3.0           NaN
(1000000, 10000000]            4.0           NaN
#对出资额求合并在面元内规格化,以便图形化显示两位候选人各种赞助额度
bucket_sums
Out[69]: 
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                      318.24         77.00
(1, 10]                  337267.62      29819.66
(10, 100]              20288981.41    1987783.76
(100, 1000]            54798531.46   22363381.69
(1000, 10000]          51753705.67   63942145.42
(10000, 100000]           59100.00      12700.00
(100000, 1000000]       1490683.08           NaN
(1000000, 10000000]     7148839.76           NaN
normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
Out[71]: 
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                    0.805182      0.194818
(1, 10]                   0.918767      0.081233
(10, 100]                 0.910769      0.089231
(100, 1000]               0.710176      0.289824
(1000, 10000]             0.447326      0.552674
(10000, 100000]           0.823120      0.176880
(100000, 1000000]         1.000000           NaN
(1000000, 10000000]       1.000000           NaN
normed_sums[:-2].plot(kind='barh',stacked=True)




#根据州统计赞助信息






























评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值