import pandas as pd from pandas import Series fec=pd.read_csv("e:/P00000001-ALL.csv") fec[:2] cmte_id cand_id cand_nm contbr_nm contbr_city \ 0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE 1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt \ 0 AL 3.6601e+08 RETIRED RETIRED 250.0 1 AL 3.6601e+08 RETIRED RETIRED 50.0 contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num 0 20-JUN-11 NaN NaN NaN SA17A 736166 1 23-JUN-11 N
unique_candsfec.ix[123456]
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands=fec.cand_nm.unique()
unique_cands[2]patries={'Bachmann, Michelle':'Republican','Cain, Herman':'Republican', 'Gingrich, Newt':'Republican','Huntsman, Jon':'Republican', 'Johnson, Gary Earl':'Republican','McCotter, Thaddeus G':'Republican', 'Obama, Barack':'Democrat','Paul, Ron':'Republican', 'Pawlenty, Timothy':'Republican','Perry, Rick':'Republican', "Roemer, Charles E. 'Buddy' III":'Republican', 'Romney, Mitt':'Republican', 'Santorum, Rick':'Republican' }
'Obama, Barack'fec.cand_nm[123456:123461]
123456 Obama, Barack
123457 Obama, Barack
123458 Obama, Barack
123459 Obama, Barack
123460 Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(patries)#通过这个映射以及Series对象的map方法,根据候选人姓名得到党派信息123456 Democrat
123457 Democrat
123458 Democrat
123459 Democrat
123460 Democrat
Name: cand_nm, dtype: object
#添加一个新列fec['party']=fec.cand_nm.map(patries)fec['party'].value_counts()
Democrat 593746
Republican 407985
Name: party, dtype: int64
#这里需要注意:该数字既包括赞助也包括退款(负的出资额),限定该数据集只能有正的出资额
fec=fec[fec.contb_receipt_amt>0]
#由于Barack Obama和Mitt Romney是最主要的两名候选人,建立一个子集只包含针对他们两人的竞选活动的赞助信息
fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
根据职业和雇主统计信息
#首先根据职业计算总出资额 fec.contbr_occupation.value_counts()[:10] Out[30]: RETIRED 233990 INFORMATION REQUESTED 35107 ATTORNEY 34286 HOMEMAKER 29931 PHYSICIAN 23432 INFORMATION REQUESTED PER BEST EFFORTS 21138 ENGINEER 14334 TEACHER 13990 CONSULTANT 13273 PROFESSOR 12555 Name: contbr_occupation, dtype: int64 #巧妙运用dict.get,允许没有映射关系的职业也能通过,处理职业信息 occ_mapping={ 'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED', 'INFORMATION REQUESTED':'NOT PROVIDED', 'INFORMATION REQUESTED(BEST EFFORTS)':'NOT PROVIDED', 'C.E.O.':'CEO' } #如果没有提供相关映射,则返回x f=lambda x:occ_mapping.get(x,x) fec.contbr_employer=fec.contbr_employer.map(f) #通过pivot_table根据党派和职业对数据进行聚合,过滤掉总出资额不足200万美元的数据 by_occupation=fec.pivot_table('contb_receipt_amt',index='contbr_occupation', columns='party',aggfunc='sum') over_2mm=by_occupation[by_occupation.sum(1)>2000000] over_2mm Out[42]: party Democrat Republican contbr_occupation ATTORNEY 11141982.97 7.477194e+06 C.E.O. 1690.00 2.592983e+06 CEO 2073284.79 1.618057e+06 CONSULTANT 2459912.71 2.544725e+06 ENGINEER 951525.55 1.818374e+06 EXECUTIVE 1355161.05 4.138850e+06 HOMEMAKER 4248875.80 1.363428e+07 INFORMATION REQUESTED 4866973.96 3.896616e+06 INFORMATION REQUESTED PER BEST EFFORTS NaN 1.634053e+07 INVESTOR 884133.00 2.431769e+06 LAWYER 3160478.87 3.912243e+05 MANAGER 762883.22 1.444532e+06 OWNER 1001567.36 2.408287e+06 PHYSICIAN 3735124.94 3.594320e+06 PRESIDENT 1878509.95 4.720924e+06 PROFESSOR 2165071.08 2.967027e+05 REAL ESTATE 528902.09 1.625902e+06 RETIRED 25305116.38 2.356124e+07 SELF-EMPLOYED 672393.40 1.640253e+06 import matplotlib %matplotlib inline over_2mm.plot(kind='barh')
def get_top_amounts(group,key,n=5): totals=group.groupby(key)['contb_receipt_amt'].sum() # 根据key对totals进行降序排列 return totals.sort_values(ascending=False)[n:] #根据职业和雇主进行聚合 grouped=fec_mrbo.groupby('cand_nm') grouped.apply(get_top_amounts,'contbr_occupation',n=7) Out[56]: cand_nm contbr_occupation Obama, Barack PROFESSOR 2165071.08 CEO 2073284.79 PRESIDENT 1878509.95 NOT EMPLOYED 1709188.20 EXECUTIVE 1355161.05 TEACHER 1250969.15 WRITER 1084188.88 OWNER 1001567.36 ENGINEER 951525.55 INVESTOR 884133.00 ARTIST 763125.00 MANAGER 762883.22 SELF-EMPLOYED 672393.40 STUDENT 628099.75 REAL ESTATE 528902.09 CHAIRMAN 496547.00 ARCHITECT 483859.89 DIRECTOR 471741.73 BUSINESS OWNER 449979.30 EDUCATOR 436600.89 PSYCHOLOGIST 427299.92 SOFTWARE ENGINEER 396985.65 PARTNER 395759.50 SALES 392886.91 EXECUTIVE DIRECTOR 348180.94 MANAGING DIRECTOR 329688.25 SOCIAL WORKER 326844.43 VICE PRESIDENT 325647.15 ADMINISTRATOR 323079.26 SCIENTIST 319227.88 Romney, Mitt NON-PROFIT VETERANS ORG. CHAIR/ANNUITA 10.00 PARAPLANNER 10.00 APPRAISAL 10.00 SIGN CONTRACTOR 10.00 POLITICAL OPERATIVE 10.00 PORT MGT 10.00 PRESIDENT EMERITUS 10.00 CONTRACTS SPECIALIST 9.00 TEACHER & FREE-LANCE JOURNALIST 9.00 FOUNDATION CONSULTANT 6.00 MAIL HANDLER 6.00 TREASURER & DIRECTOR OF FINANCE 6.00 SECRETARY/BOOKKEPPER 6.00 ELAYNE WELLS HARMER 6.00 CHICKEN GRADER 5.00 DIRECTOR REISCHAUER CENTER FOR EAST A 5.00 SCOTT GREENBAUM 5.00 EDUCATION ADMIN 5.00 ENGINEER/RISK EXPERT 5.00 PLANNING AND OPERATIONS ANALYST 5.00 VILLA NOVA 5.00 FINANCIAL INSTITUTION - CEO 5.00 HORTICULTURIST 5.00 MD - UROLOGIST 5.00 DISTRICT REPRESENTATIVE 5.00 INDEPENDENT PROFESSIONAL 3.00 REMODELER & SEMI RETIRED 3.00 AFFORDABLE REAL ESTATE DEVELOPER 3.00 IFC CONTRACTING SOLUTIONS 3.00 3RD GENERATION FAMILY BUSINESS OWNER 3.00 Name: contb_receipt_amt, dtype: float64
对出资额分组grouped.apply(get_top_amounts,'contbr_occupation',n=10) Out[57]: cand_nm contbr_occupation Obama, Barack NOT EMPLOYED 1709188.20 EXECUTIVE 1355161.05 TEACHER 1250969.15 WRITER 1084188.88 OWNER 1001567.36 ENGINEER 951525.55 INVESTOR 884133.00 ARTIST 763125.00 MANAGER 762883.22 SELF-EMPLOYED 672393.40 STUDENT 628099.75 REAL ESTATE 528902.09 CHAIRMAN 496547.00 ARCHITECT 483859.89 DIRECTOR 471741.73 BUSINESS OWNER 449979.30 EDUCATOR 436600.89 PSYCHOLOGIST 427299.92 SOFTWARE ENGINEER 396985.65 PARTNER 395759.50 SALES 392886.91 EXECUTIVE DIRECTOR 348180.94 MANAGING DIRECTOR 329688.25 SOCIAL WORKER 326844.43 VICE PRESIDENT 325647.15 ADMINISTRATOR 323079.26 SCIENTIST 319227.88 VOLUNTEER 305233.64 FINANCE 296031.40 MARKETING 263610.68 Romney, Mitt NON-PROFIT VETERANS ORG. CHAIR/ANNUITA 10.00 PARAPLANNER 10.00 APPRAISAL 10.00 SIGN CONTRACTOR 10.00 POLITICAL OPERATIVE 10.00 PORT MGT 10.00 PRESIDENT EMERITUS 10.00 CONTRACTS SPECIALIST 9.00 TEACHER & FREE-LANCE JOURNALIST 9.00 FOUNDATION CONSULTANT 6.00 MAIL HANDLER 6.00 TREASURER & DIRECTOR OF FINANCE 6.00 SECRETARY/BOOKKEPPER 6.00 ELAYNE WELLS HARMER 6.00 CHICKEN GRADER 5.00 DIRECTOR REISCHAUER CENTER FOR EAST A 5.00 SCOTT GREENBAUM 5.00 EDUCATION ADMIN 5.00 ENGINEER/RISK EXPERT 5.00 PLANNING AND OPERATIONS ANALYST 5.00 VILLA NOVA 5.00 FINANCIAL INSTITUTION - CEO 5.00 HORTICULTURIST 5.00 MD - UROLOGIST 5.00 DISTRICT REPRESENTATIVE 5.00 INDEPENDENT PROFESSIONAL 3.00 REMODELER & SEMI RETIRED 3.00 AFFORDABLE REAL ESTATE DEVELOPER 3.00 IFC CONTRACTING SOLUTIONS 3.00 3RD GENERATION FAMILY BUSINESS OWNER 3.00 Name: contb_receipt_amt, dtype: float64
import numpy as np bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000]) labels=pd.cut(fec_mrbo.contb_receipt_amt,bins) labels 411 (10, 100] 412 (100, 1000] 413 (100, 1000] 414 (10, 100] 415 (10, 100] 416 (10, 100] 417 (100, 1000] 418 (10, 100] 419 (100, 1000] 420 (10, 100] 421 (10, 100] 422 (100, 1000] 423 (100, 1000] 424 (100, 1000] 425 (100, 1000] 426 (100, 1000] 427 (1000, 10000] 428 (100, 1000] 429 (100, 1000] 430 (10, 100] 431 (1000, 10000] 432 (100, 1000] 433 (100, 1000] 434 (100, 1000] 435 (100, 1000] 436 (100, 1000] 437 (10, 100] 438 (100, 1000] 439 (100, 1000] 440 (10, 100] 701356 (10, 100] 701357 (1, 10] 701358 (10, 100] 701359 (10, 100] 701360 (10, 100] 701361 (10, 100] 701362 (100, 1000] 701363 (10, 100] 701364 (10, 100] 701365 (10, 100] 701366 (10, 100] 701367 (10, 100] 701368 (100, 1000] 701369 (10, 100] 701370 (10, 100] 701371 (10, 100] 701372 (10, 100] 701373 (10, 100] 701374 (10, 100] 701375 (10, 100] 701376 (1000, 10000] 701377 (10, 100] 701378 (10, 100] 701379 (100, 1000] 701380 (1000, 10000] 701381 (10, 100] 701382 (100, 1000] 701383 (1, 10] 701384 (10, 100] 701385 (100, 1000] Name: contb_receipt_amt, dtype: category Categories (8, object): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]] #根据候选人姓名以及面元标签对数据进行分组 grouped=fec_mrbo.groupby(['cand_nm',labels]) grouped.size().unstack(0) Out[67]: cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 493.0 77.0 (1, 10] 40070.0 3681.0 (10, 100] 372280.0 31853.0 (100, 1000] 153991.0 43357.0 (1000, 10000] 22284.0 26186.0 (10000, 100000] 2.0 1.0 (100000, 1000000] 3.0 NaN (1000000, 10000000] 4.0 NaN #对出资额求合并在面元内规格化,以便图形化显示两位候选人各种赞助额度 bucket_sums Out[69]: cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 318.24 77.00 (1, 10] 337267.62 29819.66 (10, 100] 20288981.41 1987783.76 (100, 1000] 54798531.46 22363381.69 (1000, 10000] 51753705.67 63942145.42 (10000, 100000] 59100.00 12700.00 (100000, 1000000] 1490683.08 NaN (1000000, 10000000] 7148839.76 NaN
normed_sums[:-2].plot(kind='barh',stacked=True)normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0) normed_sums Out[71]: cand_nm Obama, Barack Romney, Mitt contb_receipt_amt (0, 1] 0.805182 0.194818 (1, 10] 0.918767 0.081233 (10, 100] 0.910769 0.089231 (100, 1000] 0.710176 0.289824 (1000, 10000] 0.447326 0.552674 (10000, 100000] 0.823120 0.176880 (100000, 1000000] 1.000000 NaN (1000000, 10000000] 1.000000 NaN
#根据州统计赞助信息