#-*- encoding:utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
#加载数据,150M+,用时8.7s
fec = pd.read_csv('E:\\P00000001-ALL.csv')
print fec,'\n'
print fec.ix[123456],'\n'
#下面介绍几种不同的分析方法
#通过unique,你可以获取全部的候选人名单
unique_cands = fec.cand_nm.unique()
print unique_cands,'\n'
#下面将候选人和党派对应起来,额,写了半天,奥巴马是Democrat(民主党),其他人都是共和党……
parties = {'Bachmann, Michelle':'Republican',
'Cain, Herman':'Republican',
'Gingrich, Newt':'Republican',
'Huntsman, Jon':'Republican',
'Johnson, Gary Earl':'Republican',
'McCotter, Thaddeus G':'Republican',
'Obama, Barack':'Democrat',
'Paul, Ron':'Republican',
'Pawlenty, Timothy':'Republican',
'Perry, Rick':'Republican',
"Roemer, Charles E. 'Buddy' III":'Republican',
'Romney, Mitt':'Republican',
'Santorum, Rick':'Republican'}
#为其添加新列
fec['party'] = fec.cand_nm.map(parties)
print fec['party'].value_counts(),'\n'
#注意,这份数据既包括赞助也包括退款
print (fec.contb_receipt_amt > 0).value_counts(),'\n'
#为了简便,这里将只研究正出资额的部分
fec = fec[fec.contb_receipt_amt > 0]
#专门准备两个子集盛放奥巴马和Mitt Romney
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
#根据职业和雇主统计赞助信息,例如律师倾向于赞助民主党,企业主倾向于自主共和党
#下面看一下职业
print fec.contbr_occupation.value_counts()[:10],'\n'
#下面将这些职业进行一些处理(将一个职业信息映射到另一个)
occ_mapping = {
'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'INFORMATION REQUESTED (BEST EFFORTS)':'NOT PROVIDED',
'C.E.O':'CEO'
}
#下面用了一个dict.get,下面的get第一个x是dict的键,映射到返回对应的key,第二个是没有映射到返回的内容,如果没有提供映射的话,返回x
f = lambda x:occ_mapping.get(x,x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
#对雇主的信息也这样处理一下
emp_mapping = {
'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'SELF':'SELF-EMPLOYED',
'SELF EMPLOYED':'SELF-EMPLOYED'
}
f = lambda x:emp_mapping.get(x,x)
fec.contbr_employer = fec.contbr_employer.map(f)
#下面可以通过pivot_table根据党派和职业对数据进行聚合,然后过滤掉出资总额不足200万美元的数据
by_occupation = fec.pivot_table('contb_receipt_amt',rows = 'contbr_occupation',cols = 'party',aggfunc = sum)
print by_occupation.head(),'\n' #这个数据一定要看一下
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
print over_2mm
over_2mm.plot(kind = 'barh')
plt.show()
# To find the occupations and employers that gave the most to Obama and
# Romney, the idea is to group first and then select the largest totals.
def get_top_amounts(group, key, n=5):
    """Return the n largest summed contribution amounts in `group` per `key`.

    group -- DataFrame containing a 'contb_receipt_amt' column and `key`.
    key   -- column name (or list of names) to group by.
    n     -- how many of the largest totals to keep (default 5).

    Returns a Series of totals indexed by `key`, sorted in descending order
    and cut to at most n entries.
    """
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    # Series.order() only exists in old pandas releases; newer ones provide
    # sort_values() instead.  Use whichever this installation has.
    if hasattr(totals, 'sort_values'):
        ranked = totals.sort_values(ascending=False)
    else:
        ranked = totals.order(ascending=False)
    # Take the head of the descending series (the original author notes that
    # the book's listing gets this line wrong).
    return ranked[:n]
grouped = fec_mrbo.groupby('cand_nm')
#下面的语句是说,grouped对象可以被进一步groupby
print grouped.apply(get_top_amounts,'contbr_occupation',n = 7),'\n'
print fec_mrbo.groupby(['cand_nm','contbr_occupation'])['contb_receipt_amt'].sum(),'\n' #不知道这里为啥不对……,为什么跟前面的语句结果不一样?……
#print fec_mrbo.pivot_table('contb_receipt_amt',rows = ['cand_nm','contbr_occupation'],aggfunc = 'sum')
print grouped.apply(get_top_amounts,'contbr_employer',n = 10)
>>>
Int64Index: 1001731 entries, 0 to 1001730
Data columns:
cmte_id 1001731 non-null values
cand_id 1001731 non-null values
cand_nm 1001731 non-null values
contbr_nm 1001731 non-null values
contbr_city 1001716 non-null values
contbr_st 1001727 non-null values
contbr_zip 1001620 non-null values
contbr_employer 994314 non-null values
contbr_occupation 994433 non-null values
contb_receipt_amt 1001731 non-null values
contb_receipt_dt 1001731 non-null values
receipt_desc 14166 non-null values
memo_cd 92482 non-null values
memo_text 97770 non-null values
form_tp 1001731 non-null values
file_num 1001731 non-null values
dtypes: float64(1), int64(1), object(14)
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456
[Bachmann, Michelle Romney, Mitt Obama, Barack
Roemer, Charles E. 'Buddy' III Pawlenty, Timothy Johnson, Gary Earl
Paul, Ron Santorum, Rick Cain, Herman Gingrich, Newt McCotter, Thaddeus G
Huntsman, Jon Perry, Rick]
Democrat 593746
Republican 407985
True 991475
False 10256
RETIRED 233990
INFORMATION REQUESTED 35107
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
INFORMATION REQUESTED PER BEST EFFORTS 21138
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
party Democrat Republican
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER 100 NaN
AREA VICE PRESIDENT 250 NaN
RESEARCH ASSOCIATE 100 NaN
TEACHER 500 NaN
THERAPIST 3900 NaN
party Democrat Republican
contbr_occupation
ATTORNEY 11141982.97 7477194.430000
C.E.O. 1690.00 2592983.110000
CEO 2074284.79 1640758.410000
CONSULTANT 2459912.71 2544725.450000
ENGINEER 951525.55 1818373.700000
EXECUTIVE 1355161.05 4138850.090000
HOMEMAKER 4248875.80 13634275.780000
INVESTOR 884133.00 2431768.920000
LAWYER 3160478.87 391224.320000
MANAGER 762883.22 1444532.370000
NOT PROVIDED 4866973.96 20565473.010000
OWNER 1001567.36 2408286.920000
PHYSICIAN 3735124.94 3594320.240000
PRESIDENT 1878509.95 4720923.760000
PROFESSOR 2165071.08 296702.730000
REAL ESTATE 528902.09 1625902.250000
RETIRED 25305116.38 23561244.489999
SELF-EMPLOYED 672393.40 1640252.540000
cand_nm contbr_occupation
Obama, Barack RETIRED 25305116.38
ATTORNEY 11141982.97
INFORMATION REQUESTED 4866973.96
HOMEMAKER 4248875.80
PHYSICIAN 3735124.94
LAWYER 3160478.87
CONSULTANT 2459912.71
Romney, Mitt RETIRED 11508473.59
INFORMATION REQUESTED PER BEST EFFORTS 11396894.84
HOMEMAKER 8147446.22
ATTORNEY 5364718.82
PRESIDENT 2491244.89
EXECUTIVE 2300947.03
C.E.O. 1968386.11
Name: contb_receipt_amt
cand_nm contbr_occupation
Obama, Barack MIXED-MEDIA ARTIST / STORYTELLER 100
AREA VICE PRESIDENT 250
RESEARCH ASSOCIATE 100
TEACHER 500
THERAPIST 3900
- 5000
.NET PROGRAMMER 481
07/13/1972 98
12K ADVOCATE 150
13D 721
1SG RETIRED 210
1ST ASSISTANT DIRECTOR 2ND UNIT 35
1ST GRADE TEACHER 435
1ST VP WEALTH MANAGEMENT 559
22ND CENTURY REALTY 500
...
Romney, Mitt WRITER/ MUSIC PRODUCER 100
WRITER/AUTHOR 2500
WRITER/EDITOR 350
WRITER/INVESTOR 25
WRITER/MEDIA PRODUCER 300
WRITER/PRODUCER 225
WRITER/TRAINER 35
WUNDERMAN 1000
YACHT BUILDER 2500
YACHT CAPTAIN 500
YACHT CONSTRUCTION 2500
YOGA INSTRUCTOR 500
YOGA TEACHER 2500
YOUTH CARE WORKER 25
YOUTH OUTREACH DIRECTOR 1000
Name: contb_receipt_amt, Length: 35991
cand_nm contbr_employer
Obama, Barack RETIRED 22694358.85
SELF-EMPLOYED 17080985.96
NOT EMPLOYED 8586308.70
INFORMATION REQUESTED 5053480.37
HOMEMAKER 2605408.54
SELF 1076531.20
SELF EMPLOYED 469290.00
STUDENT 318831.45
VOLUNTEER 257104.00
MICROSOFT 215585.36
Romney, Mitt INFORMATION REQUESTED PER BEST EFFORTS 12059527.24
RETIRED 11506225.71
HOMEMAKER 8147196.22
SELF-EMPLOYED 7409860.98
STUDENT 496490.94
CREDIT SUISSE 281150.00
MORGAN STANLEY 267266.00
GOLDMAN SACH & CO. 238250.00
BARCLAYS CAPITAL 162750.00
H.I.G. CAPITAL 139500.00
Name: contb_receipt_amt
[Finished in 16.6s]