实例分析
14.5 2012联邦选举委员会数据库
import pandas as pd
import numpy as np
fec = pd.read_csv('datasets/fec/P00000001-ALL.csv')#导入数据
fec.info()#查看信息的相关数据描述
E:\anaconda\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
cmte_id 1001731 non-null object
cand_id 1001731 non-null object
cand_nm 1001731 non-null object
contbr_nm 1001731 non-null object
contbr_city 1001712 non-null object
contbr_st 1001727 non-null object
contbr_zip 1001620 non-null object
contbr_employer 988002 non-null object
contbr_occupation 993301 non-null object
contb_receipt_amt 1001731 non-null float64
contb_receipt_dt 1001731 non-null object
receipt_desc 14166 non-null object
memo_cd 92482 non-null object
memo_text 97770 non-null object
form_tp 1001731 non-null object
file_num 1001731 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 68.8+ MB
fec.iloc[123456]
cmte_id C00431445
cand_id P80003338
cand_nm Obama, Barack
contbr_nm ELLMAN, IRA
contbr_city TEMPE
contbr_st AZ
contbr_zip 852816719
contbr_employer ARIZONA STATE UNIVERSITY
contbr_occupation PROFESSOR
contb_receipt_amt 50
contb_receipt_dt 01-DEC-11
receipt_desc NaN
memo_cd NaN
memo_text NaN
form_tp SA17A
file_num 772372
Name: 123456, dtype: object
unique_cands = fec.cand_nm.unique()#获取候选人名单
parties = {'Bachmann, Michelle': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'Huntsman, Jon': 'Republican',
'Johnson, Gary Earl': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Obama, Barack': 'Democrat',
'Paul, Ron': 'Republican',
'Pawlenty, Timothy': 'Republican',
'Perry, Rick': 'Republican',
"Roemer, Charles E. 'Buddy' III": 'Republican',
'Romney, Mitt': 'Republican',
'Santorum, Rick': 'Republican'}
#确定党派和人名对应字典
fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)#map方法
fec.cand_nm[123456:123461].map(parties)
fec['party'] = fec.cand_nm.map(parties)#用于所有的数据
fec['party'].value_counts()
Democrat 593746
Republican 407985
Name: party, dtype: int64
(fec.contb_receipt_amt > 0).value_counts()#计算负出的资源
True 991475
False 10256
Name: contb_receipt_amt, dtype: int64
fec = fec[fec.contb_receipt_amt > 0]#限定出的资源为正的数据
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
#只统计两个人的竞选信息
根据职业和雇主统计赞助信息
通过信息分析,对比出每个数据行业对政党的倾向性。
fec.contbr_occupation.value_counts()[:10]#根据职业计算出出资额
RETIRED 233990
INFORMATION REQUESTED 35107
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
INFORMATION REQUESTED PER BEST EFFORTS 21138
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
Name: contbr_occupation, dtype: int64
#将一个职业信息映射到另一个职业信息
occ_mapping = {
'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
'INFORMATION REQUESTED' : 'NOT PROVIDED',
'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',
'C.E.O.': 'CEO'
}
f = lambda x: occ_mapping.get(x, x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
#对雇主信息做相同处理
emp_mapping = {
'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
'INFORMATION REQUESTED' : 'NOT PROVIDED',
'SELF' : 'SELF-EMPLOYED',
'SELF EMPLOYED' : 'SELF-EMPLOYED',
}
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)
通过pivot_table对信息进行聚合,然后过滤掉出资不足200万美元的数据。
by_occupation = fec.pivot_table('contb_receipt_amt',
index='contbr_occupation',
columns='party', aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
print(over_2mm)
party Democrat Republican
contbr_occupation
ATTORNEY 11141982.97 7.477194e+06
CEO 2074974.79 4.211041e+06
CONSULTANT 2459912.71 2.544725e+06
ENGINEER 951525.55 1.818374e+06
EXECUTIVE 1355161.05 4.138850e+06
HOMEMAKER 4248875.80 1.363428e+07
INVESTOR 884133.00 2.431769e+06
LAWYER 3160478.87 3.912243e+05
MANAGER 762883.22 1.444532e+06
NOT PROVIDED 4866973.96 2.056547e+07
OWNER 1001567.36 2.408287e+06
PHYSICIAN 3735124.94 3.594320e+06
PRESIDENT 1878509.95 4.720924e+06
PROFESSOR 2165071.08 2.967027e+05
REAL ESTATE 528902.09 1.625902e+06
RETIRED 25305116.38 2.356124e+07
SELF-EMPLOYED 672393.40 1.640253e+06
over_2mm.plot(kind='barh')#画出框图
<matplotlib.axes._subplots.AxesSubplot at 0x752e710>
def get_top_amounts(group, key, n=5):
totals = group.groupby(key)['contb_receipt_amt'].sum()
return totals.nlargest(n)
#统计出资最多的职业和企业
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)
cand_nm contbr_occupation
Obama, Barack RETIRED 25305116.38
ATTORNEY 11141982.97
INFORMATION REQUESTED 4866973.96
HOMEMAKER 4248875.80
PHYSICIAN 3735124.94
LAWYER 3160478.87
CONSULTANT 2459912.71
Romney, Mitt RETIRED 11508473.59
INFORMATION REQUESTED PER BEST EFFORTS 11396894.84
HOMEMAKER 8147446.22
ATTORNEY 5364718.82
PRESIDENT 2491244.89
EXECUTIVE 2300947.03
C.E.O. 1968386.11
Name: contb_receipt_amt, dtype: float64
grouped.apply(get_top_amounts, 'contbr_employer', n=10)
cand_nm contbr_employer
Obama, Barack RETIRED 22694358.85
SELF-EMPLOYED 17080985.96
NOT EMPLOYED 8586308.70
INFORMATION REQUESTED 5053480.37
HOMEMAKER 2605408.54
SELF 1076531.20
SELF EMPLOYED 469290.00
STUDENT 318831.45
VOLUNTEER 257104.00
MICROSOFT 215585.36
Romney, Mitt INFORMATION REQUESTED PER BEST EFFORTS 12059527.24
RETIRED 11506225.71
HOMEMAKER 8147196.22
SELF-EMPLOYED 7409860.98
STUDENT 496490.94
CREDIT SUISSE 281150.00
MORGAN STANLEY 267266.00
GOLDMAN SACH & CO. 238250.00
BARCLAYS CAPITAL 162750.00
H.I.G. CAPITAL 139500.00
Name: contb_receipt_amt, dtype: float64
对出资额分组
可以利用cut函数根据出资额将数据离散化到多个面元中
bins = np.array([0, 1, 10, 100, 1000, 10000,
100000, 1000000, 10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped = fec_mrbo.groupby(['cand_nm', labels])
print(grouped.size().unstack(0))
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 493.0 77.0
(1, 10] 40070.0 3681.0
(10, 100] 372280.0 31853.0
(100, 1000] 153991.0 43357.0
(1000, 10000] 22284.0 26186.0
(10000, 100000] 2.0 1.0
(100000, 1000000] 3.0 NaN
(1000000, 10000000] 4.0 NaN
#可视化显示相关信息
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
normed_sums[:-2].plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x7901ef0>
根据州统计赞助信息
然后按着地域进行统计分析
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals = totals[totals.sum(1) > 100000]
percent = totals.div(totals.sum(1), axis=0)#统计总出资额度,且各行再除以总数
print(percent[:10])
cand_nm Obama, Barack Romney, Mitt
contbr_st
AK 0.765778 0.234222
AL 0.507390 0.492610
AR 0.772902 0.227098
AZ 0.443745 0.556255
CA 0.679498 0.320502
CO 0.585970 0.414030
CT 0.371476 0.628524
DC 0.810113 0.189887
DE 0.802776 0.197224
FL 0.467417 0.532583
说明:
放上参考链接,这个系列都是复现的这个链接中的内容。
放上原链接: https://www.jianshu.com/p/04d180d90a3f
作者在链接中放上了书籍,以及相关资源。因为平时杂七杂八的也学了一些,所以这次可能是对书中的部分内容的复现。也可能有我自己想到的内容,内容暂时都还不定。在此感谢原简书作者SeanCheney的分享。