pandas数据分析
http://www.fec.gov/disclosurep/PDownload.do 总统数据下载
-
fec.cand_nm.unique() 统计所有人名列表
-
fec[‘party’] = fec.cand_nm.map(parties) 把parties字典中和cand_nm对应的放在新的一列party中,map操作
-
fec[‘party’].value_counts() 统计列party中的每个value的总数
-
fec.loc[:,[‘party’,‘cand_nm’]] 取指定两列
-
(fec.contb_receipt_amt > 0).value_counts() 判断返回布尔值,并对二值统计
-
fec = fec[fec.contb_receipt_amt > 0] 限定该数据集出资大于零的
-
fec_mrbo = fec[fec.cand_nm.isin([‘Obama, Barack’,‘Romney, Mitt’])]
把指定列中的满足条件的数据重新生成新的dataframe -
fec.contbr_occupation.value_counts() 出资人统计,默认重大到小排序
-
f = lambda x: occ_mapping.get(x,x)
存在字典中的返回字典的value,不存在返回原值
fec.contbr_occupation = fec.contbr_occupation.map(f) series循环操作value
map方法只能作用在series中 -
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]#过滤出资金超两百万的
-
over_2mm.plot(kind=‘barh’)#横向条形
-
grouped 分组
def get_top_amounts(group,key,n=5):
totals = group.groupby(key)['contb_receipt_amt'].sum()
return totals.nlargest(n)# nlargest最大前n个
grouped = fec_mrbo.groupby('cand_nm') #根据职业和雇主进行聚合
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
-
数据分组
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins) -
cand_nm为行名,labels为列名显示
grouped =fec_mrbo.groupby([‘cand_nm’,labels])
grouped.size().unstack(0) 花括号结构转表格结构 -
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
fec = pd.read_csv(r'C:\Users\Administrator\Desktop\pandas_01\fec\P00000001-ALL.csv')
unique_cands = fec.cand_nm.unique()#所有参选人
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
len(unique_cands)
13
parties = {'Bachmann, Michelle':'Republican',
'Cain, Herman':'Republican',
'Gingrich, Newt':'Republican',
'Huntsman, Jon':'Republican',
'Johnson, Gary Earl':'Republican',
'McCotter, Thaddeus G':'Republican',
'Obama, Barack':'Democrat',
'Paul, Ron':'Republican',
'Pawlenty, Timothy':'Republican',
'Perry, Rick':'Republican',
"Roemer, Charles E. 'Buddy' III":'Republican',
'Romney, Mitt':'Republican',
'Santorum, Rick':'Republican'}
parties
{'Bachmann, Michelle': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'Huntsman, Jon': 'Republican',
'Johnson, Gary Earl': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Obama, Barack': 'Democrat',
'Paul, Ron': 'Republican',
'Pawlenty, Timothy': 'Republican',
'Perry, Rick': 'Republican',
"Roemer, Charles E. 'Buddy' III": 'Republican',
'Romney, Mitt': 'Republican',
'Santorum, Rick': 'Republican'}
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()
Democrat 593746
Republican 407985
Name: party, dtype: int64
fec.loc[:,['party','cand_nm']]
party | cand_nm | |
---|---|---|
0 | Republican | Bachmann, Michelle |
1 | Republican | Bachmann, Michelle |
2 | Republican | Bachmann, Michelle |
3 | Republican | Bachmann, Michelle |
4 | Republican | Bachmann, Michelle |
... | ... | ... |
1001726 | Republican | Perry, Rick |
1001727 | Republican | Perry, Rick |
1001728 | Republican | Perry, Rick |
1001729 | Republican | Perry, Rick |
1001730 | Republican | Perry, Rick |
1001731 rows × 2 columns
(fec.contb_receipt_amt > 0).value_counts()#赞助费,包含退款
True 991475
False 10256
Name: contb_receipt_amt, dtype: int64
fec = fec[fec.contb_receipt_amt > 0]#限定该数据集只能有正出资
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
fec.contbr_occupation.value_counts()[:10]#出资人出资统计
RETIRED 233990
INFORMATION REQUESTED 35107
ATTORNEY 34286
HOMEMAKER 29931
PHYSICIAN 23432
INFORMATION REQUESTED PER BEST EFFORTS 21138
ENGINEER 14334
TEACHER 13990
CONSULTANT 13273
PROFESSOR 12555
Name: contbr_occupation, dtype: int64
occ_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED' : 'NOT PROVIDED',
'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',
'C.E.O.': 'CEO'}
f = lambda x: occ_mapping.get(x,x)#存在字典中的返回字典的value,不存在返回原值
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',
'INFORMATION REQUESTED' : 'NOT PROVIDED',
'SELF' : 'SELF-EMPLOYED',
'SELF EMPLOYED' : 'SELF-EMPLOYED',}#雇主信息
f = lambda x:emp_mapping.get(x,x)
fec.contbr_employer = fec.contbr_employer.map(f)
by_occupation = fec.pivot_table('contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')
#contb_receipt_amt赞助费那列
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]#过滤出资金超两百万的
over_2mm
party | Democrat | Republican |
---|---|---|
contbr_occupation | ||
ATTORNEY | 11141982.97 | 7.477194e+06 |
CEO | 2074974.79 | 4.211041e+06 |
CONSULTANT | 2459912.71 | 2.544725e+06 |
ENGINEER | 951525.55 | 1.818374e+06 |
EXECUTIVE | 1355161.05 | 4.138850e+06 |
HOMEMAKER | 4248875.80 | 1.363428e+07 |
INFORMATION REQUESTED PER BEST EFFORTS | NaN | 1.634053e+07 |
over_2mm.plot(kind='barh')#横向条形
<matplotlib.axes._subplots.AxesSubplot at 0x28a2e8604e0>
def get_top_amounts(group,key,n=5):
totals = group.groupby(key)['contb_receipt_amt'].sum()
return totals.nlargest(n)
grouped = fec_mrbo.groupby('cand_nm')#根据职业和雇主进行聚合
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
cand_nm contbr_occupation
Obama, Barack RETIRED 25305116.38
ATTORNEY 11141982.97
INFORMATION REQUESTED 4866973.96
HOMEMAKER 4248875.80
PHYSICIAN 3735124.94
LAWYER 3160478.87
CONSULTANT 2459912.71
Romney, Mitt RETIRED 11508473.59
INFORMATION REQUESTED PER BEST EFFORTS 11396894.84
HOMEMAKER 8147446.22
ATTORNEY 5364718.82
PRESIDENT 2491244.89
EXECUTIVE 2300947.03
C.E.O. 1968386.11
Name: contb_receipt_amt, dtype: float64
grouped.apply(get_top_amounts,'contbr_employer',n=10)
cand_nm contbr_employer
Obama, Barack RETIRED 22694358.85
SELF-EMPLOYED 17080985.96
NOT EMPLOYED 8586308.70
INFORMATION REQUESTED 5053480.37
HOMEMAKER 2605408.54
SELF 1076531.20
SELF EMPLOYED 469290.00
STUDENT 318831.45
VOLUNTEER 257104.00
MICROSOFT 215585.36
Romney, Mitt INFORMATION REQUESTED PER BEST EFFORTS 12059527.24
RETIRED 11506225.71
HOMEMAKER 8147196.22
SELF-EMPLOYED 7409860.98
STUDENT 496490.94
CREDIT SUISSE 281150.00
MORGAN STANLEY 267266.00
GOLDMAN SACH & CO. 238250.00
BARCLAYS CAPITAL 162750.00
H.I.G. CAPITAL 139500.00
Name: contb_receipt_amt, dtype: float64
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])#出资额分组
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped =fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 493 | 77 |
(1, 10] | 40070 | 3681 |
(10, 100] | 372280 | 31853 |
(100, 1000] | 153991 | 43357 |
(1000, 10000] | 22284 | 26186 |
(10000, 100000] | 2 | 1 |
(100000, 1000000] | 3 | 0 |
(1000000, 10000000] | 4 | 0 |
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
normed_sums
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_amt | ||
(0, 1] | 0.805182 | 0.194818 |
(1, 10] | 0.918767 | 0.081233 |
(10, 100] | 0.910769 | 0.089231 |
(100, 1000] | 0.710176 | 0.289824 |
(1000, 10000] | 0.447326 | 0.552674 |
(10000, 100000] | 0.823120 | 0.176880 |
(100000, 1000000] | 1.000000 | NaN |
(1000000, 10000000] | 1.000000 | NaN |
normed_sums[:-2].plot(kind='barh')#两位候选⼈收到的各种捐赠额度的总额⽐例
<matplotlib.axes._subplots.AxesSubplot at 0x28a35730550>
grouped = fec_mrbo.groupby(['cand_nm','contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)#contb_receipt_amt出资金额
totals = totals[totals.sum(1) > 100000]
totals[:5]#根据州统计赞助信息
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AK | 281840.15 | 86204.24 |
AL | 543123.48 | 527303.51 |
AR | 359247.28 | 105556.00 |
AZ | 1506476.98 | 1888436.23 |
CA | 23824984.24 | 11237636.60 |
percent = totals.div(totals.sum(1),axis=0)#在各州的比例sum(1)表示横向求和
percent[:5]#候选人在各州的总赞助额比率
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AK | 0.765778 | 0.234222 |
AL | 0.507390 | 0.492610 |
AR | 0.772902 | 0.227098 |
AZ | 0.443745 | 0.556255 |
CA | 0.679498 | 0.320502 |