1. 数据载入和总览
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
1.1 数据载入
df1 = pd.read_csv('./usa_elect/contb_01.csv')
df2 = pd.read_csv('./usa_elect/contb_02.csv')
df3 = pd.read_csv('./usa_elect/contb_03.csv')
df1.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | |
---|---|---|---|---|---|---|---|
0 | Bachmann, Michelle | HARVEY, WILLIAM | AL | RETIRED | RETIRED | 250.0 | 20-JUN-11 |
1 | Bachmann, Michelle | HARVEY, WILLIAM | AL | RETIRED | RETIRED | 50.0 | 23-JUN-11 |
2 | Bachmann, Michelle | SMITH, LANIER | AL | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 |
3 | Bachmann, Michelle | BLEVINS, DARONDA | AR | NONE | RETIRED | 250.0 | 01-AUG-11 |
4 | Bachmann, Michelle | WARDENBURG, HAROLD | AR | NONE | RETIRED | 300.0 | 20-JUN-11 |
df3.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | |
---|---|---|---|---|---|---|---|
0 | Perry, Rick | NORRIS, BOBBY | TX | SELF | REALTOR/ RANCHER | 1000.0 | 14-DEC-11 |
1 | Perry, Rick | HUNT, CAROLINE | TX | HOMEMAKER | HOMEMAKER | 2500.0 | 14-DEC-11 |
2 | Perry, Rick | ROBINOWITZ, DALE | TX | SELF-EMPLOYED | DENTIST | 2500.0 | 14-DEC-11 |
3 | Perry, Rick | ZUNIGA, DAVID & SYLVIA | TX | DAVID M. ZUNIGA PE | OWNER | 250.0 | 14-DEC-11 |
4 | Perry, Rick | FOSTER, GLENNA | TX | SELF | INVESTOR/ CONSULTANT | 1500.0 | 14-DEC-11 |
1.2 数据合并
contb = pd.concat([df1,df2,df3],axis = 0,ignore_index=True)
contb.shape
(1001733, 7)
1.3 数据预览和基本统计分析
contb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001733 entries, 0 to 1001732
Data columns (total 7 columns):
cand_nm 1001733 non-null object
contbr_nm 1001733 non-null object
contbr_st 1001729 non-null object
contbr_employer 988004 non-null object
contbr_occupation 993303 non-null object
contb_receipt_amt 1001733 non-null float64
contb_receipt_dt 1001733 non-null object
dtypes: float64(1), object(6)
memory usage: 53.5+ MB
contb.fillna(value='NOT PROVIDE',inplace=True)
contb.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001733 entries, 0 to 1001732
Data columns (total 7 columns):
cand_nm 1001733 non-null object
contbr_nm 1001733 non-null object
contbr_st 1001733 non-null object
contbr_employer 1001733 non-null object
contbr_occupation 1001733 non-null object
contb_receipt_amt 1001733 non-null float64
contb_receipt_dt 1001733 non-null object
dtypes: float64(1), object(6)
memory usage: 53.5+ MB
# 献金,给总统钱
contb.describe()
contb_receipt_amt | |
---|---|
count | 1.001733e+06 |
mean | 2.982358e+02 |
std | 3.749663e+03 |
min | -3.080000e+04 |
25% | 3.500000e+01 |
50% | 1.000000e+02 |
75% | 2.500000e+02 |
max | 2.014491e+06 |
2. 数据清洗
2.1 缺失值处理
填充:NOT PROVIDE
# 职位,公司,州
# 填充了NOT PROVIDE 无可奉告
# 删除,也是一种选择
2.2 数据转换
利用字典映射进行转换:党派分析
#通过搜索引擎等途径,获取到每个总统候选人的所属党派,建立字典parties,候选人名字作为键,所属党派作为对应的值
parties = {'Bachmann, Michelle': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'Huntsman, Jon': 'Republican',
'Johnson, Gary Earl': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Obama, Barack': 'Democrat',
'Paul, Ron': 'Republican',
'Pawlenty, Timothy': 'Republican',
'Perry, Rick': 'Republican',
"Roemer, Charles E. 'Buddy' III": 'Republican',
'Romney, Mitt': 'Republican',
'Santorum, Rick': 'Republican'}
contb.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | |
---|---|---|---|---|---|---|---|
0 | Bachmann, Michelle | HARVEY, WILLIAM | AL | RETIRED | RETIRED | 250.0 | 20-JUN-11 |
1 | Bachmann, Michelle | HARVEY, WILLIAM | AL | RETIRED | RETIRED | 50.0 | 23-JUN-11 |
2 | Bachmann, Michelle | SMITH, LANIER | AL | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 |
3 | Bachmann, Michelle | BLEVINS, DARONDA | AR | NONE | RETIRED | 250.0 | 01-AUG-11 |
4 | Bachmann, Michelle | WARDENBURG, HAROLD | AR | NONE | RETIRED | 300.0 | 20-JUN-11 |
map方法添加party属性
# replace,map,transform,apply都可以实现
contb['party'] = contb['cand_nm'].map(parties)
contb.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | party | |
---|---|---|---|---|---|---|---|---|
0 | Bachmann, Michelle | HARVEY, WILLIAM | AL | RETIRED | RETIRED | 250.0 | 20-JUN-11 | Republican |
1 | Bachmann, Michelle | HARVEY, WILLIAM | AL | RETIRED | RETIRED | 50.0 | 23-JUN-11 | Republican |
2 | Bachmann, Michelle | SMITH, LANIER | AL | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | Republican |
3 | Bachmann, Michelle | BLEVINS, DARONDA | AR | NONE | RETIRED | 250.0 | 01-AUG-11 | Republican |
4 | Bachmann, Michelle | WARDENBURG, HAROLD | AR | NONE | RETIRED | 300.0 | 20-JUN-11 | Republican |
value_counts统计各党派总捐频次
contb['party'].value_counts()
Democrat 593747
Republican 407986
Name: party, dtype: int64
contb.groupby(by = 'party')['contb_receipt_amt'].sum()
party
Democrat 1.335028e+08
Republican 1.652498e+08
Name: contb_receipt_amt, dtype: float64
排序:按照职业汇总对赞助总金额进行排序
occupation_ = contb.groupby(by = 'contbr_occupation')['contb_receipt_amt'].sum()
occupation_
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER 100.0
AREA VICE PRESIDENT 250.0
RESEARCH ASSOCIATE 100.0
TEACHER 500.0
THERAPIST 3900.0
...
ZOOKEEPER 35.0
ZOOLOGIST 400.0
ZOOLOGY EDUCATION 25.0
\NONE\ 250.0
~ 75.0
Name: contb_receipt_amt, Length: 45074, dtype: float64
occupation_.sort_values(ascending=False)[:50]
contbr_occupation
RETIRED 48176647.00
ATTORNEY 18470473.30
HOMEMAKER 17484807.65
INFORMATION REQUESTED PER BEST EFFORTS 15859514.55
INFORMATION REQUESTED 8742357.59
PHYSICIAN 7224044.40
PRESIDENT 6347843.59
EXECUTIVE 5273717.90
CONSULTANT 4932627.98
NOT PROVIDE 4224760.39
CEO 3570942.20
LAWYER 3537982.19
OWNER 3278488.16
INVESTOR 3204481.92
ENGINEER 2730527.43
PROFESSOR 2458033.81
C.E.O. 2433218.11
SELF-EMPLOYED 2259150.94
MANAGER 2167571.47
REAL ESTATE 2110499.34
SALES 1814901.82
NOT EMPLOYED 1752927.93
BUSINESS OWNER 1736511.73
TEACHER 1709754.05
CHAIRMAN 1691595.37
STUDENT 1679435.28
FINANCE 1664021.31
BANKER 1462903.13
WRITER 1303267.29
PARTNER 1177656.55
VICE PRESIDENT 1177600.41
INVESTMENTS 1120947.94
MANAGING DIRECTOR 969128.57
INVESTMENT BANKER 961297.90
ARTIST 959387.57
DIRECTOR 855121.89
SOFTWARE ENGINEER 836558.02
ACCOUNTANT 828374.46
FARMER 747042.89
DENTIST 741347.59
FINANCIAL ADVISOR 660786.63
ARCHITECT 656140.41
PRESIDENT & C.E.O. 642686.28
CPA 606266.64
INVESTMENT MANAGER 601899.60
INVESTMENT MANAGEMENT 583189.02
REAL ESTATE DEVELOPER 578934.25
PRINCIPAL 569543.44
REQUESTED 567872.46
PRIVATE EQUITY 555939.00
Name: contb_receipt_amt, dtype: float64
按照职位进行汇总,计算赞助总金额,展示前50项,发现不少职业是相同的,只不过是表达不一样而已,如C.E.O.与CEO,都是一个职业
# 职位重合,替换合并
occupation = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDE',
'INFORMATION REQUESTED':'NOT PROVIDE',
'C.E.O.':'CEO',
'LAWYER':'ATTORNEY',
'PRESIDENT & C.E.O.':'CEO'
}
print(d.get('李四'))
None
def convert(x):
if x == '张三':
return 'zhang sir'
# map,遍历所有, 遍历所有
df1['name'] = df1['name'].transform(convert)
df1
name | |
---|---|
0 | zhang sir |
1 | None |
2 | None |
3 | None |
# map 原理
df1 = DataFrame({'name':['张三','李四','王五','Jack']})
d = {'张三':'zhang sir'}
# df1['name'] = df1['name'].apply(d)
df1
name | |
---|---|
0 | 张三 |
1 | 李四 |
2 | 王五 |
3 | Jack |
# 原数据的职位这一列,替换,一一找到,相同的职位划归到一个
# map:将里面的所有的数据进行替换;
# replace:替换其中一部分
contb['contbr_occupation'] = contb['contbr_occupation'].replace(occupation)
contb
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | party | |
---|---|---|---|---|---|---|---|---|
0 | Bachmann, Michelle | HARVEY, WILLIAM | AL | RETIRED | RETIRED | 250.0 | 20-JUN-11 | Republican |
1 | Bachmann, Michelle | HARVEY, WILLIAM | AL | RETIRED | RETIRED | 50.0 | 23-JUN-11 | Republican |
2 | Bachmann, Michelle | SMITH, LANIER | AL | INFORMATION REQUESTED | NOT PROVIDE | 250.0 | 05-JUL-11 | Republican |
3 | Bachmann, Michelle | BLEVINS, DARONDA | AR | NONE | RETIRED | 250.0 | 01-AUG-11 | Republican |
4 | Bachmann, Michelle | WARDENBURG, HAROLD | AR | NONE | RETIRED | 300.0 | 20-JUN-11 | Republican |
... | ... | ... | ... | ... | ... | ... | ... | ... |
1001728 | Perry, Rick | GORMAN, CHRIS D. MR. | XX | INFORMATION REQUESTED PER BEST EFFORTS | NOT PROVIDE | 5000.0 | 29-SEP-11 | Republican |
1001729 | Perry, Rick | DUFFY, DAVID A. MR. | XX | DUFFY EQUIPMENT COMPANY INC. | BUSINESS OWNER | 2500.0 | 30-SEP-11 | Republican |
1001730 | Perry, Rick | GRANE, BRYAN F. MR. | XX | INFORMATION REQUESTED PER BEST EFFORTS | NOT PROVIDE | 500.0 | 29-SEP-11 | Republican |
1001731 | Perry, Rick | TOLBERT, DARYL MR. | XX | T.A.C.C. | LONGWALL MAINTENANCE FOREMAN | 500.0 | 30-SEP-11 | Republican |
1001732 | Perry, Rick | ANDERSON, MARILEE MRS. | XX | INFORMATION REQUESTED PER BEST EFFORTS | NOT PROVIDE | 2500.0 | 31-AUG-11 | Republican |
1001733 rows × 8 columns
(occupation_.rename(occupation).index == 'C.E.O.').sum()
0
利用函数进行数据转换:职业与雇主信息分析
建立一个职业对应字典,把相同职业的不同表达映射为对应的职业,比如把C.E.O.映射为CEO
2.3 数据筛选
赞助金额筛选,赞助金额大于0
cond = contb['contb_receipt_amt'] <= 0
contb[cond]
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | party | |
---|---|---|---|---|---|---|---|---|
41 | Bachmann, Michelle | MINNIS, RITA | CA | NOT PROVIDE | NOT PROVIDE | -1500.0 | 20-JUN-11 | Republican |
264 | Bachmann, Michelle | BISHOP, GERARD | NY | NOT PROVIDE | NOT PROVIDE | -1700.0 | 28-JUN-11 | Republican |
752 | Romney, Mitt | KNIGHT, RENA | AL | NOT PROVIDE | NOT PROVIDE | -60.0 | 07-MAR-12 | Republican |
885 | Romney, Mitt | HENDERSON, RONALD E. DR. | AL | RETIRED | RETIRED | -150.0 | 06-FEB-12 | Republican |
1033 | Romney, Mitt | SELLERS, LEE | AL | NOT PROVIDE | NOT PROVIDE | -120.0 | 20-JAN-12 | Republican |
... | ... | ... | ... | ... | ... | ... | ... | ... |
1001705 | Perry, Rick | NELSON, HOWARD L. MR. | WV | SWANSON INDUSTRIES | C.F.O. | -2500.0 | 01-OCT-11 | Republican |
1001706 | Perry, Rick | PHILLIPS, JIMMY D. MR. | WV | PHILLIPS MACHINE SERVICES | EXECUTIVE | -2500.0 | 28-NOV-11 | Republican |
1001709 | Perry, Rick | BEVILOCK, JOHN C. MR. | WV | DAVID STANLEY COMPANY | SALES | -2500.0 | 08-OCT-11 | Republican |
1001711 | Perry, Rick | HAYHURST, JAMES W. MR. | WV | DAVID STANLEY COMPANY | SALES | -2500.0 | 09-OCT-11 | Republican |
1001712 | Perry, Rick | PHILLIPS, JIMMY | WV | NOT PROVIDE | NOT PROVIDE | -2500.0 | 31-MAR-12 | Republican |
10256 rows × 8 columns
cond = contb['contb_receipt_amt'] > 0
contb = contb[cond]
# 将异常值,进行了过滤,定义异常值:捐款为负数
contb.shape
(991477, 8)
查看各候选人获得的赞助总金额
# 共和党的罗姆尼,民主党的奥巴马双雄
cand_nm_sum = contb.groupby(by = 'cand_nm')['contb_receipt_amt'].sum().sort_values(ascending = False)
饼图绘制各候选人献金情况
cand_nm_sum.plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a993e9ef28>
# 饼图,百分比形式,显示数据
cand_nm_sum[:5].plot(kind = 'pie')
<matplotlib.axes._subplots.AxesSubplot at 0x2a994216ac8>
筛选选取候选人为Obama、Romney的子集数据,多种方式
contb.shape
(991477, 8)
contb['cand_nm'].unique()
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
# 奥巴马和罗姆尼的子集,只有他俩
convtb_vs = contb.query("cand_nm == 'Obama, Barack' or cand_nm == 'Romney, Mitt'")
convtb_vs.shape
(694283, 8)
cond1 = contb['cand_nm'] == 'Romney, Mitt'
cond2 = contb['cand_nm'] == 'Obama, Barack'
cond = cond1|cond2
contb_vs2 = contb[cond]
contb_vs2.shape
(694283, 8)
2.4 面元化数据
分箱操作
接下来我们对该数据做另一种非常实用的分析,利用cut函数根据出资额大小将数据离散化到多个面元中
contb_vs2.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | party | |
---|---|---|---|---|---|---|---|---|
411 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 25.0 | 01-FEB-12 | Republican |
412 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 110.0 | 01-FEB-12 | Republican |
413 | Romney, Mitt | CARLSEN, RICHARD | AE | DEFENSE INTELLIGENCE AGENCY | INTELLIGENCE ANALYST | 250.0 | 13-APR-12 | Republican |
414 | Romney, Mitt | DELUCA, PIERRE | AE | CISCO | ENGINEER | 30.0 | 21-AUG-11 | Republican |
415 | Romney, Mitt | SARGENT, MICHAEL | AE | RAYTHEON TECHNICAL SERVICES CORP | COMPUTER SYSTEMS ENGINEER | 100.0 | 07-MAR-12 | Republican |
convtb_vs.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 694283 entries, 411 to 701386
Data columns (total 8 columns):
cand_nm 694283 non-null object
contbr_nm 694283 non-null object
contbr_st 694283 non-null object
contbr_employer 694283 non-null object
contbr_occupation 694283 non-null object
contb_receipt_amt 694283 non-null float64
contb_receipt_dt 694283 non-null object
party 694283 non-null object
dtypes: float64(1), object(7)
memory usage: 47.7+ MB
labels = [0,10,100,1000,10000,100000,1000000,10000000]
convtb_vs['contb_receipt_amt'].sort_values()
325151 0.01
319758 0.01
323688 0.01
323661 0.01
319831 0.01
...
319478 526246.17
344419 1511192.17
344539 1679114.65
326651 1944042.43
325136 2014490.51
Name: contb_receipt_amt, Length: 694283, dtype: float64
contb_vs2['contb_receipt_amt']
411 25.0
412 110.0
413 250.0
414 30.0
415 100.0
...
701382 25.0
701383 250.0
701384 3.0
701385 25.0
701386 135.0
Name: contb_receipt_amt, Length: 694283, dtype: float64
contb_vs = contb_vs2.copy()
contb_vs['labels'] = pd.cut(contb_vs2['contb_receipt_amt'],bins=labels)
contb_vs2.loc[:,'label'] = pd.cut(contb_vs2['contb_receipt_amt'],bins=labels)
d:\python3.6.8\lib\site-packages\pandas\core\indexing.py:376: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[key] = _infer_fill_value(value)
d:\python3.6.8\lib\site-packages\pandas\core\indexing.py:494: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[item] = s
contb_vs2.drop('label',axis= 1,inplace=True)
d:\python3.6.8\lib\site-packages\pandas\core\frame.py:4117: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
errors=errors,
contb_vs2.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | party | label | |
---|---|---|---|---|---|---|---|---|---|
411 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 25.0 | 01-FEB-12 | Republican | (10, 100] |
412 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 110.0 | 01-FEB-12 | Republican | (100, 1000] |
413 | Romney, Mitt | CARLSEN, RICHARD | AE | DEFENSE INTELLIGENCE AGENCY | INTELLIGENCE ANALYST | 250.0 | 13-APR-12 | Republican | (100, 1000] |
414 | Romney, Mitt | DELUCA, PIERRE | AE | CISCO | ENGINEER | 30.0 | 21-AUG-11 | Republican | (10, 100] |
415 | Romney, Mitt | SARGENT, MICHAEL | AE | RAYTHEON TECHNICAL SERVICES CORP | COMPUTER SYSTEMS ENGINEER | 100.0 | 07-MAR-12 | Republican | (10, 100] |
contb_vs.drop('labels',axis = 1,inplace=True)
3. 数据聚合与分组运算
3.1 透视表(pivot_table)分析党派和职业
# excel 中透视表功能
# sql 中处理数据时,也会用到透视表
# pandas提供方法了,显示透视表
按照党派、职业对赞助金额进行汇总,类似excel中的透视表操作,聚合函数为sum
cont
# 透视表和分组聚合一模一样
pt = pd.pivot_table(contb_vs,values=['contb_receipt_amt'],
index = ['contbr_occupation'],columns=['party'],aggfunc='sum',
fill_value=0)
pt.isnull().sum()
party
contb_receipt_amt Democrat 4435
Republican 26779
dtype: int64
pt
contb_receipt_amt | ||
---|---|---|
party | Democrat | Republican |
contbr_occupation | ||
MIXED-MEDIA ARTIST / STORYTELLER | 100.0 | 0.0 |
AREA VICE PRESIDENT | 250.0 | 0.0 |
RESEARCH ASSOCIATE | 100.0 | 0.0 |
TEACHER | 500.0 | 0.0 |
THERAPIST | 3900.0 | 0.0 |
... | ... | ... |
ZEPPOS AND ASSOCIATES | 1000.0 | 0.0 |
ZONE MANAGER | 135.0 | 0.0 |
ZOOKEEPER | 35.0 | 0.0 |
ZOOLOGIST | 400.0 | 0.0 |
ZOOLOGY EDUCATION | 25.0 | 0.0 |
33599 rows × 2 columns
contb_vs.groupby(['contbr_occupation','party'])['contb_receipt_amt'].sum().unstack()
party | Democrat | Republican |
---|---|---|
contbr_occupation | ||
MIXED-MEDIA ARTIST / STORYTELLER | 100.0 | NaN |
AREA VICE PRESIDENT | 250.0 | NaN |
RESEARCH ASSOCIATE | 100.0 | NaN |
TEACHER | 500.0 | NaN |
THERAPIST | 3900.0 | NaN |
... | ... | ... |
ZEPPOS AND ASSOCIATES | 1000.0 | NaN |
ZONE MANAGER | 135.0 | NaN |
ZOOKEEPER | 35.0 | NaN |
ZOOLOGIST | 400.0 | NaN |
ZOOLOGY EDUCATION | 25.0 | NaN |
33599 rows × 2 columns
过滤掉赞助金额小于200W的数据
cond = pt.sum(axis = 1) > 2000000
pt_large = pt[cond]
pt_large
contb_receipt_amt | ||
---|---|---|
party | Democrat | Republican |
contbr_occupation | ||
ATTORNEY | 14302461.84 | 5372424.02 |
CEO | 2074974.79 | 2898458.31 |
CONSULTANT | 2459912.71 | 1424894.01 |
EXECUTIVE | 1355161.05 | 2300947.03 |
HOMEMAKER | 4248875.80 | 8147446.22 |
INVESTOR | 884133.00 | 1537595.12 |
NOT PROVIDE | 13725187.32 | 11638509.84 |
PHYSICIAN | 3735124.94 | 1368023.96 |
PRESIDENT | 1878509.95 | 2491244.89 |
PROFESSOR | 2165071.08 | 161362.12 |
RETIRED | 25305316.38 | 11508473.59 |
index = pt_large.sum(axis = 1).sort_values().index
pt_large = pt_large.loc[index[::-1]]
绘制图形
pt_large.plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a997df5ac8>
pt_large.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9972d58d0>
3.2 分组级运算和转换
根据职业与雇主信息分别分组运算捐献金额总数
由于职业和雇主的处理非常相似,我们定义函数get_top_amounts()对两个字段进行分析处理
occ_result = contb_vs.groupby(['contbr_occupation','cand_nm'])['contb_receipt_amt'].sum().unstack()
occ_result.fillna(0,inplace = True)
occ_result
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_occupation | ||
MIXED-MEDIA ARTIST / STORYTELLER | 100.0 | 0.0 |
AREA VICE PRESIDENT | 250.0 | 0.0 |
RESEARCH ASSOCIATE | 100.0 | 0.0 |
TEACHER | 500.0 | 0.0 |
THERAPIST | 3900.0 | 0.0 |
... | ... | ... |
ZEPPOS AND ASSOCIATES | 1000.0 | 0.0 |
ZONE MANAGER | 135.0 | 0.0 |
ZOOKEEPER | 35.0 | 0.0 |
ZOOLOGIST | 400.0 | 0.0 |
ZOOLOGY EDUCATION | 25.0 | 0.0 |
33599 rows × 2 columns
occ_result['Obama, Barack'].sort_values(ascending = False)[:20]
contbr_occupation
RETIRED 25305316.38
ATTORNEY 14302461.84
NOT PROVIDE 13725187.32
HOMEMAKER 4248875.80
PHYSICIAN 3735124.94
CONSULTANT 2459912.71
PROFESSOR 2165071.08
CEO 2074974.79
PRESIDENT 1878509.95
NOT EMPLOYED 1709188.20
EXECUTIVE 1355161.05
TEACHER 1250969.15
WRITER 1084188.88
OWNER 1001567.36
ENGINEER 951525.55
INVESTOR 884133.00
ARTIST 763125.00
MANAGER 762883.22
SELF-EMPLOYED 672393.40
STUDENT 628099.75
Name: Obama, Barack, dtype: float64
从数据可以看出,Obama更受精英群体(律师、医生、咨询顾问)的欢迎,Romney则得到更多企业家或企业高管的支持
occ_result['Romney, Mitt'].sort_values(ascending = False)[:20]
contbr_occupation
NOT PROVIDE 11638509.84
RETIRED 11508473.59
HOMEMAKER 8147446.22
ATTORNEY 5372424.02
CEO 2898458.31
PRESIDENT 2491244.89
EXECUTIVE 2300947.03
INVESTOR 1537595.12
CONSULTANT 1424894.01
PHYSICIAN 1368023.96
FINANCE 1143592.25
SELF-EMPLOYED 1092538.44
REAL ESTATE 1054801.00
BANKER 1009186.24
OWNER 875186.24
CHAIRMAN 805131.75
INVESTMENT BANKER 661035.00
INVESTMENTS 634922.00
BUSINESS OWNER 617389.86
MANAGER 577924.94
Name: Romney, Mitt, dtype: float64
同样的,使用get_top_amounts()对雇主进行分析处理
contb_vs.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | party | label | |
---|---|---|---|---|---|---|---|---|---|
411 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 25.0 | 01-FEB-12 | Republican | (10, 100] |
412 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 110.0 | 01-FEB-12 | Republican | (100, 1000] |
413 | Romney, Mitt | CARLSEN, RICHARD | AE | DEFENSE INTELLIGENCE AGENCY | INTELLIGENCE ANALYST | 250.0 | 13-APR-12 | Republican | (100, 1000] |
414 | Romney, Mitt | DELUCA, PIERRE | AE | CISCO | ENGINEER | 30.0 | 21-AUG-11 | Republican | (10, 100] |
415 | Romney, Mitt | SARGENT, MICHAEL | AE | RAYTHEON TECHNICAL SERVICES CORP | COMPUTER SYSTEMS ENGINEER | 100.0 | 07-MAR-12 | Republican | (10, 100] |
compy_result = contb_vs.groupby(['contbr_employer','cand_nm'])['contb_receipt_amt'].sum().unstack()
compy_result.fillna(0,inplace = True)
compy_result['Obama, Barack'].sort_values(ascending = False)[:20].plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c64d0908>
compy_result['Romney, Mitt'].sort_values(ascending = False)[:20].plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c64115c0>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QkgQlUxC-1574946989346)(output_97_1.png)]
Obama:微软、盛德国际律师事务所; Romney:瑞士瑞信银行、摩根斯坦利、高盛公司、巴克莱资本、H.I.G.资本
对赞助金额进行分组分析(matplotlib画图)
前面我们已经利用pd.cut()函数,根据出资额大小将数据离散化到多个面元中,接下来我们就要对每个离散化的面元进行分组分析
首先统计各出资区间的赞助笔数,这里用到unstack(),stack()函数是堆叠,unstack()函数就是不要堆叠,即把多层索引变为表格数据
接下来,我们再统计各区间的赞助金额
contb_vs.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | party | label | |
---|---|---|---|---|---|---|---|---|---|
411 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 25.0 | 01-FEB-12 | Republican | (10, 100] |
412 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 110.0 | 01-FEB-12 | Republican | (100, 1000] |
413 | Romney, Mitt | CARLSEN, RICHARD | AE | DEFENSE INTELLIGENCE AGENCY | INTELLIGENCE ANALYST | 250.0 | 13-APR-12 | Republican | (100, 1000] |
414 | Romney, Mitt | DELUCA, PIERRE | AE | CISCO | ENGINEER | 30.0 | 21-AUG-11 | Republican | (10, 100] |
415 | Romney, Mitt | SARGENT, MICHAEL | AE | RAYTHEON TECHNICAL SERVICES CORP | COMPUTER SYSTEMS ENGINEER | 100.0 | 07-MAR-12 | Republican | (10, 100] |
contb_count = contb_vs.groupby(['label','cand_nm'])['party'].count().unstack()
contb_count.fillna(0,inplace = True)
contb_count
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
label | ||
(0, 10] | 40563.0 | 3758.0 |
(10, 100] | 372280.0 | 31853.0 |
(100, 1000] | 153992.0 | 43357.0 |
(1000, 10000] | 22284.0 | 26186.0 |
(10000, 100000] | 2.0 | 1.0 |
(100000, 1000000] | 3.0 | 0.0 |
(1000000, 10000000] | 4.0 | 0.0 |
contb_count.plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c65c47b8>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-BTwQPKtc-1574946989347)(output_107_1.png)]
contb_count.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c69846d8>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-u5MloaGv-1574946989350)(output_108_1.png)]
contb_count
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
label | ||
(0, 10] | 40563.0 | 3758.0 |
(10, 100] | 372280.0 | 31853.0 |
(100, 1000] | 153992.0 | 43357.0 |
(1000, 10000] | 22284.0 | 26186.0 |
(10000, 100000] | 2.0 | 1.0 |
(100000, 1000000] | 3.0 | 0.0 |
(1000000, 10000000] | 4.0 | 0.0 |
# sql 中,进行这样操作,难度很大
count = (contb_count.sum(axis = 1))
count
label
(0, 10] 44321.0
(10, 100] 404133.0
(100, 1000] 197349.0
(1000, 10000] 48470.0
(10000, 100000] 3.0
(100000, 1000000] 3.0
(1000000, 10000000] 4.0
dtype: float64
for i,c in enumerate(count):
contb_count.iloc[i] = contb_count.iloc[i]/c
contb_count.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c6984cf8>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-A1tA8Oim-1574946989351)(output_112_1.png)]
可以看出,小额赞助方面,Obama获得的数量和金额比Romney多得多
4.时间处理
4.1 str转datetime
contb_vs.head()
cand_nm | contbr_nm | contbr_st | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | party | label | |
---|---|---|---|---|---|---|---|---|---|
411 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 25.0 | 01-FEB-12 | Republican | (10, 100] |
412 | Romney, Mitt | ELDERBAUM, WILLIAM | AA | US GOVERNMENT | FOREIGN SERVICE OFFICER | 110.0 | 01-FEB-12 | Republican | (100, 1000] |
413 | Romney, Mitt | CARLSEN, RICHARD | AE | DEFENSE INTELLIGENCE AGENCY | INTELLIGENCE ANALYST | 250.0 | 13-APR-12 | Republican | (100, 1000] |
414 | Romney, Mitt | DELUCA, PIERRE | AE | CISCO | ENGINEER | 30.0 | 21-AUG-11 | Republican | (10, 100] |
415 | Romney, Mitt | SARGENT, MICHAEL | AE | RAYTHEON TECHNICAL SERVICES CORP | COMPUTER SYSTEMS ENGINEER | 100.0 | 07-MAR-12 | Republican | (10, 100] |
contb_vs.dtypes
cand_nm object
contbr_nm object
contbr_st object
contbr_employer object
contbr_occupation object
contb_receipt_amt float64
contb_receipt_dt object
party object
label category
dtype: object
contb_vs['contb_receipt_dt'] = pd.to_datetime(contb_vs['contb_receipt_dt'])
contb_vs.dtypes
cand_nm object
contbr_nm object
contbr_st object
contbr_employer object
contbr_occupation object
contb_receipt_amt float64
contb_receipt_dt datetime64[ns]
party object
label category
dtype: object
4.2 以时间作为索引
contb_vs.set_index('contb_receipt_dt',inplace=True)
contb_vs.sort_index(inplace=True)
4.3重采样和频度转换
重采样(Resampling)指的是把时间序列的频度变为另一个频度的过程。把高频度的数据变为低频度叫做降采样(downsampling),resample会对数据进行分组,然后再调用聚合函数。这里我们把频率从每日转换为每月,属于高频转低频的降采样
contb_vs.resample('M')['contb_receipt_amt'].sum()
contb_receipt_amt | |
---|---|
contb_receipt_dt | |
2011-04-30 | 10282228.92 |
2011-05-31 | 13426593.92 |
2011-06-30 | 20552077.13 |
2011-07-31 | 5932074.56 |
2011-08-31 | 7097173.03 |
2011-09-30 | 22826839.44 |
2011-10-31 | 9585231.97 |
2011-11-30 | 12200548.56 |
2011-12-31 | 24986600.28 |
2012-01-31 | 13536133.55 |
2012-02-29 | 26336337.17 |
2012-03-31 | 31926272.05 |
2012-04-30 | 25525424.19 |
ret = contb_vs.groupby(['cand_nm']).resample("M")['contb_receipt_amt'].sum().unstack(level = 0)
ret.plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c685e9b0>
ret = contb_vs.groupby(['cand_nm']).resample("M")['contb_receipt_amt'].sum().unstack(level = 0)
ret
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contb_receipt_dt | ||
2011-04-30 | 8246635.79 | 2035593.13 |
2011-05-31 | 6570013.79 | 6856580.13 |
2011-06-30 | 12159624.88 | 8392452.25 |
2011-07-31 | 3557892.31 | 2374182.25 |
2011-08-31 | 4685825.79 | 2411347.24 |
2011-09-30 | 15304567.88 | 7522271.56 |
2011-10-31 | 5945068.04 | 3640163.93 |
2011-11-30 | 5194383.31 | 7006165.25 |
2011-12-31 | 13571341.82 | 11415258.46 |
2012-01-31 | 8167000.77 | 5369132.78 |
2012-02-29 | 15858199.16 | 10478138.01 |
2012-03-31 | 20806093.40 | 11120178.65 |
2012-04-30 | 15810980.30 | 9714443.89 |
ret.cumsum().plot(kind = 'bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c6d44518>
ret.cumsum().plot(kind = 'line')
<matplotlib.axes._subplots.AxesSubplot at 0x2a9c6ca8278>
5.各州支持率
分组聚合计算奥巴马、罗姆尼在各州支持率
# 分组聚合进行运算,求各州,对奥巴马和罗姆尼支持情况,计算捐献金额
st = contb_vs.groupby(['cand_nm','contbr_st'])['contb_receipt_amt'].sum().unstack(level = 0)
st.fillna(0,inplace = True)
# 68个州,18个州,可以删除
# 献金者,填表,填错了
st
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AA | 56405.00 | 135.00 |
AB | 2048.00 | 0.00 |
AE | 42973.75 | 5680.00 |
AK | 281840.15 | 86204.24 |
AL | 543123.48 | 527303.51 |
... | ... | ... |
WI | 1130155.46 | 270316.32 |
WV | 169154.47 | 126725.12 |
WY | 194046.74 | 252595.84 |
XX | 0.00 | 400250.00 |
ZZ | 5963.00 | 0.00 |
68 rows × 2 columns
# 将美国有哪些州,将州缩写获取
abbr = pd.read_csv('./state-abbrevs.csv')
sx = abbr['abbreviation']
sx
0 AL
1 AK
2 AZ
3 AR
4 CA
5 CO
6 CT
7 DE
8 DC
9 FL
10 GA
11 HI
12 ID
13 IL
14 IN
15 IA
16 KS
17 KY
18 LA
19 ME
20 MT
21 NE
22 NV
23 NH
24 NJ
25 NM
26 NY
27 NC
28 ND
29 OH
30 OK
31 OR
32 MD
33 MA
34 MI
35 MN
36 MS
37 MO
38 PA
39 RI
40 SC
41 SD
42 TN
43 TX
44 UT
45 VT
46 VA
47 WA
48 WV
49 WI
50 WY
Name: abbreviation, dtype: object
index = st.index
s = Series(index)
# cond过滤条件
cond = s.isin(sx.to_numpy())
# 索引自然数,替换索引
cond.index = st.index
# 根据条件进行过滤
state_support = st[cond]
state_support
cand_nm | Obama, Barack | Romney, Mitt |
---|---|---|
contbr_st | ||
AK | 281840.15 | 86204.24 |
AL | 543123.48 | 527303.51 |
AR | 359247.28 | 105556.00 |
AZ | 1506476.98 | 1888436.23 |
CA | 23824984.24 | 11237636.60 |
CO | 2132429.49 | 1506714.12 |
CT | 2068291.26 | 3499475.45 |
DC | 4373538.80 | 1025137.50 |
DE | 336669.14 | 82712.00 |
FL | 7318178.58 | 8338458.81 |
GA | 2786399.49 | 1995725.59 |
HI | 795212.64 | 111763.00 |
IA | 584829.10 | 208537.49 |
ID | 197538.06 | 787158.44 |
IL | 16443895.84 | 3628571.53 |
IN | 883691.81 | 542086.23 |
KS | 448038.57 | 326633.87 |
KY | 714954.32 | 666902.59 |
LA | 548013.54 | 991236.60 |
MA | 6649015.25 | 4710542.30 |
MD | 4832663.93 | 1633690.40 |
ME | 1167760.12 | 117151.84 |
MI | 2570307.25 | 2448109.87 |
MN | 1744387.14 | 293656.32 |
MO | 1320780.71 | 1371332.90 |
MS | 195197.17 | 330183.00 |
MT | 300225.24 | 161629.00 |
NC | 2357067.63 | 1273603.92 |
ND | 58999.90 | 39392.00 |
NE | 251311.97 | 178600.50 |
NH | 616994.85 | 424839.11 |
NJ | 3203257.93 | 3333320.20 |
NM | 906162.36 | 168011.36 |
NV | 710693.67 | 630049.49 |
NY | 14652118.51 | 10184212.63 |
OH | 1822728.83 | 1901560.66 |
OK | 594342.52 | 839483.96 |
OR | 1222942.39 | 526237.73 |
PA | 3713426.98 | 1941519.43 |
RI | 457352.22 | 187892.92 |
SC | 630732.94 | 402742.86 |
SD | 86530.76 | 166558.00 |
TN | 1119315.02 | 1516918.04 |
TX | 6570832.45 | 6221989.68 |
UT | 519851.37 | 3717300.48 |
VA | 4259977.19 | 3465765.85 |
VT | 986510.59 | 55229.44 |
WA | 4250933.16 | 1341521.56 |
WI | 1130155.46 | 270316.32 |
WV | 169154.47 | 126725.12 |
WY | 194046.74 | 252595.84 |
state_support.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9cfc56518>
df = DataFrame(state_support.sum(axis = 1))
df_sum = df.join(df,lsuffix = '_l',rsuffix = '_r')
df_sum.columns = ['Obama, Barack','Romney, Mitt']
df_sum
Obama, Barack | Romney, Mitt | |
---|---|---|
contbr_st | ||
AK | 368044.39 | 368044.39 |
AL | 1070426.99 | 1070426.99 |
AR | 464803.28 | 464803.28 |
AZ | 3394913.21 | 3394913.21 |
CA | 35062620.84 | 35062620.84 |
CO | 3639143.61 | 3639143.61 |
CT | 5567766.71 | 5567766.71 |
DC | 5398676.30 | 5398676.30 |
DE | 419381.14 | 419381.14 |
FL | 15656637.39 | 15656637.39 |
GA | 4782125.08 | 4782125.08 |
HI | 906975.64 | 906975.64 |
IA | 793366.59 | 793366.59 |
ID | 984696.50 | 984696.50 |
IL | 20072467.37 | 20072467.37 |
IN | 1425778.04 | 1425778.04 |
KS | 774672.44 | 774672.44 |
KY | 1381856.91 | 1381856.91 |
LA | 1539250.14 | 1539250.14 |
MA | 11359557.55 | 11359557.55 |
MD | 6466354.33 | 6466354.33 |
ME | 1284911.96 | 1284911.96 |
MI | 5018417.12 | 5018417.12 |
MN | 2038043.46 | 2038043.46 |
MO | 2692113.61 | 2692113.61 |
MS | 525380.17 | 525380.17 |
MT | 461854.24 | 461854.24 |
NC | 3630671.55 | 3630671.55 |
ND | 98391.90 | 98391.90 |
NE | 429912.47 | 429912.47 |
NH | 1041833.96 | 1041833.96 |
NJ | 6536578.13 | 6536578.13 |
NM | 1074173.72 | 1074173.72 |
NV | 1340743.16 | 1340743.16 |
NY | 24836331.14 | 24836331.14 |
OH | 3724289.49 | 3724289.49 |
OK | 1433826.48 | 1433826.48 |
OR | 1749180.12 | 1749180.12 |
PA | 5654946.41 | 5654946.41 |
RI | 645245.14 | 645245.14 |
SC | 1033475.80 | 1033475.80 |
SD | 253088.76 | 253088.76 |
TN | 2636233.06 | 2636233.06 |
TX | 12792822.13 | 12792822.13 |
UT | 4237151.85 | 4237151.85 |
VA | 7725743.04 | 7725743.04 |
VT | 1041740.03 | 1041740.03 |
WA | 5592454.72 | 5592454.72 |
WI | 1400471.78 | 1400471.78 |
WV | 295879.59 | 295879.59 |
WY | 446642.58 | 446642.58 |
state_precent = state_support/df_sum
state_precent.plot(kind = 'bar',stacked = True)
<matplotlib.axes._subplots.AxesSubplot at 0x2a9cfa766a0>
条形图显示对比情况
# basemape 工具包,pip install basemap
# basemape绘制地图,pip install 无法安装成功
# 下载:https://www.lfd.uci.edu/~gohlke/pythonlibs/
# 州全称对应缩写缩写
abbr = {'Commonwealth of Kentucky':'KY','Commonwealth of Massachusetts':'MA','Commonwealth of Pennsylvania':'PA',
'State of Rhode Island and Providence Plantations':'RI'}
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.basemap import Basemap
# hex16进制表示的颜色
from matplotlib.colors import rgb2hex
from matplotlib.patches import Polygon
'''llcrnrlon 所需地图域(度)的左下角经度。
llcrnrlat 所需地图域左下角的纬度(度)。
urcrnrlon 所需地图域(度)的右上角经度。
urcrnrlat 所需地图域右上角的纬度(度)。
'''
plt.figure(figsize=(12,9))
m = Basemap(llcrnrlon = -122,
llcrnrlat = 23.41,
urcrnrlon = -64,
urcrnrlat = 45,
projection = 'lcc',
lat_1 = 30,
lon_0 = -100
)
m.drawcoastlines(linewidth=1.5)
m.drawcountries(linewidth = 1.5)
# 地图读取了美国地图形状,m中就有各州的形状,数据
m.readshapefile('./USA/gadm36_USA_1',name = 'states')
colors = []
states = []
cmap = plt.cm.Reds
for shapeinfo in m.states_info:
a = shapeinfo['VARNAME_1']
# 州的缩写
s = a.split('|')[0]
try:
rate = oba[s]
colors.append(cmap(rate))
states.append(s)
except:
colors.append(cmap(oba[abbr[s]]))
states.append(s)
# 州填充颜色
# seg州中的一部分区域,多边形
ax = plt.gca()
for n,seg in enumerate(m.states):
c = rgb2hex(colors[n])
poly = Polygon(seg,color = c )
ax.add_patch(poly)
plt.savefig('./usa_states_rate.jpg')
plt.show()