import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
1.数据载入和总览
- 1.1数据载入
election = pd.read_csv('data2/usa_election.txt',low_memory=False)
election.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NaN | NaN | NaN | SA17A | 749073 |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253 | NONE | RETIRED | 250.0 | 01-AUG-11 | NaN | NaN | NaN | SA17A | 749073 |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467 | NONE | RETIRED | 300.0 | 20-JUN-11 | NaN | NaN | NaN | SA17A | 736166 |
- 1.2数据预览和基本统计分析
election.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536041 entries, 0 to 536040
Data columns (total 16 columns):
cmte_id 536041 non-null object
cand_id 536041 non-null object
cand_nm 536041 non-null object
contbr_nm 536041 non-null object
contbr_city 536026 non-null object
contbr_st 536040 non-null object
contbr_zip 535973 non-null object
contbr_employer 525088 non-null object
contbr_occupation 530520 non-null object
contb_receipt_amt 536041 non-null float64
contb_receipt_dt 536041 non-null object
receipt_desc 8479 non-null object
memo_cd 49718 non-null object
memo_text 52740 non-null object
form_tp 536041 non-null object
file_num 536041 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 36.8+ MB
election.describe()
contb_receipt_amt | file_num | |
---|---|---|
count | 5.360410e+05 | 536041.000000 |
mean | 3.750373e+02 | 761472.107800 |
std | 3.564436e+03 | 5148.893508 |
min | -3.080000e+04 | 723511.000000 |
25% | 5.000000e+01 | 756218.000000 |
50% | 1.000000e+02 | 763233.000000 |
75% | 2.500000e+02 | 763621.000000 |
max | 1.944042e+06 | 767394.000000 |
2.数据清洗
- 2.1缺失值处理
- 填充:NOT PROVIDE
# 查看哪些列存在NAN值
election.isna().any(axis=0)
cmte_id False
cand_id False
cand_nm False
contbr_nm False
contbr_city True
contbr_st True
contbr_zip True
contbr_employer True
contbr_occupation True
contb_receipt_amt False
contb_receipt_dt False
receipt_desc True
memo_cd True
memo_text True
form_tp False
file_num False
dtype: bool
# 把NAN值进行填充
election.fillna('NOT PROVIDE',inplace=True)
election.isna().any(axis=0)
cmte_id False
cand_id False
cand_nm False
contbr_nm False
contbr_city False
contbr_st False
contbr_zip False
contbr_employer False
contbr_occupation False
contb_receipt_amt False
contb_receipt_dt False
receipt_desc False
memo_cd False
memo_text False
form_tp False
file_num False
dtype: bool
- 2.2数据转换
- 利用字典映射进行转换:党派分析
#通过搜索引擎等途径,获取到每个总统候选人的所属党派,建立字典parties,候选人名字作为键,所属党派作为对应的值"
parties = {'Bachmann, Michelle': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'Huntsman, Jon': 'Republican',
'Johnson, Gary Earl': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Obama, Barack': 'Democrat',
'Paul, Ron': 'Republican',
'Pawlenty, Timothy': 'Republican',
'Perry, Rick': 'Republican',
"Roemer, Charles E. 'Buddy' III": 'Republican',
'Romney, Mitt': 'Republican',
'Santorum, Rick': 'Republican'}
election.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 749073 |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253 | NONE | RETIRED | 250.0 | 01-AUG-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 749073 |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467 | NONE | RETIRED | 300.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 |
election.dtypes
cmte_id object
cand_id object
cand_nm object
contbr_nm object
contbr_city object
contbr_st object
contbr_zip object
contbr_employer object
contbr_occupation object
contb_receipt_amt float64
contb_receipt_dt object
receipt_desc object
memo_cd object
memo_text object
form_tp object
file_num int64
dtype: object
- map方法添加party属性
election['party'] = election['cand_nm'].map(parties)
election.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 749073 | Republican |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253 | NONE | RETIRED | 250.0 | 01-AUG-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 749073 | Republican |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467 | NONE | RETIRED | 300.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
- value_counts统计各党派总捐频次
# 候选人党派信息
election['cand_nm'].unique()
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
election['party'].value_counts()
Democrat 292400
Republican 243641
Name: party, dtype: int64
-
排序:按照职业的汇总对赞助总金额进行排序
-
按照职位进行汇总,计算赞助总金额,展示前50项,发现不少职业是相同的,只不过是表达不一样而已,如C.E.O与CEO,都是一个职业
# 查看岗位
election['contbr_occupation'].unique()
array(['RETIRED', 'INFORMATION REQUESTED', 'RN', ...,
'SECRETARY TO VICE PRESIDENT', 'PRESIDENT/TREASURER',
'LONGWALL MAINTENANCE FOREMAN'], dtype=object)
election['contbr_occupation'].value_counts()
RETIRED 117973
INFORMATION REQUESTED 23439
ATTORNEY 19943
HOMEMAKER 15756
PHYSICIAN 13171
...
POLYMER SCIENTIST 1
TAX PARTNER / CPA 1
STARTUP EDUCATION 1
C.P.A./SHAREHOLDER 1
INTERACTIVE PRODUCER 1
Name: contbr_occupation, Length: 29620, dtype: int64
# C.E.O与CEO,做映射处理,改成一样的
f = lambda item: 'CEO' if 'CEO' in item or item.find('C.E.O')>-1 else item
election['contbr_occupation'] = election['contbr_occupation'].map(f)
election['contbr_occupation'].value_counts()
RETIRED 117973
INFORMATION REQUESTED 23439
ATTORNEY 19943
HOMEMAKER 15756
PHYSICIAN 13171
...
HOMEAKER 1
OIL/GAS 1
TITLE INSURANCE SALES 1
EMS HELICOPTER MECHANIC 1
INTERACTIVE PRODUCER 1
Name: contbr_occupation, Length: 29201, dtype: int64
# 按照岗位进行分组
election.groupby('contbr_occupation')['contb_receipt_amt'].sum()
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER 100.00
AREA VICE PRESIDENT 250.00
RESEARCH ASSOCIATE 100.00
TEACHER 500.00
THERAPIST 3900.00
...
ZIMMERMANS DAIRY 83.71
ZOMBIE SLAYER 1556.00
ZOOLOGIST 100.00
\NONE\ 250.00
~ 75.00
Name: contb_receipt_amt, Length: 29201, dtype: float64
# 分完组排序
election.groupby('contbr_occupation')['contb_receipt_amt'].sum().sort_values()
contbr_occupation
METAL SMITH -4250.00
PRES OF GAS & ELECTRIC -2500.00
AVIATION ATTORNEY -2500.00
MFG. REP -50.00
GOVERNMENT MEMBER 0.00
...
CEO 7310417.46
INFORMATION REQUESTED PER BEST EFFORTS 9832750.00
HOMEMAKER 12410217.32
ATTORNEY 12580812.94
RETIRED 29865435.00
Name: contb_receipt_amt, Length: 29201, dtype: float64
# 去掉里面负值排序
amt_sum = election.groupby('contbr_occupation')['contb_receipt_amt'].sum()
amt_sum.sort_values(ascending=False)
contbr_occupation
RETIRED 29865435.00
ATTORNEY 12580812.94
HOMEMAKER 12410217.32
INFORMATION REQUESTED PER BEST EFFORTS 9832750.00
CEO 7310417.46
...
GOVERNMENT MEMBER 0.00
MFG. REP -50.00
AVIATION ATTORNEY -2500.00
PRES OF GAS & ELECTRIC -2500.00
METAL SMITH -4250.00
Name: contb_receipt_amt, Length: 29201, dtype: float64
# 取前面五十个
amt_sum.sort_values(ascending=False)[:50]
contbr_occupation
RETIRED 29865435.00
ATTORNEY 12580812.94
HOMEMAKER 12410217.32
INFORMATION REQUESTED PER BEST EFFORTS 9832750.00
CEO 7310417.46
INFORMATION REQUESTED 6652373.48
PHYSICIAN 4732676.66
PRESIDENT 4663978.28
EXECUTIVE 3897020.71
CONSULTANT 3445289.59
INVESTOR 2499492.16
OWNER 2397779.75
LAWYER 2305835.79
ENGINEER 1743040.13
REAL ESTATE 1668360.55
PROFESSOR 1524503.64
NOT PROVIDE 1456138.20
MANAGER 1429909.01
SELF-EMPLOYED 1374817.30
CHAIRMAN 1339631.37
FINANCE 1330261.88
SALES 1225337.97
BUSINESS OWNER 1179904.84
STUDENT 1155684.17
BANKER 1133177.68
NOT EMPLOYED 1065220.57
TEACHER 984944.53
PARTNER 933350.51
INVESTMENTS 884374.03
VICE PRESIDENT 861050.63
WRITER 843070.91
INVESTMENT BANKER 812166.20
MANAGING DIRECTOR 733055.32
DIRECTOR 583423.44
ARTIST 570839.79
ACCOUNTANT 568414.18
SOFTWARE ENGINEER 559254.70
DENTIST 494070.23
PRIVATE EQUITY 479509.00
FARMER 469295.72
INVESTMENT MANAGER 468120.80
REAL ESTATE DEVELOPER 458958.00
INVESTMENT MANAGEMENT 443547.52
FINANCIAL ADVISOR 436870.72
PRINCIPAL 432636.32
ARCHITECT 416997.09
CPA 402906.76
MANAGING PARTNER 399183.80
C.P.A. 388737.11
ENTREPRENEUR 383773.62
Name: contb_receipt_amt, dtype: float64
-
利用函数进行数据转换:职业与雇主信息分析
-
建立一个职业对应字典,把相同职业的不同表达映射为对应的职业,比如把C.E.O映射CEO
-
2.3 数据筛选
-
赞助金额筛选,赞助金额大于0
amt_election = election.query('contb_receipt_amt >0')
amt_election.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 749073 | Republican |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253 | NONE | RETIRED | 250.0 | 01-AUG-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 749073 | Republican |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467 | NONE | RETIRED | 300.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
- 查看候选人获得的赞助总金额
amt_election.groupby('cand_nm')['contb_receipt_amt'].sum()
cand_nm
Bachmann, Michelle 2.639510e+06
Cain, Herman 7.058163e+06
Gingrich, Newt 9.895836e+06
Huntsman, Jon 3.324373e+06
Johnson, Gary Earl 4.132769e+05
McCotter, Thaddeus G 3.903000e+04
Obama, Barack 8.259441e+07
Paul, Ron 1.559408e+07
Pawlenty, Timothy 5.978124e+06
Perry, Rick 2.020167e+07
Roemer, Charles E. 'Buddy' III 3.429658e+05
Romney, Mitt 5.702315e+07
Santorum, Rick 3.364115e+06
Name: contb_receipt_amt, dtype: float64
# 单位划分到一百万
amt_election.groupby('cand_nm')['contb_receipt_amt'].sum() // 1000000
cand_nm
Bachmann, Michelle 2.0
Cain, Herman 7.0
Gingrich, Newt 9.0
Huntsman, Jon 3.0
Johnson, Gary Earl 0.0
McCotter, Thaddeus G 0.0
Obama, Barack 82.0
Paul, Ron 15.0
Pawlenty, Timothy 5.0
Perry, Rick 20.0
Roemer, Charles E. 'Buddy' III 0.0
Romney, Mitt 57.0
Santorum, Rick 3.0
Name: contb_receipt_amt, dtype: float64
# 排序
s1 =(amt_election.groupby('cand_nm')['contb_receipt_amt'].sum() // 1000000).sort_values()
s1
cand_nm
Johnson, Gary Earl 0.0
McCotter, Thaddeus G 0.0
Roemer, Charles E. 'Buddy' III 0.0
Bachmann, Michelle 2.0
Huntsman, Jon 3.0
Santorum, Rick 3.0
Pawlenty, Timothy 5.0
Cain, Herman 7.0
Gingrich, Newt 9.0
Paul, Ron 15.0
Perry, Rick 20.0
Romney, Mitt 57.0
Obama, Barack 82.0
Name: contb_receipt_amt, dtype: float64
- 饼图绘制各候选人政治献金情况
s1.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7fe8370>
- 筛选选取候选人为Obama、Romney的子集数据,多种方式
amt_election
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 250.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
1 | C00410118 | P20002978 | Bachmann, Michelle | HARVEY, WILLIAM | MOBILE | AL | 366010290 | RETIRED | RETIRED | 50.0 | 23-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
2 | C00410118 | P20002978 | Bachmann, Michelle | SMITH, LANIER | LANETT | AL | 368633403 | INFORMATION REQUESTED | INFORMATION REQUESTED | 250.0 | 05-JUL-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 749073 | Republican |
3 | C00410118 | P20002978 | Bachmann, Michelle | BLEVINS, DARONDA | PIGGOTT | AR | 724548253 | NONE | RETIRED | 250.0 | 01-AUG-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 749073 | Republican |
4 | C00410118 | P20002978 | Bachmann, Michelle | WARDENBURG, HAROLD | HOT SPRINGS NATION | AR | 719016467 | NONE | RETIRED | 300.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
536036 | C00500587 | P20003281 | Perry, Rick | ANDERSON, MARILEE MRS. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 2500.0 | 31-AUG-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 751678 | Republican |
536037 | C00500587 | P20003281 | Perry, Rick | TOLBERT, DARYL MR. | INFO REQUESTED | XX | 99999 | T.A.C.C. | LONGWALL MAINTENANCE FOREMAN | 500.0 | 30-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 751678 | Republican |
536038 | C00500587 | P20003281 | Perry, Rick | GRANE, BRYAN F. MR. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 500.0 | 29-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 751678 | Republican |
536039 | C00500587 | P20003281 | Perry, Rick | DUFFY, DAVID A. MR. | INFO REQUESTED | XX | 99999 | DUFFY EQUIPMENT COMPANY INC. | BUSINESS OWNER | 2500.0 | 30-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 751678 | Republican |
536040 | C00500587 | P20003281 | Perry, Rick | GORMAN, CHRIS D. MR. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 5000.0 | 29-SEP-11 | REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... | NOT PROVIDE | REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... | SA17A | 751678 | Republican |
530314 rows × 17 columns
Obama = amt_election.query('cand_nm == "Obama, Barack"')
Obama
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
58829 | C00431445 | P80003338 | Obama, Barack | HILL, HARRY A | AICHI-GUN | AA | NOT PROVIDE | OAK LAWN MARKETING INC. | PRESIDENT | 2500.0 | 11-DEC-11 | NOT PROVIDE | X | * OBAMA VICTORY FUND 2012 | SA18 | 763233 | Democrat |
58831 | C00431445 | P80003338 | Obama, Barack | TORCHIANA, WILLIAM D. | FRANCE | AA | NOT PROVIDE | SULLIVAN & CROMWELL | LAWYER | 2500.0 | 25-NOV-11 | NOT PROVIDE | X | * OBAMA VICTORY FUND 2012 | SA18 | 763233 | Democrat |
58832 | C00431445 | P80003338 | Obama, Barack | BERGMAN, MARK | LONDON | AA | NOT PROVIDE | PAUL WEISS RIFKIND ET AL | ATTORNEY | 2500.0 | 16-DEC-11 | NOT PROVIDE | X | * OBAMA VICTORY FUND 2012 | SA18 | 763233 | Democrat |
58833 | C00431445 | P80003338 | Obama, Barack | BRIDGEWATER, PAMELA | DPO | AA | 340080001 | DEPT. OF STATE | DIPLOMAT | 500.0 | 24-JAN-12 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 766535 | Democrat |
58834 | C00431445 | P80003338 | Obama, Barack | HIGHTOWER, CAROLYN | DPO | AA | 340049997 | DEPARTMENT OF STATE | PUBLIC ADMINISTRATOR | 500.0 | 12-JAN-12 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 766535 | Democrat |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
351224 | C00431445 | P80003338 | Obama, Barack | MOUNTS, ROBERT | APO | ZZ | 962043077 | HQ USFK (FKDC-SA) | GS-15 INTERNATIONAL RELATIONS OFFICER | 25.0 | 26-NOV-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 763233 | Democrat |
351225 | C00431445 | P80003338 | Obama, Barack | MOUNTS, ROBERT | APO | ZZ | 962043077 | HQ USFK (FKDC-SA) | GS-15 INTERNATIONAL RELATIONS OFFICER | 25.0 | 26-JAN-12 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 766535 | Democrat |
351226 | C00431445 | P80003338 | Obama, Barack | JONES, LESLIE | HAMILTON PARISH | ZZ | 0CR04 | SELF-EMPLOYED | ATTORNEY | 2500.0 | 30-DEC-11 | NOT PROVIDE | X | * OBAMA VICTORY FUND 2012 | SA18 | 763233 | Democrat |
351227 | C00431445 | P80003338 | Obama, Barack | JONES, LESLIE | HAMILTON PARISH | ZZ | 0CR04 | SELF-EMPLOYED | ATTORNEY | 2500.0 | 30-DEC-11 | NOT PROVIDE | X | * OBAMA VICTORY FUND 2012 | SA18 | 763233 | Democrat |
351228 | C00431445 | P80003338 | Obama, Barack | MOUNTS, ROBERT | APO | ZZ | 962043077 | HQ USFK (FKDC-SA) | GS-15 INTERNATIONAL RELATIONS OFFICER | 25.0 | 26-OCT-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 763233 | Democrat |
289999 rows × 17 columns
# 另一种取法
romeny = amt_election[amt_election['cand_nm'] == "Romney, Mitt"]
romeny
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
409 | C00431171 | P80003353 | Romney, Mitt | EDWARDS, BRYANT MR. | LONDON UK | AE | 90071 | LATHAM & WATKINS | ATTORNEY | 2500.0 | 20-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 760248 | Republican |
410 | C00431171 | P80003353 | Romney, Mitt | DELUCA, PIERRE | APO | AE | 091280005 | CISCO | ENGINEER | 250.0 | 28-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 760248 | Republican |
411 | C00431171 | P80003353 | Romney, Mitt | DELUCA, PIERRE | APO | AE | 091280005 | CISCO | ENGINEER | 250.0 | 27-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 760248 | Republican |
412 | C00431171 | P80003353 | Romney, Mitt | KAMMER, DONALD | APO | AE | 09267 | US | MILITARY | 500.0 | 31-DEC-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 763507 | Republican |
413 | C00431171 | P80003353 | Romney, Mitt | CLAUDE, FOUKS | PARIS FRANCE | AE | 75011 | PALPATINE | SECRET | 250.0 | 29-DEC-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 763507 | Republican |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58824 | C00431171 | P80003353 | Romney, Mitt | KANUTSIZ, SINAN | INFO REQUESTED | XX | 99999 | K-COMM | OWNER | 500.0 | 23-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 760261 | Republican |
58825 | C00431171 | P80003353 | Romney, Mitt | DUFFEY, JAMES D. MR. JR. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 500.0 | 28-NOV-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 763507 | Republican |
58826 | C00431171 | P80003353 | Romney, Mitt | OSBORN, JOHN E. MR. | INFO REQUESTED | XX | 99999 | INFORMATION REQUESTED PER BEST EFFORTS | INFORMATION REQUESTED PER BEST EFFORTS | 1000.0 | 17-AUG-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 760261 | Republican |
58827 | C00431171 | P80003353 | Romney, Mitt | MAFFUCCI, NANCY L. MS. | INFO REQUESTED | XX | 99999 | WUNDERMAN | MARKETING COMMUNICATIONS | 500.0 | 07-JUL-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 760261 | Republican |
58828 | C00431171 | P80003353 | Romney, Mitt | MAFFUCCI, NANCY L. MS. | INFO REQUESTED | XX | 99999 | WUNDERMAN | MARKETING COMMUNICATIONS | 1000.0 | 23-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 760261 | Republican |
57471 rows × 17 columns
3.数据聚合与分组
-
3.1透视表(pivot_table)分析党派和职业
-
按照党派、职业对赞助金额进行汇总,类似excel中的透视表操作,聚合函数为sum
s2 =amt_election.pivot_table(['contb_receipt_amt'],index=['party','contbr_occupation'],aggfunc='sum')
s2
contb_receipt_amt | ||
---|---|---|
party | contbr_occupation | |
Democrat | MIXED-MEDIA ARTIST / STORYTELLER | 100.00 |
AREA VICE PRESIDENT | 250.00 | |
RESEARCH ASSOCIATE | 100.00 | |
TEACHER | 500.00 | |
THERAPIST | 3900.00 | |
... | ... | ... |
Republican | ZIG ZAG RESTAURANT GROUP | 950.00 |
ZIMMERMANS DAIRY | 83.71 | |
ZOMBIE SLAYER | 1556.00 | |
\NONE\ | 250.00 | |
~ | 75.00 |
32750 rows × 1 columns
- 过滤掉赞助金额小于200W的数据
s2 > 2000000
contb_receipt_amt | ||
---|---|---|
party | contbr_occupation | |
Democrat | MIXED-MEDIA ARTIST / STORYTELLER | False |
AREA VICE PRESIDENT | False | |
RESEARCH ASSOCIATE | False | |
TEACHER | False | |
THERAPIST | False | |
... | ... | ... |
Republican | ZIG ZAG RESTAURANT GROUP | False |
ZIMMERMANS DAIRY | False | |
ZOMBIE SLAYER | False | |
\NONE\ | False | |
~ | False |
32750 rows × 1 columns
s2.unstack()
contb_receipt_amt | |||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
contbr_occupation | MIXED-MEDIA ARTIST / STORYTELLER | AREA VICE PRESIDENT | RESEARCH ASSOCIATE | TEACHER | THERAPIST | (PART-TIME) SALES CONSULTANT & WRITER | (RETIRED) | - | -- | .NET PROGRAMMER | ... | YOUTH SPECIALIST | YOUTH WORKER | ZEN BUDDHIST PRIEST | ZEPPOS AND ASSOCIATES | ZIG ZAG RESTAURANT GROUP | ZIMMERMANS DAIRY | ZOMBIE SLAYER | ZOOLOGIST | \NONE\ | ~ |
party | |||||||||||||||||||||
Democrat | 100.0 | 250.0 | 100.0 | 500.0 | 3900.0 | NaN | NaN | 5000.0 | NaN | 441.0 | ... | 525.0 | NaN | 300.0 | 1000.0 | NaN | NaN | NaN | 100.0 | NaN | NaN |
Republican | NaN | NaN | NaN | NaN | NaN | 285.0 | 250.0 | 3085.0 | 425.0 | NaN | ... | NaN | 595.12 | NaN | NaN | 950.0 | 83.71 | 1556.0 | NaN | 250.0 | 75.0 |
2 rows × 29198 columns
# 把NAN填充,用0填充
s2.unstack(fill_value=0)
contb_receipt_amt | |||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
contbr_occupation | MIXED-MEDIA ARTIST / STORYTELLER | AREA VICE PRESIDENT | RESEARCH ASSOCIATE | TEACHER | THERAPIST | (PART-TIME) SALES CONSULTANT & WRITER | (RETIRED) | - | -- | .NET PROGRAMMER | ... | YOUTH SPECIALIST | YOUTH WORKER | ZEN BUDDHIST PRIEST | ZEPPOS AND ASSOCIATES | ZIG ZAG RESTAURANT GROUP | ZIMMERMANS DAIRY | ZOMBIE SLAYER | ZOOLOGIST | \NONE\ | ~ |
party | |||||||||||||||||||||
Democrat | 100.0 | 250.0 | 100.0 | 500.0 | 3900.0 | 0.0 | 0.0 | 5000.0 | 0.0 | 441.0 | ... | 525.0 | 0.00 | 300.0 | 1000.0 | 0.0 | 0.00 | 0.0 | 100.0 | 0.0 | 0.0 |
Republican | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 285.0 | 250.0 | 3085.0 | 425.0 | 0.0 | ... | 0.0 | 595.12 | 0.0 | 0.0 | 950.0 | 83.71 | 1556.0 | 0.0 | 250.0 | 75.0 |
2 rows × 29198 columns
s2.unstack(level=0,fill_value=0)
contb_receipt_amt | ||
---|---|---|
party | Democrat | Republican |
contbr_occupation | ||
MIXED-MEDIA ARTIST / STORYTELLER | 100.0 | 0.00 |
AREA VICE PRESIDENT | 250.0 | 0.00 |
RESEARCH ASSOCIATE | 100.0 | 0.00 |
TEACHER | 500.0 | 0.00 |
THERAPIST | 3900.0 | 0.00 |
... | ... | ... |
ZIMMERMANS DAIRY | 0.0 | 83.71 |
ZOMBIE SLAYER | 0.0 | 1556.00 |
ZOOLOGIST | 100.0 | 0.00 |
\NONE\ | 0.0 | 250.00 |
~ | 0.0 | 75.00 |
29198 rows × 2 columns
- 绘制图形
s2.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1172dd0>
s22 = s2.unstack(level=0,fill_value=0)
s22
contb_receipt_amt | ||
---|---|---|
party | Democrat | Republican |
contbr_occupation | ||
MIXED-MEDIA ARTIST / STORYTELLER | 100.0 | 0.00 |
AREA VICE PRESIDENT | 250.0 | 0.00 |
RESEARCH ASSOCIATE | 100.0 | 0.00 |
TEACHER | 500.0 | 0.00 |
THERAPIST | 3900.0 | 0.00 |
... | ... | ... |
ZIMMERMANS DAIRY | 0.0 | 83.71 |
ZOMBIE SLAYER | 0.0 | 1556.00 |
ZOOLOGIST | 100.0 | 0.00 |
\NONE\ | 0.0 | 250.00 |
~ | 0.0 | 75.00 |
29198 rows × 2 columns
s22_1 = s22.sum(axis=1)
s22_1
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER 100.00
AREA VICE PRESIDENT 250.00
RESEARCH ASSOCIATE 100.00
TEACHER 500.00
THERAPIST 3900.00
...
ZIMMERMANS DAIRY 83.71
ZOMBIE SLAYER 1556.00
ZOOLOGIST 100.00
\NONE\ 250.00
~ 75.00
Length: 29198, dtype: float64
s3 = amt_election.pivot_table(['contb_receipt_amt'],
columns='party',
index=['contbr_occupation'],aggfunc='sum')
s3.fillna(0,inplace=True)
s3
contb_receipt_amt | ||
---|---|---|
party | Democrat | Republican |
contbr_occupation | ||
MIXED-MEDIA ARTIST / STORYTELLER | 100.0 | 0.00 |
AREA VICE PRESIDENT | 250.0 | 0.00 |
RESEARCH ASSOCIATE | 100.0 | 0.00 |
TEACHER | 500.0 | 0.00 |
THERAPIST | 3900.0 | 0.00 |
... | ... | ... |
ZIMMERMANS DAIRY | 0.0 | 83.71 |
ZOMBIE SLAYER | 0.0 | 1556.00 |
ZOOLOGIST | 100.0 | 0.00 |
\NONE\ | 0.0 | 250.00 |
~ | 0.0 | 75.00 |
29198 rows × 2 columns
s3[s22.sum(axis=1) > 1000000]
contb_receipt_amt | ||
---|---|---|
party | Democrat | Republican |
contbr_occupation | ||
ATTORNEY | 7115093.35 | 5561875.69 |
BANKER | 147439.00 | 1000738.68 |
BUSINESS OWNER | 313243.70 | 889236.14 |
CEO | 2382425.67 | 5284341.79 |
CHAIRMAN | 390823.00 | 1021352.75 |
CONSULTANT | 1636016.36 | 1859573.23 |
ENGINEER | 552192.40 | 1218073.23 |
EXECUTIVE | 972931.63 | 3088889.08 |
FINANCE | 154856.40 | 1202855.48 |
HOMEMAKER | 2784593.60 | 9899812.15 |
INFORMATION REQUESTED | 2654049.90 | 4006935.69 |
INFORMATION REQUESTED PER BEST EFFORTS | 0.00 | 10149354.09 |
INVESTOR | 639607.00 | 1954885.16 |
LAWYER | 1974727.92 | 343741.87 |
MANAGER | 419079.16 | 1044834.85 |
NOT EMPLOYED | 1035639.53 | 30581.04 |
NOT PROVIDE | 4560685.43 | 591690.46 |
OWNER | 643033.42 | 1848252.45 |
PHYSICIAN | 2244445.92 | 2554100.10 |
PRESIDENT | 1285419.95 | 3552588.33 |
PROFESSOR | 1322512.54 | 203491.10 |
REAL ESTATE | 377200.55 | 1326560.00 |
RETIRED | 15103811.07 | 15177817.96 |
SALES | 210130.34 | 1039807.63 |
SELF-EMPLOYED | 378857.22 | 1018210.08 |
STUDENT | 414364.53 | 752169.64 |
# 大于一百万的
s3[s22.sum(axis=1) > 1000000].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x5f480f0>
# 大于两百万的
s3[s22.sum(axis=1) > 2000000].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x5f177b0>
# 大于两百万的
s3[s22.sum(axis=1) > 2000000].plot(kind='bar',stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x5f0edf0>
- 3.2分组级运算和转换
- 根据职业与雇主信息分布分组运算捐献金额总数
- 由于职业和雇主的处理非常相似,我们定义函数get_top_amounts()对两个字段进行分析处理
- 从数据可以看出,Obama更受精英群体(律师、医生、咨询顾问)的欢迎,Romney则得到更多企业家或高管的支持
- 同样的,使用get_top_amounts()对雇主进行分析处理
- Obama:微软、盛德国际律师事务所;Rommney:瑞士瑞信银行、摩根斯坦利、高盛公司、巴克莱资本、H.I.G.资本
amt_election.columns
Index(['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city',
'contbr_st', 'contbr_zip', 'contbr_employer', 'contbr_occupation',
'contb_receipt_amt', 'contb_receipt_dt', 'receipt_desc', 'memo_cd',
'memo_text', 'form_tp', 'file_num', 'party'],
dtype='object')
amt_election.pivot_table('contb_receipt_amt',
index='contbr_occupation',
aggfunc='sum')
contb_receipt_amt | |
---|---|
contbr_occupation | |
MIXED-MEDIA ARTIST / STORYTELLER | 100.00 |
AREA VICE PRESIDENT | 250.00 |
RESEARCH ASSOCIATE | 100.00 |
TEACHER | 500.00 |
THERAPIST | 3900.00 |
... | ... |
ZIMMERMANS DAIRY | 83.71 |
ZOMBIE SLAYER | 1556.00 |
ZOOLOGIST | 100.00 |
\NONE\ | 250.00 |
~ | 75.00 |
29198 rows × 1 columns
# 处理特殊符号
# 对小于4的字符进行处理
f2 = lambda item:np.nan if len(item) < 4 else item
amt_election['contbr_employer'] = amt_election['contbr_employer'].map(f2)
D:\yingyong\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""
amt_election['contbr_employer'].isna()
0 False
1 False
2 False
3 False
4 False
...
536036 False
536037 False
536038 False
536039 False
536040 False
Name: contbr_employer, Length: 530314, dtype: bool
amt_election[amt_election['contbr_employer'].isna()]
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
172 | C00410118 | P20002978 | Bachmann, Michelle | GRANARA, FRANK | NORWELL | MA | 02061 | NaN | PRESIDENT | 500.0 | 30-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
356 | C00410118 | P20002978 | Bachmann, Michelle | FITE, ROY | HOUSTON | TX | 770423445 | NaN | AUDITOR | 500.0 | 21-JUN-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 736166 | Republican |
412 | C00431171 | P80003353 | Romney, Mitt | KAMMER, DONALD | APO | AE | 09267 | NaN | MILITARY | 500.0 | 31-DEC-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 763507 | Republican |
414 | C00431171 | P80003353 | Romney, Mitt | JONES, BRENDON | QATAR | AE | 09898 | NaN | MEDIA ANALYST | 250.0 | 22-NOV-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 763507 | Republican |
804 | C00431171 | P80003353 | Romney, Mitt | FONDREN, SHELIA MRS. | BIRMINGHAM | AL | 352163796 | NaN | VP OF HUMAN RESOURCES | 100.0 | 01-NOV-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 763507 | Republican |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
535693 | C00500587 | P20003281 | Perry, Rick | CORLE, FREDERIC W. MR. | ALEXANDRIA | VA | 223141575 | NaN | SR DIR | 1000.0 | 29-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 751678 | Republican |
535741 | C00500587 | P20003281 | Perry, Rick | KOZUCH, RANDY J. MR. | ALEXANDRIA | VA | 223146250 | NaN | DIRECTOR | 1000.0 | 30-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 751678 | Republican |
535750 | C00500587 | P20003281 | Perry, Rick | MCBRIDE, DONALD S. MR. | FAIRFAX | VA | 220331122 | NaN | VICE PRESIDENT COMMUNICATIONS | 1000.0 | 29-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 751678 | Republican |
535795 | C00500587 | P20003281 | Perry, Rick | SHREWSBERRY, HAROLD MR. | FAIRFAX | VA | 220305833 | NaN | ASSISTANT INSPECTOR GENERAL | 2500.0 | 19-SEP-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 751678 | Republican |
536029 | C00500587 | P20003281 | Perry, Rick | TEXAS ENERGY L.L.C. | CHEYENNE | WY | 820013163 | NaN | LLC | 250.0 | 30-SEP-11 | NOT PROVIDE | X | SEE ATTRIBUTION BELOW | SA17A | 761750 | Republican |
10280 rows × 17 columns
# 删除NAN值
amt_election.dropna(axis=0,inplace=True)
amt_election[amt_election['contbr_employer'].isna()]
D:\yingyong\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party |
---|
# 再进行聚合操作
amt_election.pivot_table('contb_receipt_amt',
index=['contbr_occupation','contbr_employer'],
aggfunc='sum')
contb_receipt_amt | ||
---|---|---|
contbr_occupation | contbr_employer | |
MIXED-MEDIA ARTIST / STORYTELLER | MOVINGSTORIES LLC | 100.00 |
AREA VICE PRESIDENT | TIME WARNER CABLE | 250.00 |
RESEARCH ASSOCIATE | HARVARD MEDICAL SCHOOL | 100.00 |
TEACHER | DREW SCHOOL | 500.00 |
THERAPIST | SEQUOIA COUNSELING SERVICES | 3900.00 |
... | ... | ... |
ZIG ZAG RESTAURANT GROUP | ZIG ZAG RESTAURANT GROUP | 950.00 |
ZIMMERMANS DAIRY | ZIMMERMANS DAIRY | 83.71 |
ZOMBIE SLAYER | ATLANTA ZOMBIE SERVICES, LLC | 1556.00 |
ZOOLOGIST | SMITH INST | 100.00 |
\NONE\ | NONE | 250.00 |
124369 rows × 1 columns
s4 = Obama.pivot_table('contb_receipt_amt',
index=['contbr_occupation','contbr_employer'],
aggfunc='sum')
s4
contb_receipt_amt | ||
---|---|---|
contbr_occupation | contbr_employer | |
MIXED-MEDIA ARTIST / STORYTELLER | MOVINGSTORIES LLC | 100.0 |
AREA VICE PRESIDENT | TIME WARNER CABLE | 250.0 |
RESEARCH ASSOCIATE | HARVARD MEDICAL SCHOOL | 100.0 |
TEACHER | DREW SCHOOL | 500.0 |
THERAPIST | SEQUOIA COUNSELING SERVICES | 3900.0 |
... | ... | ... |
YOUTH SPECIALIST | LINCOLN UNIVERSITY | 200.0 |
NH-JAG | 325.0 | |
ZEN BUDDHIST PRIEST | ROCHESTER ZEN CENTER | 300.0 |
ZEPPOS AND ASSOCIATES | FOUNDER AND CEO | 1000.0 |
ZOOLOGIST | SMITH INST | 100.0 |
59886 rows × 1 columns
s4.sort_values('contb_receipt_amt',ascending=False)
contb_receipt_amt | ||
---|---|---|
contbr_occupation | contbr_employer | |
RETIRED | RETIRED | 12390060.73 |
NOT PROVIDE | NOT PROVIDE | 4529180.78 |
INFORMATION REQUESTED | INFORMATION REQUESTED | 2559801.28 |
HOMEMAKER | HOMEMAKER | 1996712.95 |
RETIRED | NOT EMPLOYED | 1851167.34 |
... | ... | ... |
INDUSTRIAL & COMMERCIAL NATIONAL TITLE | RETIRED | 3.00 |
EXECUTIVE DIRECTOR | LOCAL INITIATIVES SUPPORT CORPORATION, | 3.00 |
ART GALLERY | SELBY FLEETWOOD GALLERY | 3.00 |
POLICY/ LAWYER | U.S. DEPARTMENT OF LABOR | 3.00 |
MANAGEMENT CONSULTANT | HARBERT BERNARD & ASSOCIATES, LLC | 3.00 |
59927 rows × 1 columns
# Obama的数据,取前10条,画图
s4.sort_values('contb_receipt_amt',ascending=False)[:10].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x6173a90>
# 雇主
s4.sort_values('contbr_employer',ascending=False)[:10].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x8793e70>
4.时间处理
- 4.1str转datetime
amt_election.dtypes
cmte_id object
cand_id object
cand_nm object
contbr_nm object
contbr_city object
contbr_st object
contbr_zip object
contbr_employer object
contbr_occupation object
contb_receipt_amt float64
contb_receipt_dt object
receipt_desc object
memo_cd object
memo_text object
form_tp object
file_num int64
party object
dtype: object
amt_election['contb_receipt_dt'].unique()
array(['20-JUN-11', '23-JUN-11', '05-JUL-11', '01-AUG-11', '21-JUN-11',
'22-JUN-11', '17-JUN-11', '11-JUL-11', '08-JUL-11', '27-JUN-11',
'13-JUN-11', '16-JUN-11', '18-JUL-11', '05-AUG-11', '01-JUL-11',
'06-JUL-11', '13-JUL-11', '28-JUN-11', '30-JUN-11', '08-AUG-11',
'15-AUG-11', '16-AUG-11', '18-AUG-11', '24-JUN-11', '11-OCT-11',
'14-JUL-11', '29-JUN-11', '18-OCT-11', '30-SEP-11', '11-AUG-11',
'02-AUG-11', '14-JUN-11', '23-AUG-11', '07-JUL-11', '25-JUL-11',
'21-JUL-11', '24-OCT-11', '12-JUL-11', '28-JUL-11', '09-SEP-11',
'10-NOV-11', '12-SEP-11', '04-OCT-11', '05-DEC-11', '31-OCT-11',
'12-DEC-11', '29-JUL-11', '22-JUL-11', '27-OCT-11', '28-NOV-11',
'26-AUG-11', '26-SEP-11', '27-JUL-11', '28-APR-11', '29-DEC-11',
'28-JAN-12', '29-NOV-11', '21-AUG-11', '16-MAY-11', '18-MAY-11',
'12-MAY-11', '19-MAY-11', '19-SEP-11', '16-APR-11', '15-MAY-11',
'15-JUN-11', '11-SEP-11', '06-SEP-11', '15-OCT-11', '20-NOV-11',
'22-NOV-11', '15-DEC-11', '27-DEC-11', '02-DEC-11', '06-DEC-11',
'09-DEC-11', '29-SEP-11', '28-SEP-11', '01-SEP-11', '15-SEP-11',
'09-AUG-11', '16-SEP-11', '14-SEP-11', '03-MAY-11', '02-SEP-11',
'07-SEP-11', '15-JUL-11', '26-JUL-11', '25-AUG-11', '02-MAY-11',
'23-JAN-12', '09-JAN-12', '11-JAN-12', '25-JAN-12', '05-JAN-12',
'13-JAN-12', '20-JAN-12', '15-JAN-12', '30-JAN-12', '16-JAN-12',
'04-JAN-12', '26-JAN-12', '27-JAN-12', '10-JAN-12', '12-JAN-12',
'22-JAN-12', '01-JAN-12', '30-DEC-11', '17-DEC-11', '08-NOV-11',
'25-OCT-11', '17-NOV-11', '07-NOV-11', '13-OCT-11', '18-NOV-11',
'22-DEC-11', '17-JAN-12', '21-DEC-11', '07-DEC-11', '29-OCT-11',
'10-OCT-11', '17-AUG-11', '24-MAY-11', '06-JUN-11', '31-MAY-11',
'29-APR-11', '13-APR-11', '27-SEP-11', '17-SEP-11', '30-AUG-11',
'20-SEP-11', '29-AUG-11', '11-JUN-11', '23-MAY-11', '08-SEP-11',
'22-SEP-11', '22-AUG-11', '12-AUG-11', '27-MAY-11', '19-APR-11',
'24-AUG-11', '19-AUG-11', '28-AUG-11', '21-SEP-11', '10-AUG-11',
'20-JUL-11', '21-NOV-11', '02-OCT-11', '19-OCT-11', '18-JAN-12',
'24-JAN-12', '09-NOV-11', '14-NOV-11', '16-DEC-11', '28-OCT-11',
'31-DEC-11', '02-NOV-11', '25-NOV-11', '23-DEC-11', '17-OCT-11',
'10-DEC-11', '01-DEC-11', '13-DEC-11', '30-NOV-11', '12-OCT-11',
'15-NOV-11', '14-DEC-11', '16-NOV-11', '28-DEC-11', '20-DEC-11',
'03-NOV-11', '20-OCT-11', '01-NOV-11', '03-JAN-12', '07-JAN-12',
'31-JAN-12', '19-JAN-12', '13-SEP-11', '01-JUN-11', '11-APR-11',
'07-JUN-11', '02-JAN-12', '08-JAN-12', '29-JAN-12', '08-DEC-11',
'19-DEC-11', '19-JUL-11', '06-JAN-12', '21-JAN-12', '06-OCT-11',
'06-NOV-11', '19-NOV-11', '27-NOV-11', '23-NOV-11', '21-OCT-11',
'23-OCT-11', '25-DEC-11', '07-OCT-11', '04-NOV-11', '16-OCT-11',
'05-NOV-11', '30-OCT-11', '14-OCT-11', '04-DEC-11', '01-OCT-11',
'05-OCT-11', '26-NOV-11', '03-OCT-11', '26-DEC-11', '04-AUG-11',
'23-SEP-11', '31-AUG-11', '03-AUG-11', '16-JUL-11', '14-AUG-11',
'05-SEP-11', '10-SEP-11', '04-JUL-11', '24-SEP-11', '25-SEP-11',
'13-MAY-11', '26-APR-11', '06-MAY-11', '17-MAY-11', '10-JUN-11',
'15-APR-11', '10-MAY-11', '03-JUN-11', '12-APR-11', '05-MAY-11',
'09-MAY-11', '20-APR-11', '25-APR-11', '25-JUN-11', '25-MAY-11',
'12-JUN-11', '07-MAY-11', '04-MAY-11', '14-APR-11', '14-MAY-11',
'20-MAY-11', '18-JUN-11', '24-JUL-11', '14-JAN-12', '11-DEC-11',
'13-NOV-11', '18-DEC-11', '24-DEC-11', '11-NOV-11', '26-OCT-11',
'03-DEC-11', '08-OCT-11', '24-NOV-11', '12-NOV-11', '23-JUL-11',
'20-AUG-11', '18-SEP-11', '10-JUL-11', '07-AUG-11', '13-AUG-11',
'22-APR-11', '19-JUN-11', '09-JUN-11', '08-JUN-11', '26-MAY-11',
'27-APR-11', '18-APR-11', '17-APR-11', '24-APR-11', '21-APR-11',
'04-JUN-11', '22-MAY-11', '26-JUN-11', '28-MAY-11', '01-MAY-11',
'11-MAY-11', '30-APR-11', '02-JUN-11', '30-MAY-11', '29-MAY-11',
'05-JUN-11', '23-APR-11', '06-AUG-11', '27-AUG-11', '21-MAY-11',
'03-JUL-11', '31-JUL-11', '09-OCT-11', '22-OCT-11', '17-JUL-11',
'30-JUL-11', '02-JUL-11', '08-MAY-11', '04-SEP-11', '09-JUL-11',
'03-SEP-11', '05-APR-11', '04-APR-11', '08-APR-11', '10-APR-11',
'06-APR-11', '07-APR-11', '09-APR-11', '24-MAR-11', '07-MAR-11',
'05-MAR-11', '16-MAR-11', '04-MAR-11', '15-MAR-11', '17-MAR-11',
'03-MAR-11', '06-MAR-11', '10-MAR-11', '08-MAR-11', '25-MAR-11',
'22-MAR-11', '12-MAR-11', '13-MAR-11', '09-MAR-11', '11-MAR-11',
'29-MAR-11', '30-MAR-11', '18-MAR-11', '20-MAR-11', '02-MAR-11',
'21-MAR-11', '31-MAR-11', '03-APR-11', '14-MAR-11', '19-MAR-11',
'28-MAR-11', '23-MAR-11', '02-APR-11', '26-MAR-11', '01-APR-11',
'09-FEB-11', '28-FEB-11', '07-FEB-11', '27-MAR-11', '22-FEB-11',
'14-JAN-11', '17-JAN-11', '17-FEB-11', '31-JAN-11', '12-JAN-11',
'29-JAN-11', '15-JAN-11', '20-JAN-11', '21-JAN-11', '11-FEB-11',
'16-FEB-11', '28-JAN-11', '18-FEB-11', '23-FEB-11', '01-FEB-11',
'03-FEB-11', '04-FEB-11', '08-FEB-11', '13-JAN-11', '16-JAN-11',
'12-FEB-11', '13-FEB-11', '04-JAN-11', '10-FEB-11', '24-FEB-11',
'22-JAN-11', '24-JAN-11', '26-JAN-11', '01-JAN-11', '03-JAN-11',
'01-MAR-11', '27-JAN-11', '15-FEB-11', '14-FEB-11', '21-FEB-11',
'18-JAN-11'], dtype=object)
# 扩展:字符串转时间
from datetime import datetime
datetime.now()
datetime.datetime(2022, 4, 14, 10, 32, 8, 500374)
datetime.now().strftime('%d-%B-%y')
'14-April-22'
# strftime是一种计算机函数,根据区域设置格式化本地时间/日期,函数的功能:将时间格式化
# strptime(),功能:按照特定时间格式将字符串转换(解析)为时间类型。
# pd.to_datetime(obama['contb_receipt_dt']) 太慢了
# 封装函数
def covert_date(item):
return datetime.strptime(item,'%d-%B-%y')
# 字母变小写,然后首字母大写
'14-April-22'.lower()
'14-april-22'
'14-April-22'.lower().title()
'14-April-22'
# 封装函数
def covert_date(item):
return datetime.strptime(item.lower().title(),'%d-%b-%y')
Obama['amt_dt'] = Obama['contb_receipt_dt'].map(covert_date)
D:\yingyong\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
Obama['amt_dt']
58829 2011-12-11
58831 2011-11-25
58832 2011-12-16
58833 2012-01-24
58834 2012-01-12
...
351224 2011-11-26
351225 2012-01-26
351226 2011-12-30
351227 2011-12-30
351228 2011-10-26
Name: amt_dt, Length: 289999, dtype: datetime64[ns]
- 4.2以时间为索引
# 排序
Obama.sort_values('amt_dt')
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party | amt_dt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
61913 | C00431445 | P80003338 | Obama, Barack | SMITH, WILLIS B | TEXARKANA | AR | 718547720 | RETIRED | FORMER ATTORNEY | 100.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat | 2011-04-04 |
137424 | C00431445 | P80003338 | Obama, Barack | FISHER, TAD | JACKSONVILLE | FL | 322251084 | FL ACADEMY OF FAMILY PHYSICIANS | CEO | 100.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat | 2011-04-04 |
137407 | C00431445 | P80003338 | Obama, Barack | COMMESSO, CATHY | PALM HARBOR | FL | 346852225 | NOT EMPLOYED | NOT EMPLOYED | 100.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat | 2011-04-04 |
137388 | C00431445 | P80003338 | Obama, Barack | DE HAAN, RUTH | HOLMES BEACH | FL | 34217 | RETIRED | RETIRED | 50.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat | 2011-04-04 |
71701 | C00431445 | P80003338 | Obama, Barack | GONZALEZ, DAVID | SAN FRANCISCO | CA | 941072023 | GENENTECH | SR. HUMAN RESOURCE MANAGER | 100.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat | 2011-04-04 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
241140 | C00431445 | P80003338 | Obama, Barack | UZUN, OZCAN | PISCATAWAY | NJ | 08854 | NYIT AND CUNY | INSTRUCTOR AND BUSINESS OWNER | 2500.0 | 31-JAN-12 | NOT PROVIDE | X | * OBAMA VICTORY FUND 2012 | SA18 | 766535 | Democrat | 2012-01-31 |
221876 | C00431445 | P80003338 | Obama, Barack | ROESLER, DEBORAH | WAYZATA | MN | 55391 | NOT EMPLOYED | RETIRED | 500.0 | 31-JAN-12 | NOT PROVIDE | X | * OBAMA VICTORY FUND 2012 | SA18 | 766535 | Democrat | 2012-01-31 |
298651 | C00431445 | P80003338 | Obama, Barack | KNOLL, SANDRA L | PHOENIXVILLE | PA | 194602272 | INFORMATION REQUESTED | INFORMATION REQUESTED | 150.0 | 31-JAN-12 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 766535 | Democrat | 2012-01-31 |
113520 | C00431445 | P80003338 | Obama, Barack | TORBOHN, GRACE | MODESTO | CA | 953560436 | RETIRED | RETIRED | 55.0 | 31-JAN-12 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 766535 | Democrat | 2012-01-31 |
231259 | C00431445 | P80003338 | Obama, Barack | VRANA, BRUCE | DURHAM | NC | 277072835 | SYNGENTA | ATTORNEY | 250.0 | 31-JAN-12 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 766535 | Democrat | 2012-01-31 |
289999 rows × 18 columns
# 设置索引
obama_amt_dt = Obama.sort_values('amt_dt').set_index('amt_dt')
obama_amt_dt.head()
cmte_id | cand_id | cand_nm | contbr_nm | contbr_city | contbr_st | contbr_zip | contbr_employer | contbr_occupation | contb_receipt_amt | contb_receipt_dt | receipt_desc | memo_cd | memo_text | form_tp | file_num | party | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
amt_dt | |||||||||||||||||
2011-04-04 | C00431445 | P80003338 | Obama, Barack | SMITH, WILLIS B | TEXARKANA | AR | 718547720 | RETIRED | FORMER ATTORNEY | 100.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat |
2011-04-04 | C00431445 | P80003338 | Obama, Barack | FISHER, TAD | JACKSONVILLE | FL | 322251084 | FL ACADEMY OF FAMILY PHYSICIANS | CEO | 100.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat |
2011-04-04 | C00431445 | P80003338 | Obama, Barack | COMMESSO, CATHY | PALM HARBOR | FL | 346852225 | NOT EMPLOYED | NOT EMPLOYED | 100.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat |
2011-04-04 | C00431445 | P80003338 | Obama, Barack | DE HAAN, RUTH | HOLMES BEACH | FL | 34217 | RETIRED | RETIRED | 50.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat |
2011-04-04 | C00431445 | P80003338 | Obama, Barack | GONZALEZ, DAVID | SAN FRANCISCO | CA | 941072023 | GENENTECH | SR. HUMAN RESOURCE MANAGER | 100.0 | 04-APR-11 | NOT PROVIDE | NOT PROVIDE | NOT PROVIDE | SA17A | 756214 | Democrat |
obama_amt_dt.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1065fe70>
obama_amt_dt['contb_receipt_amt'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x22286a70>
Obama.groupby('amt_dt')['contb_receipt_amt'].sum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x8779710>