需求
加载数据 查看数据的基本信息 指定数据截取,将如下字段的数据进行提取,其他数据舍弃
cand_nm :候选人姓名 contbr_nm : 捐赠人姓名 contbr_st :捐赠人所在州 contbr_employer : 捐赠人所在公司 contbr_occupation : 捐赠人职业 contb_receipt_amt :捐赠数额(美元) contb_receipt_dt : 捐款的日期 对新数据进行总览,查看是否存在缺失数据 用统计学指标快速描述数值型属性的概要。 空值处理。可能因为忘记填写或者保密等等原因,相关字段出现了空值,将其填充为NOT PROVIDE 异常值处理。将捐款金额<=0的数据删除 新建一列为各个候选人所在党派party 查看party这一列中有哪些不同的元素 统计party列中各个元素出现次数 查看各个党派收到的政治献金总数contb_receipt_amt 查看具体每天各个党派收到的政治献金总数contb_receipt_amt 将表中日期格式转换为’yyyy-mm-dd’。 查看老兵(捐献者职业)DISABLED VETERAN主要支持谁
import pandas as pd
months = { 'JAN' : 1 , 'FEB' : 2 , 'MAR' : 3 , 'APR' : 4 , 'MAY' : 5 , 'JUN' : 6 ,
'JUL' : 7 , 'AUG' : 8 , 'SEP' : 9 , 'OCT' : 10 , 'NOV' : 11 , 'DEC' : 12 }
of_interest = [ 'Obama, Barack' , 'Romney, Mitt' , 'Santorum, Rick' ,
'Paul, Ron' , 'Gingrich, Newt' ]
parties = {
'Bachmann, Michelle' : 'Republican' ,
'Romney, Mitt' : 'Republican' ,
'Obama, Barack' : 'Democrat' ,
"Roemer, Charles E. 'Buddy' III" : 'Reform' ,
'Pawlenty, Timothy' : 'Republican' ,
'Johnson, Gary Earl' : 'Libertarian' ,
'Paul, Ron' : 'Republican' ,
'Santorum, Rick' : 'Republican' ,
'Cain, Herman' : 'Republican' ,
'Gingrich, Newt' : 'Republican' ,
'McCotter, Thaddeus G' : 'Republican' ,
'Huntsman, Jon' : 'Republican' ,
'Perry, Rick' : 'Republican'
}
df = pd. read_csv( 'usa_election.txt' )
df
D:\ANACONDA\lib\site-packages\IPython\core\interactiveshell.py:3444: DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False.
exec(code_obj, self.user_global_ns, self.user_ns)
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num 0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 366010290.0 RETIRED RETIRED 250.0 20-JUN-11 NaN NaN NaN SA17A 736166 1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 366010290.0 RETIRED RETIRED 50.0 23-JUN-11 NaN NaN NaN SA17A 736166 2 C00410118 P20002978 Bachmann, Michelle SMITH, LANIER LANETT AL 368633403.0 INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11 NaN NaN NaN SA17A 749073 3 C00410118 P20002978 Bachmann, Michelle BLEVINS, DARONDA PIGGOTT AR 724548253.0 NONE RETIRED 250.0 01-AUG-11 NaN NaN NaN SA17A 749073 4 C00410118 P20002978 Bachmann, Michelle WARDENBURG, HAROLD HOT SPRINGS NATION AR 719016467.0 NONE RETIRED 300.0 20-JUN-11 NaN NaN NaN SA17A 736166 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 536036 C00500587 P20003281 Perry, Rick ANDERSON, MARILEE MRS. INFO REQUESTED XX 99999 INFORMATION REQUESTED PER BEST EFFORTS INFORMATION REQUESTED PER BEST EFFORTS 2500.0 31-AUG-11 NaN NaN NaN SA17A 751678 536037 C00500587 P20003281 Perry, Rick TOLBERT, DARYL MR. INFO REQUESTED XX 99999 T.A.C.C. LONGWALL MAINTENANCE FOREMAN 500.0 30-SEP-11 NaN NaN NaN SA17A 751678 536038 C00500587 P20003281 Perry, Rick GRANE, BRYAN F. MR. INFO REQUESTED XX 99999 INFORMATION REQUESTED PER BEST EFFORTS INFORMATION REQUESTED PER BEST EFFORTS 500.0 29-SEP-11 NaN NaN NaN SA17A 751678 536039 C00500587 P20003281 Perry, Rick DUFFY, DAVID A. MR. INFO REQUESTED XX 99999 DUFFY EQUIPMENT COMPANY INC. BUSINESS OWNER 2500.0 30-SEP-11 NaN NaN NaN SA17A 751678 536040 C00500587 P20003281 Perry, Rick GORMAN, CHRIS D. MR. INFO REQUESTED XX 99999 INFORMATION REQUESTED PER BEST EFFORTS INFORMATION REQUESTED PER BEST EFFORTS 5000.0 29-SEP-11 REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... NaN REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM... SA17A 751678
536041 rows × 16 columns
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536041 entries, 0 to 536040
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cmte_id 536041 non-null object
1 cand_id 536041 non-null object
2 cand_nm 536041 non-null object
3 contbr_nm 536041 non-null object
4 contbr_city 536026 non-null object
5 contbr_st 536040 non-null object
6 contbr_zip 535973 non-null object
7 contbr_employer 525088 non-null object
8 contbr_occupation 530520 non-null object
9 contb_receipt_amt 536041 non-null float64
10 contb_receipt_dt 536041 non-null object
11 receipt_desc 8479 non-null object
12 memo_cd 49718 non-null object
13 memo_text 52740 non-null object
14 form_tp 536041 non-null object
15 file_num 536041 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 65.4+ MB
df. describe( )
contb_receipt_amt file_num count 5.360410e+05 536041.000000 mean 3.750373e+02 761472.107800 std 3.564436e+03 5148.893508 min -3.080000e+04 723511.000000 25% 5.000000e+01 756218.000000 50% 1.000000e+02 763233.000000 75% 2.500000e+02 763621.000000 max 1.944042e+06 767394.000000
df. fillna( value= 'NOT PROVIDE' , inplace= True )
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536041 entries, 0 to 536040
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 cmte_id 536041 non-null object
1 cand_id 536041 non-null object
2 cand_nm 536041 non-null object
3 contbr_nm 536041 non-null object
4 contbr_city 536041 non-null object
5 contbr_st 536041 non-null object
6 contbr_zip 536041 non-null object
7 contbr_employer 536041 non-null object
8 contbr_occupation 536041 non-null object
9 contb_receipt_amt 536041 non-null float64
10 contb_receipt_dt 536041 non-null object
11 receipt_desc 536041 non-null object
12 memo_cd 536041 non-null object
13 memo_text 536041 non-null object
14 form_tp 536041 non-null object
15 file_num 536041 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 65.4+ MB
drop_index = df. loc[ df[ 'contb_receipt_amt' ] <= 0 ] . index
df. drop( labels= drop_index, axis= 0 , inplace= True )
df[ 'party' ] = df[ 'cand_nm' ] . map ( parties)
df. head( )
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num party 0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 366010290.0 RETIRED RETIRED 250.0 20-JUN-11 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 736166 Republican 1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 366010290.0 RETIRED RETIRED 50.0 23-JUN-11 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 736166 Republican 2 C00410118 P20002978 Bachmann, Michelle SMITH, LANIER LANETT AL 368633403.0 INFORMATION REQUESTED INFORMATION REQUESTED 250.0 05-JUL-11 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 749073 Republican 3 C00410118 P20002978 Bachmann, Michelle BLEVINS, DARONDA PIGGOTT AR 724548253.0 NONE RETIRED 250.0 01-AUG-11 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 749073 Republican 4 C00410118 P20002978 Bachmann, Michelle WARDENBURG, HAROLD HOT SPRINGS NATION AR 719016467.0 NONE RETIRED 300.0 20-JUN-11 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 736166 Republican
df[ 'party' ] . unique( )
array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)
df[ 'party' ] . value_counts( )
Democrat 289999
Republican 234300
Reform 5313
Libertarian 702
Name: party, dtype: int64
df. groupby( by= 'party' ) [ 'contb_receipt_amt' ] . sum ( )
party
Democrat 8.259441e+07
Libertarian 4.132769e+05
Reform 3.429658e+05
Republican 1.251181e+08
Name: contb_receipt_amt, dtype: float64
df. groupby( by= [ 'contb_receipt_dt' , 'party' ] ) [ 'contb_receipt_amt' ] . sum ( )
contb_receipt_dt party
01-APR-11 Reform 50.00
Republican 12635.00
01-AUG-11 Democrat 182198.00
Libertarian 1000.00
Reform 1847.00
...
31-MAY-11 Republican 313839.80
31-OCT-11 Democrat 216971.87
Libertarian 4250.00
Reform 3205.00
Republican 751542.36
Name: contb_receipt_amt, Length: 1183, dtype: float64
df. head( )
def transformData ( d) :
day, month, year = d. split( '-' )
month = months[ month]
return '20' + year + '-' + str ( month) + '-' + day
df[ 'contb_receipt_dt' ] = df[ 'contb_receipt_dt' ] . map ( transformData)
df. head( )
cmte_id cand_id cand_nm contbr_nm contbr_city contbr_st contbr_zip contbr_employer contbr_occupation contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp file_num party 0 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 366010290.0 RETIRED RETIRED 250.0 2011-6-20 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 736166 Republican 1 C00410118 P20002978 Bachmann, Michelle HARVEY, WILLIAM MOBILE AL 366010290.0 RETIRED RETIRED 50.0 2011-6-23 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 736166 Republican 2 C00410118 P20002978 Bachmann, Michelle SMITH, LANIER LANETT AL 368633403.0 INFORMATION REQUESTED INFORMATION REQUESTED 250.0 2011-7-05 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 749073 Republican 3 C00410118 P20002978 Bachmann, Michelle BLEVINS, DARONDA PIGGOTT AR 724548253.0 NONE RETIRED 250.0 2011-8-01 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 749073 Republican 4 C00410118 P20002978 Bachmann, Michelle WARDENBURG, HAROLD HOT SPRINGS NATION AR 719016467.0 NONE RETIRED 300.0 2011-6-20 NOT PROVIDE NOT PROVIDE NOT PROVIDE SA17A 736166 Republican
df_old = df. loc[ df[ 'contbr_occupation' ] == 'DISABLED VETERAN' ]
df_old. groupby( by= 'cand_nm' ) [ 'contb_receipt_amt' ] . sum ( )
cand_nm
Cain, Herman 300.00
Obama, Barack 4205.00
Paul, Ron 2425.49
Santorum, Rick 250.00
Name: contb_receipt_amt, dtype: float64
参考视频链接:https://www.bilibili.com/video/BV1Bi4y1m7k7/?p=27&share_source=copy_web&vd_source=1170c577d779798202386e1f343fe38b