数据集的处理、数据探索与清晰、数据分析、数据可视化四部分
1.数据处理
进行数据处理前,我们需要知道我们最终想要的数据是什么样的,因为我们是想分析候选人与捐赠人之间的关系,所以我们想要一张数据表中有捐赠人与候选人一一对应的关系,所以需要将目前的三张数据表进行一一关联,汇总到需要的数据
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
1.1 将委员会和候选人一一对应,通过CAND_ID关联两个表
由于候选人和委员会的联系表中无候选人姓名,只有候选人ID(CAND_ID),所以需要通过CAND_ID从候选人表中获取到候选人姓名,最终得到候选人与委员会联系表ccl。
CAND_ID 候选人ID
CAND_NAME 候选人姓名
CAND_PTY_AFFILIATION 候选人党派
CAND_ID 候选人ID
CAND_ELECTION_YR 候选人选举年份
CMTE_ID 委员会ID
CMTE_ID 委员会ID
NAME 捐款人姓名
CITY 捐款人所在市
State 捐款人所在州
EMPLOYER 捐款人雇主/公司
OCCUPATION 捐款人职业
candidates = pd. read_csv( "./weball20.txt" , sep = '|' , names= [ 'CAND_ID' , 'CAND_NAME' , 'CAND_ICI' , 'PTY_CD' , 'CAND_PTY_AFFILIATION' , 'TTL_RECEIPTS' ,
'TRANS_FROM_AUTH' , 'TTL_DISB' , 'TRANS_TO_AUTH' , 'COH_BOP' , 'COH_COP' , 'CAND_CONTRIB' ,
'CAND_LOANS' , 'OTHER_LOANS' , 'CAND_LOAN_REPAY' , 'OTHER_LOAN_REPAY' , 'DEBTS_OWED_BY' ,
'TTL_INDIV_CONTRIB' , 'CAND_OFFICE_ST' , 'CAND_OFFICE_DISTRICT' , 'SPEC_ELECTION' , 'PRIM_ELECTION' , 'RUN_ELECTION'
, 'GEN_ELECTION' , 'GEN_ELECTION_PRECENT' , 'OTHER_POL_CMTE_CONTRIB' , 'POL_PTY_CONTRIB' ,
'CVG_END_DT' , 'INDIV_REFUNDS' , 'CMTE_REFUNDS' ] )
ccl = pd. read_csv( "./ccl.txt" , sep = '|' , names= [ 'CAND_ID' , 'CAND_ELECTION_YR' , 'FEC_ELECTION_YR' , 'CMTE_ID' , 'CMTE_TP' , 'CMTE_DSGN' , 'LINKAGE_ID' ] )
ccl = pd. merge( ccl, candidates)
ccl = pd. DataFrame( ccl, columns= [ 'CMTE_ID' , 'CAND_ID' , 'CAND_NAME' , 'CAND_PTY_AFFILIATION' ] )
ccl. head( 10 )
CMTE_ID CAND_ID CAND_NAME CAND_PTY_AFFILIATION 0 C00697789 H0AL01055 CARL, JERRY LEE, JR REP 1 C00701557 H0AL01063 LAMBERT, DOUGLAS WESTLEY III REP 2 C00701409 H0AL01071 PRINGLE, CHRISTOPHER PAUL REP 3 C00703066 H0AL01089 HIGHTOWER, BILL REP 4 C00708867 H0AL01097 AVERHART, JAMES DEM 5 C00710947 H0AL01105 GARDNER, KIANI A DEM 6 C00722512 H0AL01121 CASTORANI, JOHN REP 7 C00725069 H0AL01139 COLLINS, FREDERICK G. RICK' DEM 8 C00462143 H0AL02087 ROBY, MARTHA REP 9 C00493783 H0AL02087 ROBY, MARTHA REP
itcont = pd. read_csv( './by_date/itcont_2020_20200712_20200723.txt' , sep= '|' , names= [ 'CMTE_ID' , 'AMNDT_IND' , 'RPT_TP' , 'TRANSACTION_PGI' ,
'IMAGE_NUM' , 'TRANSACTION_TP' , 'ENTITY_TP' , 'NAME' , 'CITY' ,
'STATE' , 'ZIP_CODE' , 'EMPLOYER' , 'OCCUPATION' , 'TRANSACTION_DT' ,
'TRANSACTION_AMT' , 'OTHER_ID' , 'TRAN_ID' , 'FILE_NUM' , 'MEMO_CD' ,
'MEMO_TEXT' , 'SUB_ID' ] )
/home/dkky528_1/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (10,18) have mixed types.Specify dtype option on import or set low_memory=False.
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
c_itcont = pd. merge( ccl, itcont)
c_itcont = pd. DataFrame( c_itcont, columns= [ 'CAND_NAME' , 'NAME' , 'STATE' , 'EMPLOYER' , 'OCCUPATION' ,
'TRANSACTION_AMT' , 'TRANSACTION_DT' , 'CAND_PTY_AFFILIATION' ] )
数据说明
CAND_NAME – 接受捐赠的候选人姓名
NAME – 捐赠人姓名
STATE – 捐赠人所在州
EMPLOYER – 捐赠人所在公司
OCCUPATION – 捐赠人职业
TRANSACTION_AMT – 捐赠数额(美元)
TRANSACTION_DT – 收到捐款的日期
CAND_PTY_AFFILIATION – 候选人党派
c_itcont. head( 10 )
CAND_NAME NAME STATE EMPLOYER OCCUPATION TRANSACTION_AMT TRANSACTION_DT CAND_PTY_AFFILIATION 0 CARL, JERRY LEE, JR ORRELL, SIDNEY J. JR. AL RETIRED RETIRED 25 7132020 REP 1 CARL, JERRY LEE, JR GRIGGS, STEPHEN FL AEROCARE USA PRESIDENT 2800 7152020 REP 2 CARL, JERRY LEE, JR GERLING, TARA MO FAMILY FACTOR PATIENT ADVOCATE 200 7172020 REP 3 CARL, JERRY LEE, JR MILLER, BOYD AL CABANISS & JOHNSTON ATTORNEY 250 7172020 REP 4 CARL, JERRY LEE, JR SIDDIQUI, SHAHRURH AL PROSPEROUS VENTURES LLC NaN 1000 7202020 REP 5 CARL, JERRY LEE, JR GALLASPY, GLENN AL RETIRED RETIRED 250 7172020 REP 6 CARL, JERRY LEE, JR HOWARD, STEWART L. AL HOWARD FESTA LLP PARTNER 1000 7132020 REP 7 HIGHTOWER, BILL HILLMAN, TATNALL CO NaN NaN 2800 7152020 REP 8 HIGHTOWER, BILL HILLMAN, TATNALL CO NaN NaN 2800 7152020 REP 9 HIGHTOWER, BILL WINTERSTEEN, JIM CA NaN NaN 2800 7152020 REP
2. 数据探索与清洗
调用shape属性查看数据的规模,调用info函数查看数据信息,调用describe函数查看数据分布
c_itcont. shape
(405152, 8)
c_itcont. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 405152 entries, 0 to 405151
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CAND_NAME 405152 non-null object
1 NAME 405152 non-null object
2 STATE 405073 non-null object
3 EMPLOYER 369711 non-null object
4 OCCUPATION 392749 non-null object
5 TRANSACTION_AMT 405152 non-null int64
6 TRANSACTION_DT 405152 non-null int64
7 CAND_PTY_AFFILIATION 405152 non-null object
dtypes: int64(2), object(6)
memory usage: 27.8+ MB
c_itcont[ 'STATE' ] . fillna( 'NOT PROVIDED' , inplace= True )
c_itcont[ 'EMPLOYER' ] . fillna( 'NOT PROVIDED' , inplace= True )
c_itcont[ 'OCCUPATION' ] . fillna( 'NOT PROVIDED' , inplace= True )
c_itcont[ 'TRANSACTION_DT' ] = c_itcont[ 'TRANSACTION_DT' ] . astype( str )
c_itcont[ 'TRANSACTION_DT' ] = [ i[ 3 : 7 ] + i[ 0 ] + i[ 1 : 3 ] for i in c_itcont[ 'TRANSACTION_DT' ] ]
c_itcont. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 405152 entries, 0 to 405151
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CAND_NAME 405152 non-null object
1 NAME 405152 non-null object
2 STATE 405152 non-null object
3 EMPLOYER 405152 non-null object
4 OCCUPATION 405152 non-null object
5 TRANSACTION_AMT 405152 non-null int64
6 TRANSACTION_DT 405152 non-null object
7 CAND_PTY_AFFILIATION 405152 non-null object
dtypes: int64(1), object(7)
memory usage: 27.8+ MB
c_itcont. head( 3 )
CAND_NAME NAME STATE EMPLOYER OCCUPATION TRANSACTION_AMT TRANSACTION_DT CAND_PTY_AFFILIATION 0 CARL, JERRY LEE, JR ORRELL, SIDNEY J. JR. AL RETIRED RETIRED 25 2020713 REP 1 CARL, JERRY LEE, JR GRIGGS, STEPHEN FL AEROCARE USA PRESIDENT 2800 2020715 REP 2 CARL, JERRY LEE, JR GERLING, TARA MO FAMILY FACTOR PATIENT ADVOCATE 200 2020717 REP
c_itcont. describe( )
TRANSACTION_AMT count 4.051520e+05 mean 1.718179e+02 std 4.972200e+03 min -5.600000e+03 25% 2.000000e+01 50% 3.500000e+01 75% 1.000000e+02 max 3.000000e+06
c_itcont[ 'CAND_NAME' ] . describe( )
count 405152
unique 1175
top BIDEN, JOSEPH R JR
freq 106303
Name: CAND_NAME, dtype: object
3.数据分析
c_itcont. groupby( "CAND_PTY_AFFILIATION" ) . sum ( ) . sort_values( "TRANSACTION_AMT" , ascending= False ) . head( 10 )
TRANSACTION_AMT CAND_PTY_AFFILIATION DEM 37279654 REP 28282701 OTH 3000215 DFL 568120 IND 396280 LIB 63667 GRE 13047 NPP 2000 UNK 1995 CON 1492
c_itcont. groupby( "CAND_NAME" ) . sum ( ) . sort_values( "TRANSACTION_AMT" , ascending= False ) . head( 10 )
TRANSACTION_AMT CAND_NAME BIDEN, JOSEPH R JR 10149060 SULLIVAN, DAN 3645743 WILSON, WILLIE 3000000 TRUMP, DONALD J. 2620822 JACOBS, CHRISTOPHER L. 2083750 GIDEON, SARA 1792023 MCSALLY, MARTHA 1319246 HARRISON, JAIME 1261889 KELLY, MARK 1139390 GOROFF, NANCY 1125116
获得捐赠最多的党派有DEM(民主党)、REP(共和党),分别对应BIDEN, JOSEPH R JR(拜登)和TRUMP, DONALD J.(特朗普)
c_itcont. groupby( 'OCCUPATION' ) . sum ( ) . sort_values( "TRANSACTION_AMT" , ascending= False ) . head( 10 )
TRANSACTION_AMT OCCUPATION RETIRED 15209674 NOT EMPLOYED 9275081 NOT PROVIDED 4810975 BUSINESS OWNER 3299087 ATTORNEY 2804185 CEO 1572726 PROFESSOR 1527392 HOMEMAKER 1225005 PHYSICIAN 1220865 PRESIDENT 1013452
c_itcont[ 'OCCUPATION' ] . value_counts( ) . head( 10 )
RETIRED 130653
NOT EMPLOYED 94869
NOT PROVIDED 12410
ATTORNEY 10230
PHYSICIAN 6772
PROFESSOR 4308
ENGINEER 3759
CONSULTANT 3629
TEACHER 3467
SALES 2458
Name: OCCUPATION, dtype: int64
从捐款人的角度来看,RETIRED(退休人员)的总捐赠额是最多的,其次是NOT EMPLOYED(自由职业)。
c_itcont. groupby( 'STATE' ) . sum ( ) . sort_values( "TRANSACTION_AMT" , ascending= False ) . head( 5 )
TRANSACTION_AMT STATE CA 9982172 NY 7492819 IL 4882727 TX 4855937 FL 4297560
c_itcont[ 'STATE' ] . value_counts( ) . head( 5 )
CA 65796
TX 32606
FL 26945
NY 25910
WA 14869
Name: STATE, dtype: int64
4.数据可视化
% matplotlib inline
from wordcloud import WordCloud, ImageColorGenerator
st_amt = c_itcont. groupby( 'STATE' ) . sum ( ) . sort_values( "TRANSACTION_AMT" , ascending= False ) [ : 10 ]
st_amt = pd. DataFrame( st_amt, columns= [ 'TRANSACTION_AMT' ] )
st_amt. plot( kind= 'bar' )
<AxesSubplot:xlabel='STATE'>
st_amt = c_itcont. groupby( 'STATE' ) . size( ) . sort_values( ascending= False ) . head( 10 )
st_amt. plot( kind= 'bar' )
<AxesSubplot:xlabel='STATE'>
biden = c_itcont[ c_itcont[ 'CAND_NAME' ] == 'BIDEN, JOSEPH R JR' ]
biden_state = biden. groupby( 'STATE' ) . sum ( ) . sort_values( 'TRANSACTION_AMT' , ascending= False ) . head( 10 )
biden_state. plot. pie( figsize= ( 10 , 10 ) , autopct= '%0.2f%%' , subplots= True )
array([<AxesSubplot:ylabel='TRANSACTION_AMT'>], dtype=object)
!wget https: // img. alicdn. com/ tfs/ TB10Jx4pBBh1e4jSZFhXXcC9VXa- 689 - 390. jpg
--2021-11-13 15:23:23-- https://img.alicdn.com/tfs/TB10Jx4pBBh1e4jSZFhXXcC9VXa-689-390.jpg
正在连接 192.168.1.20:1079... 已连接。
已发出 Proxy 请求,正在等待回应... 200 OK
长度: 4236 (4.1K) [image/jpeg]
正在保存至: “TB10Jx4pBBh1e4jSZFhXXcC9VXa-689-390.jpg”
TB10Jx4pBBh1e4jSZFh 100%[===================>] 4.14K --.-KB/s 用时 0s
2021-11-13 15:23:23 (384 MB/s) - 已保存 “TB10Jx4pBBh1e4jSZFhXXcC9VXa-689-390.jpg” [4236/4236])
6])
import os
os. rename( 'TB10Jx4pBBh1e4jSZFhXXcC9VXa-689-390.jpg' , 'biden.jpg' )
data = ' ' . join( biden[ 'NAME' ] . tolist( ) )
bg = plt. imread( 'biden.jpg' )
wc = WordCloud(
background_color= 'white' ,
width= 890 ,
height= 600 ,
mask= bg,
margin= 10 ,
max_font_size= 100 ,
random_state= 20 ,
) . generate_from_text( data)
bg_color = ImageColorGenerator( bg)
plt. imshow( wc. recolor( color_func= bg_color) )
plt. axis( 'off' )
wc. to_file( "biden_wordcloud.png" )
<wordcloud.wordcloud.WordCloud at 0x7fc9b447ca30>
st_amt = c_itcont. groupby( 'STATE' ) . sum ( ) . sort_values( "TRANSACTION_AMT" , ascending = False )
import seaborn as sb
import matplotlib. pyplot as plt
plt. figure( figsize= ( 8 , 8 ) )
sb. heatmap( st_amt, annot= True )
plt. show( )
firsttwo_name = c_itcont. groupby( "CAND_NAME" ) . sum ( ) . sort_values( "TRANSACTION_AMT" , ascending= False ) . iloc[ : 2 ]
BIDEN = c_itcont[ c_itcont[ "CAND_NAME" ] == firsttwo_name. index[ 0 ] ]
TRUMP = c_itcont[ c_itcont[ "CAND_NAME" ] == firsttwo_name. index[ 1 ] ]
x= BIDEN. groupby( "TRANSACTION_DT" ) [ "TRANSACTION_AMT" ] . sum ( ) . index
y1 = BIDEN. groupby( "TRANSACTION_DT" ) [ "TRANSACTION_AMT" ] . sum ( )
y2 = TRUMP. groupby( "TRANSACTION_DT" ) [ "TRANSACTION_AMT" ] . sum ( )
plt. figure( figsize= ( 20 , 8 ) )
plt. plot( x, y1, label= 'BIDEN' )
plt. plot( x, y2, label = 'TRUMP' )
plt. legend( )
plt. xlabel( "TRANSACTION_DT" )
plt. ylabel( "TRANSACTION_AMT" )
plt. xticks( rotation= 45 )
plt. show( )
BIDEN = c_itcont[ c_itcont[ "CAND_NAME" ] == "BIDEN, JOSEPH R JR" ] . groupby( "STATE" ) [ "TRANSACTION_AMT" ] . sum ( )
TRUMP = c_itcont[ c_itcont[ "CAND_NAME" ] == "TRUMP, DONALD J." ] . groupby( "STATE" ) [ "TRANSACTION_AMT" ] . sum ( )
reBIDEN= BIDEN[ BIDEN. index. isin( TRUMP. index) ]
reTRUMP= TRUMP[ TRUMP. index. isin( BIDEN. index) ]
x = reBIDEN. index
plt. figure( figsize= ( 20 , 8 ) )
bar_width = 0.4
index_reBIDEN = np. arange( len ( x) )
index_reTRUMP = index_reBIDEN + bar_width
plt. bar( index_reBIDEN, reBIDEN, width= bar_width, label= 'BIDEN' )
plt. bar( index_reTRUMP, reTRUMP, width= bar_width, label = 'TRUMP' )
plt. legend( )
plt. xticks( index_reBIDEN + bar_width/ 2 , x, rotation= 45 )
plt. xlabel( "STATE" )
plt. ylabel( "TRANSACTION_AMT" )
plt. show( )