阿里天池-用Pandas揭秘美国选民的总统喜好
赛事地址https://tianchi.aliyun.com/competition/entrance/531837/forum
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
import os
candidates = pd.read_csv("weball20.txt", sep = '|',names=['CAND_ID','CAND_NAME','CAND_ICI','PTY_CD','CAND_PTY_AFFILIATION','TTL_RECEIPTS',
'TRANS_FROM_AUTH','TTL_DISB','TRANS_TO_AUTH','COH_BOP','COH_COP','CAND_CONTRIB',
'CAND_LOANS','OTHER_LOANS','CAND_LOAN_REPAY','OTHER_LOAN_REPAY','DEBTS_OWED_BY',
'TTL_INDIV_CONTRIB','CAND_OFFICE_ST','CAND_OFFICE_DISTRICT','SPEC_ELECTION','PRIM_ELECTION','RUN_ELECTION'
,'GEN_ELECTION','GEN_ELECTION_PRECENT','OTHER_POL_CMTE_CONTRIB','POL_PTY_CONTRIB',
'CVG_END_DT','INDIV_REFUNDS','CMTE_REFUNDS'])
ccl = pd.read_csv("ccl.txt", sep="|", names=['CAND_ID','CAND_ELECTION_YR','FEC_ELECTION_YR','CMTE_ID','CMTE_TP','CMTE_DSGN','LINKAGE_ID'])
ccl = pd.merge(ccl,candidates)
ccl = pd.DataFrame(ccl,columns=['CMTE_ID','CAND_ID', 'CAND_NAME','CAND_PTY_AFFILIATION'])
ccl.head(20)
itcont = pd.read_csv("itcont_2020_20200722_20200820.txt",sep = '|',names=['CMTE_ID','AMNDT_IND','RPT_TP','TRANSACTION_PGI',
'IMAGE_NUM','TRANSACTION_TP','ENTITY_TP','NAME','CITY',
'STATE','ZIP_CODE','EMPLOYER','OCCUPATION','TRANSACTION_DT',
'TRANSACTION_AMT','OTHER_ID','TRAN_ID','FILE_NUM','MEMO_CD',
'MEMO_TEXT','SUB_ID'])
c_itcont = pd.merge(ccl,itcont)
c_itcont = pd.DataFrame(c_itcont,columns=[ 'CAND_NAME','NAME', 'STATE','EMPLOYER','OCCUPATION',
'TRANSACTION_AMT', 'TRANSACTION_DT','CAND_PTY_AFFILIATION'])
##数据清洗,补充缺失的列,将时间,日期规范化
c_itcont['STATE'].fillna('NOT PROVIDED',inplace=True)
c_itcont['EMPLOYER'].fillna('NOT PROVIDED',inplace=True)
c_itcont['OCCUPATION'].fillna('NOT PROVIDED',inplace=True)
c_itcont['TRANSACTION_DT']=c_itcont['TRANSACTION_DT'].astype(str)
c_itcont['TRANSACTION_DT'] = [i[3:7]+i[0]+i[1:3] for i in c_itcont['TRANSACTION_DT'] ]
##数据分析,捐款总数最多的党派、候选人、职业、州
sum_AFFILIATION = c_itcont.groupby("CAND_PTY_AFFILIATION").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
sum_CANDIDATE = c_itcont.groupby("CAND_NAME").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
sum_OCCUPATION = c_itcont.groupby("OCCUPATION").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)
c_itcont['OCCUPATION'].value_counts().head(10)
sum_STATE = c_itcont.groupby("STATE").sum().sort_values("TRANSACTION_AMT",ascending=False).head(5)
c_itcont['STATE'].value_counts().head(5)
##数据可视化
st_amt = c_itcont.groupby('STATE').sum().sort_values("TRANSACTION_AMT",ascending=False)[:10]
st_amt = pd.DataFrame(st_amt,columns=["TRANSACTION_AMT"])
st_amt.plot(kind='bar')
st_amt = c_itcont.groupby('STATE').size().sort_values(ascending=False).head(10)
st_amt.plot(kind='bar')
biden = c_itcont[c_itcont['CAND_NAME']=='BIDEN, JOSEPH R JR']
biden_state = biden.groupby('STATE').sum().sort_values("TRANSACTION_AMT", ascending=False).head(10)
biden_state.plot.pie(figsize=(10, 10),autopct='%0.2f%%',subplots=True)
plt.show()
os.rename('TB10Jx4pBBh1e4jSZFhXXcC9VXa-689-390.jpg','biden.jpg')
data = ' '.join(biden['NAME'].tolist())
bg = plt.imread('biden.jpg')
wc = WordCloud(
background_color='white',
width=890,
height=900,
mask=bg,
margin=10,
max_font_size=100,
random_state=20,
).generate_from_text(data)
big_color = ImageColorGenerator(bg)
plt.imshow(wc.recolor(color_func=big_color))
plt.axis('off')
wc.to_file("biden_wordcloud.png")