python学习记录（3）——数据分析实战

最新推荐文章于 2024-07-22 14:18:49 发布

weixin_45902527

最新推荐文章于 2024-07-22 14:18:49 发布

阅读量522

点赞数

分类专栏： python学习记录文章标签：数据分析 python 数据挖掘

本文链接：https://blog.csdn.net/weixin_45902527/article/details/122681868

版权

python学习记录专栏收录该内容

5 篇文章 0 订阅

订阅专栏

阿里天池-用Pandas揭秘美国选民的总统喜好
赛事地址https://tianchi.aliyun.com/competition/entrance/531837/forum

import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
import os

candidates = pd.read_csv("weball20.txt", sep = '|',names=['CAND_ID','CAND_NAME','CAND_ICI','PTY_CD','CAND_PTY_AFFILIATION','TTL_RECEIPTS',
                                                          'TRANS_FROM_AUTH','TTL_DISB','TRANS_TO_AUTH','COH_BOP','COH_COP','CAND_CONTRIB',
                                                          'CAND_LOANS','OTHER_LOANS','CAND_LOAN_REPAY','OTHER_LOAN_REPAY','DEBTS_OWED_BY',
                                                          'TTL_INDIV_CONTRIB','CAND_OFFICE_ST','CAND_OFFICE_DISTRICT','SPEC_ELECTION','PRIM_ELECTION','RUN_ELECTION'
                                                          ,'GEN_ELECTION','GEN_ELECTION_PRECENT','OTHER_POL_CMTE_CONTRIB','POL_PTY_CONTRIB',
                                                          'CVG_END_DT','INDIV_REFUNDS','CMTE_REFUNDS'])
ccl = pd.read_csv("ccl.txt", sep="|", names=['CAND_ID','CAND_ELECTION_YR','FEC_ELECTION_YR','CMTE_ID','CMTE_TP','CMTE_DSGN','LINKAGE_ID'])

ccl = pd.merge(ccl,candidates)
ccl = pd.DataFrame(ccl,columns=['CMTE_ID','CAND_ID', 'CAND_NAME','CAND_PTY_AFFILIATION'])
ccl.head(20)

itcont = pd.read_csv("itcont_2020_20200722_20200820.txt",sep = '|',names=['CMTE_ID','AMNDT_IND','RPT_TP','TRANSACTION_PGI',
                                                                                  'IMAGE_NUM','TRANSACTION_TP','ENTITY_TP','NAME','CITY',
                                                                                  'STATE','ZIP_CODE','EMPLOYER','OCCUPATION','TRANSACTION_DT',
                                                                                  'TRANSACTION_AMT','OTHER_ID','TRAN_ID','FILE_NUM','MEMO_CD',
                                                                                  'MEMO_TEXT','SUB_ID'])
c_itcont = pd.merge(ccl,itcont)
c_itcont = pd.DataFrame(c_itcont,columns=[ 'CAND_NAME','NAME', 'STATE','EMPLOYER','OCCUPATION',
                                           'TRANSACTION_AMT', 'TRANSACTION_DT','CAND_PTY_AFFILIATION'])
##数据清洗，补充缺失的列，将时间，日期规范化
c_itcont['STATE'].fillna('NOT PROVIDED',inplace=True)
c_itcont['EMPLOYER'].fillna('NOT PROVIDED',inplace=True)
c_itcont['OCCUPATION'].fillna('NOT PROVIDED',inplace=True)

c_itcont['TRANSACTION_DT']=c_itcont['TRANSACTION_DT'].astype(str)
c_itcont['TRANSACTION_DT'] = [i[3:7]+i[0]+i[1:3] for i in c_itcont['TRANSACTION_DT'] ]

##数据分析，捐款总数最多的党派、候选人、职业、州
sum_AFFILIATION = c_itcont.groupby("CAND_PTY_AFFILIATION").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)

sum_CANDIDATE = c_itcont.groupby("CAND_NAME").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)

sum_OCCUPATION = c_itcont.groupby("OCCUPATION").sum().sort_values("TRANSACTION_AMT",ascending=False).head(10)

c_itcont['OCCUPATION'].value_counts().head(10)

sum_STATE = c_itcont.groupby("STATE").sum().sort_values("TRANSACTION_AMT",ascending=False).head(5)

c_itcont['STATE'].value_counts().head(5)

##数据可视化
st_amt = c_itcont.groupby('STATE').sum().sort_values("TRANSACTION_AMT",ascending=False)[:10]
st_amt = pd.DataFrame(st_amt,columns=["TRANSACTION_AMT"])
st_amt.plot(kind='bar')

st_amt = c_itcont.groupby('STATE').size().sort_values(ascending=False).head(10)
st_amt.plot(kind='bar')

biden = c_itcont[c_itcont['CAND_NAME']=='BIDEN, JOSEPH R JR']
biden_state = biden.groupby('STATE').sum().sort_values("TRANSACTION_AMT", ascending=False).head(10)
biden_state.plot.pie(figsize=(10, 10),autopct='%0.2f%%',subplots=True)
plt.show()

os.rename('TB10Jx4pBBh1e4jSZFhXXcC9VXa-689-390.jpg','biden.jpg')
data = ' '.join(biden['NAME'].tolist())
bg = plt.imread('biden.jpg')
wc = WordCloud(
    background_color='white',
    width=890,
    height=900,
    mask=bg,
    margin=10,
    max_font_size=100,
    random_state=20,
).generate_from_text(data)
big_color = ImageColorGenerator(bg)
plt.imshow(wc.recolor(color_func=big_color))
plt.axis('off')
wc.to_file("biden_wordcloud.png")

weixin_45902527

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python学习记录（3）——数据分析实战

阿里天池-用Pandas揭秘美国选民的总统喜好赛事地址https://tianchi.aliyun.com/competition/entrance/531837/forumimport pandas as pdimport matplotlib.pyplot as pltfrom wordcloud import WordCloud,ImageColorGeneratorimport oscandidates = pd.read_csv("weball20.txt", sep = '|',n
复制链接

扫一扫