#导入相关包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#读取候选人信息,原始数据没有表头,要添加表头
data1 = pd.read_csv('weball20.txt',sep = '|',names=['CAND_ID','CAND_NAME','CAND_ICI','PTY_CD','CAND_PTY_AFFILIATION','TTL_RECEIPTS',
'TRANS_FROM_AUTH','TTL_DISB','TRANS_TO_AUTH','COH_BOP','COH_COP','CAND_CONTRIB',
'CAND_LOANS','OTHER_LOANS','CAND_LOAN_REPAY','OTHER_LOAN_REPAY','DEBTS_OWED_BY',
'TTL_INDIV_CONTRIB','CAND_OFFICE_ST','CAND_OFFICE_DISTRICT','SPEC_ELECTION','PRIM_ELECTION','RUN_ELECTION'
,'GEN_ELECTION','GEN_ELECTION_PRECENT','OTHER_POL_CMTE_CONTRIB','POL_PTY_CONTRIB',
'CVG_END_DT','INDIV_REFUNDS','CMTE_REFUNDS'])
#读取候选人和委员会的关系
data2 = pd.read_csv('ccl.txt',sep = '|',names=['CAND_ID','CAND_ELECTION_YR','FEC_ELECTION_YR','CMTE_ID','CMTE_TP','CMTE_DSGN','LINKAGE_ID'])
#通过委员会id,CAND_ID关联data1和data2,就是将data2多出的列加到data1中,合并两张表
data3 = pd.merge(data1,data2)
#抽取出需要的列
#cmte_id 委员会鉴定
#cand_id 候选人身份证明
#cand_name 候选人名称
#CAND_PTY_AFFILIATION 党派关系
data4 = pd.DataFrame(data3,columns=[ 'CMTE_ID','CAND_ID', 'CAND_NAME','CAND_PTY_AFFILIATION'])
#读取个人捐赠表,添加表头
data5 = pd.read_csv('itcont.txt', sep='|',names=['CMTE_ID','AMNDT_IND','RPT_TP','TRANSACTION_PGI',
'IMAGE_NUM','TRANSACTION_TP','ENTITY_TP','NAME','CITY',
'STATE','ZIP_CODE','EMPLOYER','OCCUPATION','TRANSACTION_DT',
'TRANSACTION_AMT','OTHER_ID','TRAN_ID','FILE_NUM','MEMO_CD',
'MEMO_TEXT','SUB_ID'])
#通过cmte_id,关联data4和data5
data6 = pd.merge(data4,data5)
#提取需要的列
#cand_name 候选人名称
#name 出资人
#state 状态
#employer 雇主
#OCCUPATION
data7 = pd.DataFrame(data6,columns=[ 'CAND_NAME','NAME', 'STATE','EMPLOYER','OCCUPATION',
'TRANSACTION_AMT', 'TRANSACTION_DT','CAND_PTY_AFFILIATION'])
#空值处理,统一填充 NOT PROVIDED
data7['STATE'].fillna('NOT PROVIDED',inplace=True)
data7['EMPLOYER'].fillna('NOT PROVIDED',inplace=True)
data7['OCCUPATION'].fillna('NOT PROVIDED',inplace=True)
# 对日期TRANSACTION_DT列进行处理
data7['TRANSACTION_DT'] = c_itcont['TRANSACTION_DT'] .astype(str)
# 将日期格式改为年月日 7242020
data7['TRANSACTION_DT'] = [i[3:7]+i[0]+i[1:3] for i in c_itcont['TRANSACTION_DT'] ]
# 从所有数据中取出支持拜的数据
biden = data7[data7['CAND_NAME']=='BIDEN, JOSEPH R JR']
# 统计各州对拜的捐款总数
biden_state = biden.groupby('STATE').sum().sort_values("TRANSACTION_AMT", ascending=False)
q=np.array(biden_state) #创建ndarray 对象
number_b=len(q) #支持拜州的数量
# 从所有数据中取出支持特朗的数据
trump = data7[data7['CAND_NAME']=='TRUMP, DONALD J.']
# 统计各州对特朗的捐款总数
trump_state = trump.groupby('STATE').sum().sort_values("TRANSACTION_AMT", ascending=False)
w=np.array(trump_state) #创建ndarray 对象
number_t=len(w) #支持特朗州的数量
#pivot_table作用是按职业和捐赠数额聚合数据,index=['STATE']取出捐赠人所在的职业,values=['TRANSACTION_AMT']取出捐赠人捐赠的数额,aggfunc='sum'计算相同州的总额
by_occupation=pd.pivot_table(data7,index=['STATE'],values=['TRANSACTION_AMT'],aggfunc='sum')
number=len(by_occupation) #州的数量
# 从所有数据中取出支持拜的数据
biden = data7[data7['CAND_NAME']=='BIDEN, JOSEPH R JR']
# 统计各职业对拜的捐款总数
biden_state = biden.groupby('OCCUPATION').sum().sort_values("TRANSACTION_AMT", ascending=False)
e=np.array(biden_state) #创建ndarray 对象
# 从所有数据中取出支持特朗的数据
biden = data7[data7['CAND_NAME']=='TRUMP, DONALD J.']
# 统计各职业对特朗的捐款总数
biden_state = biden.groupby('OCCUPATION').sum().sort_values("TRANSACTION_AMT", ascending=False)
r=np.array(biden_state) #创建ndarray 对象
#pivot_table作用是按职业和捐赠数额聚合数据,index=['OCCUPATION']取出捐赠人所在的职业,values=['TRANSACTION_AMT']取出捐赠人捐赠的数额,aggfunc='sum'计算相同职业的总额
by_occupation=pd.pivot_table(c_itcont,index=['OCCUPATION'],values=['TRANSACTION_AMT'],aggfunc='sum')
number_j=len(by_occupation) #职业的总数量
print('州的总数:{}'.format(number))
print('支持BIDEN, JOSEPH R JR州的数量:{}'.format(number_b))
print('支持特朗州的数量:{}'.format(number_t))
print('职业的总数:{}'.format(number_j))
print('支持BIDEN, JOSEPH R JR的数量:{}'.format(len(e)))
print('支持特朗的数量:{}'.format(len(r)))
python 数据分析入门实战
最新推荐文章于 2022-12-12 18:12:46 发布