高校基本信息数据集成-变换 -清洗
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 19:39:21 2019
@author: Nemo
"""
#%%
import pandas as pd
import os
pd.set_option('display.float_format', lambda x: '%.3f' % x)
#%%
#招生代码
work_directory1 = r'D:\Nemo\高考分析\\所需数据\学院代码'
work_directory2 = r'D:\Nemo\高考分析\\所需数据\院校批次'
work_directory31 = r'D:\Nemo\高考分析\\所需数据\院校信息\院校信息-地区分类'
work_directory32 = r'D:\Nemo\高考分析\\所需数据\院校信息\院校信息-院校特色(重点建设)'
work_directory4 = r'D:\Nemo\高考分析\\所需数据\招生计划'
work_directory5 = r'D:\Nemo\高考分析\\所需数据'
work_directory6 = r'D:\Nemo\高考分析\\所需数据\学校性质'
work_directory7 = r'D:\Nemo\高考分析\\所需数据\是否新招'
#%%
#获取工作路径下的文件,全路径
def getfilename(wk):
os.chdir(wk)
filename = os.listdir(wk)
for i in range(len(filename)):
filename[i]=wk+"\\"+filename[i]
return filename
#获取工作路径下的文件
def getfilename2(wk):
os.chdir(wk)
filename = os.listdir(wk)
return filename
#%% collegeCodeResult
#读取数据
filename=getfilename(work_directory1)
#数据获取 collegeCodeResult
collegeCodeResult = pd.DataFrame()
for fname in filename:
xl = pd.ExcelFile(fname)
sheet_names=xl.sheet_names # see all sheet names
for sheet_name in sheet_names:
temp=xl.parse(sheet_name)
collegeCodeResult = pd.concat([collegeCodeResult,temp])
xl.close()
#%% collegeOrderResult
filename=getfilename(work_directory2)
collegeOrderResult = pd.DataFrame()
for fname in filename:
xl = pd.ExcelFile(fname)
sheet_names=xl.sheet_names # see all sheet names
for sheet_name in sheet_names:
temp=xl.parse(sheet_name)
collegeOrderResult = pd.concat([collegeOrderResult,temp])
xl.close()
#%% collegeZoneResult
filename=getfilename(work_directory31)
collegeZoneResult = pd.DataFrame()
aresult=[]
for fname in filename:
xl = pd.ExcelFile(fname)
sheet_names=xl.sheet_names # see all sheet names
for sheet_name in sheet_names:
tmp = str(fname+":"+sheet_name)
if tmp.find("四川.xls:Sheet1")>= 0:
continue
temp=xl.parse(sheet_name)
atmp=list(temp.columns)
atmp.append(fname)
atmp.append(sheet_name)
aresult.append(atmp)
temp["院校级别"]=sheet_name
collegeZoneResult = pd.concat([collegeZoneResult,temp])
xl.close()
#%% collegeTypeResult
sum1=0
filename=getfilename2(work_directory32)
collegeTypeResult = pd.DataFrame()
for fname in filename:
xl = pd.ExcelFile(work_directory32+"\\"+fname)
sheet_names=xl.sheet_names # see all sheet names
for sheet_name in sheet_names:
tmp = str(fname+":"+sheet_name)
if tmp.find("四川.xls:Sheet1")>= 0:
continue
temp=xl.parse(sheet_name)
temp["院校级别"]=fname[0:len(fname)-4]
sum1=sum1+len(temp)
print(sum1)
collegeTypeResult = pd.concat([collegeTypeResult,temp])
xl.close()
##%% collegePlanResult
#filename=getfilename(work_directory4)
#collegePlanResult = pd.DataFrame()
#for fname in filename:
# xl = pd.ExcelFile(fname)
# sheet_names=xl.sheet_names # see all sheet names
# for sheet_name in sheet_names:
# temp=xl.parse(sheet_name)
# collegePlanResult = pd.concat([collegePlanResult,temp])
#%% collegePropertyResult
filename=getfilename(work_directory6)
collegePropertyResult = pd.DataFrame()
for fname in filename:
xl = pd.ExcelFile(fname)
sheet_names=xl.sheet_names # see all sheet names
for sheet_name in sheet_names:
temp=xl.parse(sheet_name)
collegePropertyResult = pd.concat([collegePropertyResult,temp])
xl.close()
collegePropertyResult=collegePropertyResult[['院校名称', '学院性质']]
#%% collegeNewornotResult
filename=getfilename(work_directory7)
collegeNewornotResult = pd.DataFrame()
for fname in filename:
xl = pd.ExcelFile(fname)
sheet_names=xl.sheet_names # see all sheet names
for sheet_name in sheet_names:
temp=xl.parse(sheet_name)
collegeNewornotResult = pd.concat([collegeNewornotResult,temp])
xl.close()
#%%
os.chdir(work_directory5)
filename="学校基本信息(文理共用).xlsx"
collegeBasicinfo = pd.read_excel(filename)
os.chdir(work_directory5)
fileaname= "数据缺失院校(5.6).xlsx"
missingData = {}
xl = pd.ExcelFile(fileaname)
sheet_names=xl.sheet_names # see all sheet names
for sheet_name in sheet_names:
temp=xl.parse(sheet_name)
if missingData.get(sheet_name)==None:
missingData[sheet_name]=temp
xl.close()
#%%
#删除重复数据
rankResult=collegeTypeResult[['院校名称', '院校级别']]
zoneResul1t_type = collegeZoneResult[['院校名称', '院校类型']]
zoneResult_provice = collegeZoneResult[['院校名称', '省份']]
orderResult = collegeOrderResult[['院校名称', '批次']]
codeResult = collegeCodeResult[['招生代码', '学校名称']]
collegeBasicinfo = collegeBasicinfo[['college_name', 'college_url', 'college_phone']]
propertyResult=collegePropertyResult[['院校名称', '学院性质']]
rankResult=rankResult.drop_duplicates()
zoneResul1t_type=zoneResul1t_type.drop_duplicates()
zoneResult_provice=collegeZoneResult.drop_duplicates()
orderResult=orderResult.drop_duplicates()
codeResult=codeResult.drop_duplicates()
propertyResult=propertyResult.drop_duplicates()
collegeBasicinfo=collegeBasicinfo.drop_duplicates()
#%%
os.chdir(work_directory5)
filename = '国内学院(测试数据).xlsx'
sheename = '数据映射表'
xl = pd.ExcelFile(filename)
collegemapinfo = xl.parse(sheename)
sheename = '数据表'
collegemodelinfo = xl.parse(sheename)
xl.close()
#%%缺失数据修复
#missResult = missingData.get('录取批次缺失')
#orderResult = pd.concat([orderResult,missResult])
missResult = missingData.get('录取批次缺失')
orderResult = pd.concat([orderResult,missResult])
missResult = missingData.get('学校类型缺失')
zoneResul1t_type = pd.concat([zoneResul1t_type,missResult])
missResult = missingData.get('省份代码缺失')
zoneResult_provice = pd.concat([zoneResult_provice,missResult])
missResult = missingData.get('招生代码缺失')[['招生代码', '学校名称']]
codeResult = pd.concat([codeResult,missResult])
missResult = missingData.get('学校性质缺失')[['院校名称', '学院性质']]
propertyResult = pd.concat([propertyResult,missResult])
missResult = missingData.get('基本信息缺失')
for i in range(len(missResult)):
t_boolean=collegeBasicinfo["college_name"]==missResult.iloc[i,0]
tpvalues=missResult.loc[i,['college_name', 'college_url', 'college_phone']].values
print(str(collegeBasicinfo.shape)+":---:"+str(i))
if t_boolean.any():
collegeBasicinfo.loc[t_boolean,['college_name', 'college_url', 'college_phone']]=tpvalues
else:
collegeBasicinfo.loc[len(collegeBasicinfo),['college_name', 'college_url', 'college_phone']]=tpvalues
collegeBasicinfo=collegeBasicinfo.drop_duplicates()
#mapping
def get_variable_map(objresult,mappinginfo,varname):
objresult.index = range(len(objresult))
mappinginfo['数值']=mappinginfo['数值'].apply(str)
order=mappinginfo.loc[mappinginfo.类型==varname[0],["数值","说明"]]
order["说明"] = order["说明"].astype(str)
mappingdict = order.set_index('说明').to_dict()['数值']
dictresult={}
for i in range(len(objresult)):
tmpkey=objresult.loc[i,"院校名称"]
tmpvalue=str(objresult.loc[i,varname[1]])
tmpcode = mappingdict.get(tmpvalue,"")
tmpcv=dictresult.get(tmpkey,"")
if tmpcv =='':
dictresult[tmpkey]=str(tmpcode)
else:
if dictresult.get(tmpkey) == str(tmpcode):
continue
else:
dictresult[tmpkey]=dictresult.get(tmpkey)+","+str(tmpcode)
# dictresult[tmpkey]=dictresult.get(tmpkey,"")+"|"+tmpvalue
result= pd.Series(dictresult,index=dictresult.keys())
result=result.reset_index()
result.columns=['院校名称',varname[1]]#设置columns
result=result.loc[result[varname[1]]!='',:]
return result
#院校级别
varname=["院校级别","院校级别"]
rankResult=get_variable_map(rankResult,collegemapinfo,varname)
#varname=["学校类型","院校类型"]
#typeResult1=get_variable_map(collegeTypeResult,collegemapinfo,varname)
#录取批次
varname=["录取批次","批次"]
orderResult=get_variable_map(orderResult,collegemapinfo,varname)
#区域
varname=["学校类型","院校类型"]
zoneResul1t_type = get_variable_map(zoneResul1t_type,collegemapinfo,varname)
varname=["省份代码","省份"]
zoneResult_provice = get_variable_map(zoneResult_provice,collegemapinfo,varname)
#zoneResult_provice=zoneResult_provice[['院校名称', '省份']]
#code
codeResult = codeResult[['招生代码', '学校名称']]
#plan-newornot
#planResult=collegePlanResult[['院校名称','科类','批次', '2018计划数']]
#running property
propertyResult=propertyResult[['院校名称', '学院性质']]
varname=['学校性质', '学院性质']
propertyResult=get_variable_map(propertyResult,collegemapinfo,varname)
#basicinfo
#collegeBasicinfo=collegeBasicinfo.drop(labels=['college_introduce', 'Admission_Regulations'],axis=1)
#%%merge
#学校名称
#orderResult #录取批次
#propertyResult #学校性质
tmpOP=pd.merge(orderResult,propertyResult,how="outer",left_on=["院校名称"],right_on=["院校名称"])
tmpOP.columns
#codeResult #招生代码
tmpOPC=pd.merge(tmpOP,codeResult,how="outer",left_on=["院校名称"],right_on=["学校名称"])
tmpOPC=tmpOPC.drop(labels=['学校名称'],axis=1)
tmpOPC.columns
#zoneResult #'省份代码'
tmpOPCZ=pd.merge(tmpOPC,zoneResult_provice,how="outer",left_on=["院校名称"],right_on=["院校名称"])
tmpOPCZ.columns
#typeResult1 #学校类型
tmpOPCZT=pd.merge(tmpOPCZ,zoneResul1t_type,how="outer",left_on=["院校名称"],right_on=["院校名称"])
tmpOPCZT.columns
#collegeBasicinfo #'联系方式', '学院网址',
tmpOPCZTC=pd.merge(tmpOPCZT,collegeBasicinfo,how="outer",left_on=["院校名称"],right_on=["college_name"])
tmpOPCZTC=tmpOPCZTC.drop(labels=['college_name'],axis=1)
tmpOPCZTC.columns
#typeResult #院校级别
tmpOPCZTCT=pd.merge(tmpOPCZTC,rankResult,how="outer",left_on=["院校名称"],right_on=["院校名称"])
tmpOPCZTCT.columns
tmpOPCZTCT.columns=['学校名称', '录取批次', '学校性质', '招生代码', '省份代码', '学校类型', '学院网址','联系方式', '院校级别']
tmpOPCZTCT=tmpOPCZTCT[collegemodelinfo.columns.tolist()]
tmpOPCZTCT=tmpOPCZTCT.drop_duplicates()
tmpOPCZTCT1=tmpOPCZTCT.dropna(axis=0,how='all',subset=['学校名称'])
#%%
#os.chdir(r"D:\Nemo\高考分析\20190425112201\所需数据\result")
#writer = pd.ExcelWriter('高考分析数据集成.xlsx')
#collegeTypeResult.to_excel(writer,'collegeTypeResult')
#collegeOrderResult.to_excel(writer,'collegeOrderResult')
#collegeZoneResult.to_excel(writer,'collegeZoneResult')
#collegeCodeResult.to_excel(writer,'collegeCodeResult')
##collegePlanResult.to_excel(writer,'collegePlanResult')
#collegePropertyResult.to_excel(writer,'collegePropertyResult')
#collegeBasicinfo.to_excel(writer,'collegeBasicinfo')
#writer.save()
#tmpOPCZTCT1['招生代码'] = pd.to_numeric(tmpOPCZTCT1['招生代码'], errors='coerce')
#tmpOPCZTCT1['招生代码'] = tmpOPCZTCT1['招生代码'].astype('Int32')
tmpOPCZTCT1.index=range(len(tmpOPCZTCT1))
tmpOPCZTCT1['招生代码'] = tmpOPCZTCT1['招生代码'].astype(str)
ttmmpp=tmpOPCZTCT1['招生代码']
for i in range(len(ttmmpp)):
if ttmmpp[i] =='nan':
continue
else:
ttmmpp[i]=str(int(float(ttmmpp[i])))
len_str=len(ttmmpp[i])
if len_str<4:
ttmmpp[i]='0'*(4-len_str)+ttmmpp[i]
tmpOPCZTCT1['招生代码']=ttmmpp
os.chdir(r"D:\Nemo\高考分析\20190425112201\所需数据\result")
writer = pd.ExcelWriter('高考分析数据变换1.xlsx')
tmpOPCZTCT1.to_excel(writer,'数据变换')
writer.save()