高校基本信息数据集成-变换 -清洗

最新推荐文章于 2023-01-06 10:30:00 发布
徐尼莫
最新推荐文章于 2023-01-06 10:30:00 发布
阅读量171
点赞数
分类专栏： Python
本文链接：https://blog.csdn.net/qq_34140940/article/details/90040023
版权
Python 专栏收录该内容
6 篇文章 0 订阅
订阅专栏
高校基本信息数据集成-变换 -清洗

# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 19:39:21 2019

@author: Nemo
"""

#%%
import pandas as pd
import os

pd.set_option('display.float_format', lambda x: '%.3f' % x)
#%%
#招生代码
work_directory1 = r'D:\Nemo\高考分析\\所需数据\学院代码'
work_directory2 = r'D:\Nemo\高考分析\\所需数据\院校批次'
work_directory31 = r'D:\Nemo\高考分析\\所需数据\院校信息\院校信息-地区分类'
work_directory32 = r'D:\Nemo\高考分析\\所需数据\院校信息\院校信息-院校特色（重点建设）'
work_directory4 = r'D:\Nemo\高考分析\\所需数据\招生计划'
work_directory5 = r'D:\Nemo\高考分析\\所需数据'
work_directory6 = r'D:\Nemo\高考分析\\所需数据\学校性质'
work_directory7 = r'D:\Nemo\高考分析\\所需数据\是否新招'
#%%

#获取工作路径下的文件，全路径
def getfilename(wk):
    os.chdir(wk)
    filename = os.listdir(wk)
    for i in range(len(filename)):
        filename[i]=wk+"\\"+filename[i]
    return filename

#获取工作路径下的文件
def getfilename2(wk):
    os.chdir(wk)
    filename = os.listdir(wk)
    return filename
    
    
#%%  collegeCodeResult
#读取数据
filename=getfilename(work_directory1)
#数据获取 collegeCodeResult
collegeCodeResult = pd.DataFrame()
for fname in filename:
    xl = pd.ExcelFile(fname)
    sheet_names=xl.sheet_names  # see all sheet names
    for sheet_name in sheet_names:
        temp=xl.parse(sheet_name)
        collegeCodeResult = pd.concat([collegeCodeResult,temp])
    xl.close()
    
#%% collegeOrderResult
filename=getfilename(work_directory2)
collegeOrderResult = pd.DataFrame()
for fname in filename:
    xl = pd.ExcelFile(fname)
    sheet_names=xl.sheet_names  # see all sheet names
    for sheet_name in sheet_names:
        temp=xl.parse(sheet_name)
        collegeOrderResult = pd.concat([collegeOrderResult,temp])
    xl.close()
        
#%%  collegeZoneResult

filename=getfilename(work_directory31)
collegeZoneResult = pd.DataFrame()
aresult=[]
for fname in filename:
    xl = pd.ExcelFile(fname)
    sheet_names=xl.sheet_names  # see all sheet names
    for sheet_name in sheet_names:
        tmp = str(fname+":"+sheet_name)
        if tmp.find("四川.xls:Sheet1")>= 0:
            continue
        temp=xl.parse(sheet_name)
        atmp=list(temp.columns)
        atmp.append(fname)
        atmp.append(sheet_name)
        aresult.append(atmp)
        temp["院校级别"]=sheet_name
        collegeZoneResult = pd.concat([collegeZoneResult,temp])
    xl.close()
#%%  collegeTypeResult
sum1=0
filename=getfilename2(work_directory32)
collegeTypeResult = pd.DataFrame()
for fname in filename:
    xl = pd.ExcelFile(work_directory32+"\\"+fname)
    sheet_names=xl.sheet_names  # see all sheet names
    for sheet_name in sheet_names:
        tmp = str(fname+":"+sheet_name)
        if tmp.find("四川.xls:Sheet1")>= 0:
            continue
        temp=xl.parse(sheet_name)
        temp["院校级别"]=fname[0:len(fname)-4]
        sum1=sum1+len(temp)
        print(sum1)
        collegeTypeResult = pd.concat([collegeTypeResult,temp])
    xl.close()

##%% collegePlanResult
#filename=getfilename(work_directory4)
#collegePlanResult = pd.DataFrame()
#for fname in filename:
#    xl = pd.ExcelFile(fname)
#    sheet_names=xl.sheet_names  # see all sheet names
#    for sheet_name in sheet_names:
#        temp=xl.parse(sheet_name)
#        collegePlanResult = pd.concat([collegePlanResult,temp])

#%% collegePropertyResult
filename=getfilename(work_directory6)
collegePropertyResult = pd.DataFrame()
for fname in filename:
    xl = pd.ExcelFile(fname)
    sheet_names=xl.sheet_names  # see all sheet names
    for sheet_name in sheet_names:
        temp=xl.parse(sheet_name)
        collegePropertyResult = pd.concat([collegePropertyResult,temp])
    xl.close()
collegePropertyResult=collegePropertyResult[['院校名称', '学院性质']]

#%% collegeNewornotResult
filename=getfilename(work_directory7)
collegeNewornotResult = pd.DataFrame()
for fname in filename:
    xl = pd.ExcelFile(fname)
    sheet_names=xl.sheet_names  # see all sheet names
    for sheet_name in sheet_names:
        temp=xl.parse(sheet_name)
        collegeNewornotResult = pd.concat([collegeNewornotResult,temp])
    xl.close()

#%%
os.chdir(work_directory5)
filename="学校基本信息（文理共用）.xlsx"

collegeBasicinfo = pd.read_excel(filename)

os.chdir(work_directory5)
fileaname= "数据缺失院校（5.6）.xlsx"
missingData = {}
xl = pd.ExcelFile(fileaname)
sheet_names=xl.sheet_names  # see all sheet names
for sheet_name in sheet_names:
    temp=xl.parse(sheet_name)
    if missingData.get(sheet_name)==None:
        missingData[sheet_name]=temp
xl.close()
#%%
#删除重复数据
rankResult=collegeTypeResult[['院校名称', '院校级别']]
zoneResul1t_type = collegeZoneResult[['院校名称', '院校类型']]
zoneResult_provice = collegeZoneResult[['院校名称', '省份']]
orderResult = collegeOrderResult[['院校名称', '批次']]
codeResult = collegeCodeResult[['招生代码', '学校名称']]
collegeBasicinfo  = collegeBasicinfo[['college_name', 'college_url', 'college_phone']]
propertyResult=collegePropertyResult[['院校名称', '学院性质']]

rankResult=rankResult.drop_duplicates()
zoneResul1t_type=zoneResul1t_type.drop_duplicates()
zoneResult_provice=collegeZoneResult.drop_duplicates()
orderResult=orderResult.drop_duplicates()
codeResult=codeResult.drop_duplicates()
propertyResult=propertyResult.drop_duplicates()
collegeBasicinfo=collegeBasicinfo.drop_duplicates()

#%%
os.chdir(work_directory5)
filename = '国内学院（测试数据）.xlsx'
sheename = '数据映射表'
xl = pd.ExcelFile(filename)
collegemapinfo = xl.parse(sheename)
sheename = '数据表'
collegemodelinfo = xl.parse(sheename)
xl.close()

#%%缺失数据修复
#missResult =  missingData.get('录取批次缺失')
#orderResult = pd.concat([orderResult,missResult])

missResult =  missingData.get('录取批次缺失')
orderResult = pd.concat([orderResult,missResult])

missResult =  missingData.get('学校类型缺失')
zoneResul1t_type = pd.concat([zoneResul1t_type,missResult])

missResult =  missingData.get('省份代码缺失')
zoneResult_provice = pd.concat([zoneResult_provice,missResult])

missResult =  missingData.get('招生代码缺失')[['招生代码', '学校名称']]
codeResult = pd.concat([codeResult,missResult])

missResult =  missingData.get('学校性质缺失')[['院校名称', '学院性质']]
propertyResult = pd.concat([propertyResult,missResult])

missResult =  missingData.get('基本信息缺失')
for i in range(len(missResult)):
    t_boolean=collegeBasicinfo["college_name"]==missResult.iloc[i,0]
    tpvalues=missResult.loc[i,['college_name', 'college_url', 'college_phone']].values
    print(str(collegeBasicinfo.shape)+":---:"+str(i))
    if t_boolean.any():
        collegeBasicinfo.loc[t_boolean,['college_name', 'college_url', 'college_phone']]=tpvalues
    else:
        collegeBasicinfo.loc[len(collegeBasicinfo),['college_name', 'college_url', 'college_phone']]=tpvalues
collegeBasicinfo=collegeBasicinfo.drop_duplicates()


#mapping
def get_variable_map(objresult,mappinginfo,varname):
    objresult.index = range(len(objresult))
    mappinginfo['数值']=mappinginfo['数值'].apply(str)
    order=mappinginfo.loc[mappinginfo.类型==varname[0],["数值","说明"]]
    order["说明"] =  order["说明"].astype(str)
    mappingdict = order.set_index('说明').to_dict()['数值']
    
    dictresult={}
    for i in range(len(objresult)):
        tmpkey=objresult.loc[i,"院校名称"]
        tmpvalue=str(objresult.loc[i,varname[1]])
        tmpcode = mappingdict.get(tmpvalue,"") 
        tmpcv=dictresult.get(tmpkey,"")
        if tmpcv =='':
            dictresult[tmpkey]=str(tmpcode)
        else:
            if dictresult.get(tmpkey) == str(tmpcode):
                continue
            else:
                dictresult[tmpkey]=dictresult.get(tmpkey)+","+str(tmpcode)
#        dictresult[tmpkey]=dictresult.get(tmpkey,"")+"|"+tmpvalue
    result= pd.Series(dictresult,index=dictresult.keys())
    result=result.reset_index()
    result.columns=['院校名称',varname[1]]#设置columns
    result=result.loc[result[varname[1]]!='',:]
    return result

#院校级别
varname=["院校级别","院校级别"]
rankResult=get_variable_map(rankResult,collegemapinfo,varname)
#varname=["学校类型","院校类型"]
#typeResult1=get_variable_map(collegeTypeResult,collegemapinfo,varname)
#录取批次
varname=["录取批次","批次"]
orderResult=get_variable_map(orderResult,collegemapinfo,varname)

#区域

varname=["学校类型","院校类型"]
zoneResul1t_type = get_variable_map(zoneResul1t_type,collegemapinfo,varname)


varname=["省份代码","省份"]
zoneResult_provice = get_variable_map(zoneResult_provice,collegemapinfo,varname)
#zoneResult_provice=zoneResult_provice[['院校名称', '省份']]

#code
codeResult = codeResult[['招生代码', '学校名称']]

#plan-newornot
#planResult=collegePlanResult[['院校名称','科类','批次', '2018计划数']]
#running property
propertyResult=propertyResult[['院校名称', '学院性质']]
varname=['学校性质', '学院性质']
propertyResult=get_variable_map(propertyResult,collegemapinfo,varname)

#basicinfo
#collegeBasicinfo=collegeBasicinfo.drop(labels=['college_introduce', 'Admission_Regulations'],axis=1)

#%%merge
#学校名称
#orderResult #录取批次
#propertyResult #学校性质
tmpOP=pd.merge(orderResult,propertyResult,how="outer",left_on=["院校名称"],right_on=["院校名称"])
tmpOP.columns
#codeResult #招生代码
tmpOPC=pd.merge(tmpOP,codeResult,how="outer",left_on=["院校名称"],right_on=["学校名称"])
tmpOPC=tmpOPC.drop(labels=['学校名称'],axis=1)
tmpOPC.columns
#zoneResult #'省份代码'
tmpOPCZ=pd.merge(tmpOPC,zoneResult_provice,how="outer",left_on=["院校名称"],right_on=["院校名称"])
tmpOPCZ.columns
#typeResult1 #学校类型
tmpOPCZT=pd.merge(tmpOPCZ,zoneResul1t_type,how="outer",left_on=["院校名称"],right_on=["院校名称"])
tmpOPCZT.columns
#collegeBasicinfo #'联系方式', '学院网址', 
tmpOPCZTC=pd.merge(tmpOPCZT,collegeBasicinfo,how="outer",left_on=["院校名称"],right_on=["college_name"])
tmpOPCZTC=tmpOPCZTC.drop(labels=['college_name'],axis=1)
tmpOPCZTC.columns
#typeResult #院校级别
tmpOPCZTCT=pd.merge(tmpOPCZTC,rankResult,how="outer",left_on=["院校名称"],right_on=["院校名称"])
tmpOPCZTCT.columns
tmpOPCZTCT.columns=['学校名称', '录取批次', '学校性质', '招生代码', '省份代码', '学校类型', '学院网址','联系方式', '院校级别']
tmpOPCZTCT=tmpOPCZTCT[collegemodelinfo.columns.tolist()]
tmpOPCZTCT=tmpOPCZTCT.drop_duplicates()
tmpOPCZTCT1=tmpOPCZTCT.dropna(axis=0,how='all',subset=['学校名称'])
#%%
#os.chdir(r"D:\Nemo\高考分析\20190425112201\所需数据\result")
#writer = pd.ExcelWriter('高考分析数据集成.xlsx')
#collegeTypeResult.to_excel(writer,'collegeTypeResult')
#collegeOrderResult.to_excel(writer,'collegeOrderResult')
#collegeZoneResult.to_excel(writer,'collegeZoneResult')
#collegeCodeResult.to_excel(writer,'collegeCodeResult')
##collegePlanResult.to_excel(writer,'collegePlanResult')
#collegePropertyResult.to_excel(writer,'collegePropertyResult')
#collegeBasicinfo.to_excel(writer,'collegeBasicinfo')
#writer.save()
#tmpOPCZTCT1['招生代码'] = pd.to_numeric(tmpOPCZTCT1['招生代码'], errors='coerce')
#tmpOPCZTCT1['招生代码'] = tmpOPCZTCT1['招生代码'].astype('Int32')
tmpOPCZTCT1.index=range(len(tmpOPCZTCT1))
tmpOPCZTCT1['招生代码'] = tmpOPCZTCT1['招生代码'].astype(str)
ttmmpp=tmpOPCZTCT1['招生代码']
for i in range(len(ttmmpp)):
    if ttmmpp[i] =='nan':
        continue
    else:
        ttmmpp[i]=str(int(float(ttmmpp[i])))
        len_str=len(ttmmpp[i])
        if len_str<4:
            ttmmpp[i]='0'*(4-len_str)+ttmmpp[i]
tmpOPCZTCT1['招生代码']=ttmmpp

os.chdir(r"D:\Nemo\高考分析\20190425112201\所需数据\result")
writer = pd.ExcelWriter('高考分析数据变换1.xlsx')
tmpOPCZTCT1.to_excel(writer,'数据变换')
writer.save()