json数据-高考招考计划及录取数据处理
#导入模块
import pandas as pd
import os
import re
#设置工作路径
work_directory = r'D:\Nemo\高考分析\data'
os.chdir(work_directory)
file_name=r'理科历史数据.csv'
science_data=pd.read_csv(file_name,index_col = None, header = 0, engine = 'python', encoding = 'gbk')
file_name=r'文科历史数据.csv'
art_data=pd.read_csv(file_name,index_col = None, header = 0, engine = 'python', encoding = 'gbk')
#数据清洗
def replace_single_quote(object_data):
temp1=object_data["college_AdmissionScore"]
temp2=object_data["Admission_detail"]
pattern01 = re.compile(r"'")
pattern02 = re.compile(r", ]")
pattern03 = re.compile("\["+", ")
pattern04 = re.compile(", ,")
pattern05 = re.compile("\n")
for i in range(len(temp1)):
temp1[i]=str(temp1[i])
temp2[i]=str(temp2[i])
if not temp1[i]=='nan':
temp1[i]=re.sub(pattern01,"",temp1[i])
if not temp2[i]=='nan':
temp2[i]=re.sub(pattern01,"",temp2[i])
temp1[i]=re.sub(pattern02,"]",temp1[i])
temp2[i]=re.sub(pattern02,"]",temp2[i])
temp1[i]=re.sub(pattern03,"[",temp1[i])
temp2[i]=re.sub(pattern03,"[",temp2[i])
temp1[i]=re.sub(pattern04,",",temp1[i])
temp2[i]=re.sub(pattern04,",",temp2[i])
temp1[i]=re.sub(pattern05,"",temp1[i])
temp2[i]=re.sub(pattern05,"",temp2[i])
return object_data
#部分字段转为dataframe
def josn2df(object_data):
result = pd.DataFrame()
for i in range(object_data.shape[0]):
print("now is running row %s"%i+" !")
if (object_data.iloc[i,3] == 'nan' or object_data.iloc[i,2]== 'nan' or object_data.iloc[i,1] == 'nan'):
continue
col3=pd.read_json(object_data.iloc[i,3], orient='records')
col2=pd.read_json(object_data.iloc[i,2], orient='records')
col1=pd.read_json("["+object_data.iloc[i,1]+"]", orient='records')
if col3.size ==0:
col23 = col2
if col2.size ==0:
col23 = col3
if col2.size ==0 and col3.size ==0:
continue
if not(col2.size ==0 or col3.size ==0):
col23=pd.merge(col3,col2,how="outer",left_on=["historyProYear","collegeHistoryID"],right_on=["matricDiffYear","collegeHistoryID"])
col123=pd.merge(col1,col23,how="outer",right_on=["collegeHistoryID"],left_on=["collegeHistoryId"])
result = pd.concat([result,col123])
result.reset_index=None
return result
#数据清洗
art_data=replace_single_quote(art_data)
science_data=replace_single_quote(science_data)
#内容为json的dataframe解析为dataframe
art_data_df = josn2df(art_data)
science_data_df = josn2df(science_data)
#写为excel
writer = pd.ExcelWriter('高考分析历史数据.xlsx')
art_data_df.to_excel(writer,'文科')
science_data_df.to_excel(writer,'理科')
writer.save()
#读取json文件数据,并转为dataframe
def jsonfile2dataframe(filename):
data_str = open(filename,encoding='utf-8').read()
data_str = "["+data_str+"]"
result =pd.read_json(data_str, orient='records')
return result
#读取json文件
science_CollegePlan_df = jsonfile2dataframe("CollegePlan理科json.json")
art_CollegePlan_df = jsonfile2dataframe("CollegePlan文科json.json")
#写为excel
writer = pd.ExcelWriter('高考分析CollegePlan.xlsx')
art_CollegePlan_df.to_excel(writer,'CollegePlan文科')
science_CollegePlan_df.to_excel(writer,'CollegePlan理科')
writer.save()