json数据-高考录取数据分析

json数据-高考招考计划及录取数据处理

#导入模块
import pandas as pd
import os
import re

#设置工作路径
work_directory = r'D:\Nemo\高考分析\data'
os.chdir(work_directory)
file_name=r'理科历史数据.csv'
science_data=pd.read_csv(file_name,index_col = None, header = 0, engine = 'python', encoding = 'gbk')

file_name=r'文科历史数据.csv'
art_data=pd.read_csv(file_name,index_col = None, header = 0, engine = 'python', encoding = 'gbk')

#数据清洗
def replace_single_quote(object_data):
    temp1=object_data["college_AdmissionScore"]
    temp2=object_data["Admission_detail"]
    pattern01 = re.compile(r"'")
    pattern02 = re.compile(r", ]")
    pattern03 = re.compile("\["+", ")
    pattern04 = re.compile(", ,")
    pattern05 = re.compile("\n")
    for i in range(len(temp1)):
        temp1[i]=str(temp1[i])
        temp2[i]=str(temp2[i])
        if not temp1[i]=='nan':
            temp1[i]=re.sub(pattern01,"",temp1[i])
        if not temp2[i]=='nan':
            temp2[i]=re.sub(pattern01,"",temp2[i])
        temp1[i]=re.sub(pattern02,"]",temp1[i])
        temp2[i]=re.sub(pattern02,"]",temp2[i])
        temp1[i]=re.sub(pattern03,"[",temp1[i])
        temp2[i]=re.sub(pattern03,"[",temp2[i])
        temp1[i]=re.sub(pattern04,",",temp1[i])
        temp2[i]=re.sub(pattern04,",",temp2[i])
        temp1[i]=re.sub(pattern05,"",temp1[i])
        temp2[i]=re.sub(pattern05,"",temp2[i])
    return object_data

#部分字段转为dataframe
def josn2df(object_data):
    result = pd.DataFrame()
    for i in range(object_data.shape[0]):
        print("now is running row %s"%i+" !")
        if (object_data.iloc[i,3] == 'nan' or object_data.iloc[i,2]== 'nan' or object_data.iloc[i,1] == 'nan'):
            continue
        col3=pd.read_json(object_data.iloc[i,3], orient='records')
        col2=pd.read_json(object_data.iloc[i,2], orient='records')
        col1=pd.read_json("["+object_data.iloc[i,1]+"]", orient='records')
        if col3.size ==0:
            col23 = col2
        if col2.size ==0:
            col23 = col3
        if col2.size ==0 and col3.size ==0:
            continue
        if not(col2.size ==0 or col3.size ==0):
            col23=pd.merge(col3,col2,how="outer",left_on=["historyProYear","collegeHistoryID"],right_on=["matricDiffYear","collegeHistoryID"])
        col123=pd.merge(col1,col23,how="outer",right_on=["collegeHistoryID"],left_on=["collegeHistoryId"])
        result = pd.concat([result,col123])
    result.reset_index=None
    return result

#数据清洗
art_data=replace_single_quote(art_data)
science_data=replace_single_quote(science_data)

#内容为json的dataframe解析为dataframe
art_data_df = josn2df(art_data)
science_data_df = josn2df(science_data)

#写为excel
writer = pd.ExcelWriter('高考分析历史数据.xlsx')
art_data_df.to_excel(writer,'文科')
science_data_df.to_excel(writer,'理科')
writer.save()


#读取json文件数据,并转为dataframe
def jsonfile2dataframe(filename):
    data_str = open(filename,encoding='utf-8').read()
    data_str = "["+data_str+"]"
    result =pd.read_json(data_str, orient='records')
    return result

#读取json文件
science_CollegePlan_df = jsonfile2dataframe("CollegePlan理科json.json")
art_CollegePlan_df = jsonfile2dataframe("CollegePlan文科json.json")

#写为excel
writer = pd.ExcelWriter('高考分析CollegePlan.xlsx')
art_CollegePlan_df.to_excel(writer,'CollegePlan文科')
science_CollegePlan_df.to_excel(writer,'CollegePlan理科')
writer.save()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值