上午对慕课学习记录做了预处理
原来的数据样例:
处理代码:
'''
对mooc上课记录数据的预处理
'''
import pandas as pd
import numpy as np
import json
# Short name used in the JSON keys -> label of the column that gets encoded.
ENCODED_COLUMNS = {'course': '课程名称', 'grade': '年级', 'group': '分组'}


def build_id_maps(values):
    """Assign consecutive integer ids to the distinct entries of *values*.

    Ids follow first-appearance order, so the same input always produces the
    same mapping. (Iterating a ``set`` of strings, as this script used to do,
    can yield a different order on every interpreter run.)

    Args:
        values: iterable of hashable values; duplicates are allowed.

    Returns:
        A ``(value_to_id, id_to_value)`` pair of dicts that are inverse to
        each other. Both are empty when *values* is empty.
    """
    distinct = list(dict.fromkeys(values))
    value_to_id = {value: idx for idx, value in enumerate(distinct)}
    id_to_value = dict(enumerate(distinct))
    return value_to_id, id_to_value


def encode_columns(frame, columns):
    """Replace the given columns of *frame* with integer ids, in place.

    Args:
        frame: DataFrame holding every column label listed in *columns*.
        columns: mapping of short name -> column label.

    Returns:
        A dict with one ``'<short>_id_dict'`` (value -> id) entry per column,
        followed by one ``'id_<short>_dict'`` (id -> value) entry per column,
        both groups in the iteration order of *columns*.

    Raises:
        KeyError: if *frame* lacks one of the requested columns.
    """
    forward = {}
    backward = {}
    for short_name, column in columns.items():
        # tolist() yields plain Python scalars, which json can serialise.
        values = frame[column].tolist()
        value_to_id, id_to_value = build_id_maps(values)
        # Swap the whole column at once rather than writing ints cell by
        # cell into a column that still holds the original values.
        frame[column] = [value_to_id[value] for value in values]
        forward[short_name + '_id_dict'] = value_to_id
        backward['id_' + short_name + '_dict'] = id_to_value
    forward.update(backward)
    return forward


if __name__ == '__main__':
    mooc = pd.read_csv('mooc_data.csv', header=0, index_col=None)

    # NOTE(review): the earlier version addressed these columns by position
    # (2, 3, 4) while building the lookup tables by label; labels are used
    # for both here -- confirm the positions matched these labels.
    file_dict = encode_columns(mooc, ENCODED_COLUMNS)

    # The JSON keeps non-ASCII text verbatim, so pin the encoding instead of
    # depending on the platform default; `with` closes the file on errors too.
    with open('dictionary.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(file_dict, ensure_ascii=False))

    del mooc['课程编号']
    del mooc['班级']
    # Written without a header row and without the index column.
    mooc.to_csv('mooc_data.txt', header=False, index=False)
处理结果:
dictionary.json
mooc_data.txt