# 使用库 (libraries used)
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold #数据进行交叉验证的,模型选择时使用
from lightgbm import LGBMClassifier#lightgbm做分类的模型
from lightgbm import LGBMRegressor#lightgbm做回归预测
from sklearn.metrics import f1_score#模型评价指标函数 f1
from sklearn.metrics import mean_squared_error#均方误差
import matplotlib.pyplot as plt
from tqdm import tqdm
import catboost as cbt#模型catboost
from sklearn.cluster import KMeans#聚类算法
from sklearn.preprocessing import LabelEncoder#标签
# 数据处理 (data processing)
# Load the training scores and drop every row whose score is exactly 0.
df_train = pd.read_csv('/home/kesci/input/smart_edu7557/exam_score.csv')
df_train = df_train[df_train['score'] != 0].reset_index(drop=True)


def process_index(x):
    """Return the index labels of the values of *x* that lie inside the IQR fences.

    A value is kept when it is within ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``, where
    Q1/Q3 are the 25th/75th percentiles of *x* and ``IQR = Q3 - Q1``.

    x: a pandas Series (here: the scores of one (student_id, course) group).
    Returns a plain list of index labels.
    """
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    inside = (x >= q1 - 1.5 * iqr) & (x <= q3 + 1.5 * iqr)
    return list(x[inside].index)


# One row per (student_id, course) with the list of non-outlier row labels in
# the 'process_index' column.  Dict-style renaming inside SeriesGroupBy.agg is
# rejected by pandas >= 1.0, so aggregate with the callable and rename after.
tmp_process = (
    df_train.groupby(by=['student_id', 'course'])['score']
    .agg(process_index)
    .rename('process_index')
    .reset_index()
)

# The submission template carries the target in 'pred'; rename it so the test
# rows share the 'score' column with the training rows.
df_test = pd.read_csv('/home/kesci/input/smart_edu7557/submission_s2.csv')
df_test.rename(columns={'pred': 'score'}, inplace=True)

course_class = pd.read_csv('/home/kesci/input/smart_edu7557/course.csv')
student = pd.read_csv('/home/kesci/input/smart_edu7557/student.csv')
all_know = pd.read_csv('/home/kesci/input/smart_edu7557/all_knowledge.csv')

# Stack train and test rows (original row labels are kept, as DataFrame.append
# did), then attach course and student attributes.  DataFrame.append no longer
# exists in pandas 2.x; pd.concat is the equivalent call.
df_all = pd.concat([df_train, df_test])
df_all = df_all.merge(course_class, on='course', how='left')
df_all = df_all.merge(student, on='student_id', how='left')
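# 特征工程 (feature engineering)
# 增加数据样本的特征维度,找到与目标相关性更强的特征
# (add feature dimensions to the samples and find features that correlate more strongly with the target)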
from sklearn.metrics.pairwise import cosine_distances
# Read the eight per-course exam tables (course1 .. course8, one CSV each)
# and bind them, in order, to course1_exam .. course8_exam.
(
    course1_exam,
    course2_exam,
    course3_exam,
    course4_exam,
    course5_exam,
    course6_exam,
    course7_exam,
    course8_exam,
) = [
    pd.read_csv('/home/kesci/input/smart_edu7557/course' + str(course_no) + '_exams.csv')
    for course_no in range(1, 9)
]
#col_c1 = [i for i in course1_exam.columns if i not in ['course','exam_id']]
# Add difficulty and knowledge-coverage features to every course's exam table.
# Each table is modified in place and gains the columns:
#   course        - 'course<N>' for the N-th table (1-based)
#   hard          - dot product of the exam's knowledge-point columns with the
#                   'complexity' values all_know lists for that course
#   hard_inverse  - 1 / (hard + 1e-10)
#   ration_know   - min-max normalised share of non-zero knowledge-point columns
#   number_know   - count of non-zero knowledge-point columns
#   know_mean     - 100 / number_know
# NOTE(review): the dot product assumes the knowledge-point columns of each
# table are in the same order as that course's rows in all_know - confirm.
for course_no, exam in enumerate(
    [course1_exam, course2_exam, course3_exam, course4_exam,
     course5_exam, course6_exam, course7_exam, course8_exam],
    start=1,
):
    course_name = 'course' + str(course_no)

    # Knowledge-point columns = everything except the two id columns.  This is
    # captured before any derived column is added below.
    know_cols = [col for col in exam.columns if col not in ['course', 'exam_id']]
    exam['course'] = course_name
    know_values = exam[know_cols].values

    complexity = np.array(all_know.loc[all_know['course'] == course_name, 'complexity'])
    exam['hard'] = np.dot(know_values, complexity)
    # The small epsilon keeps the inverse finite when 'hard' is 0.
    exam['hard_inverse'] = 1 / (exam['hard'] + 1e-10)

    # A knowledge point counts as covered when its cell is non-zero.  The
    # builtin bool replaces np.bool, which NumPy removed in 1.24.
    covered = know_values.astype(bool)
    n_covered = np.sum(covered, axis=1)
    coverage = n_covered / len(know_cols)
    # Min-max scaling within the course; if every exam covers the same number
    # of points the denominator is 0 and the column becomes NaN.
    coverage = (coverage - coverage.min()) / (coverage.max() - coverage.min())
    exam['ration_know'] = coverage
    exam['number_know'] = n_covered

    # 100 divided by the number of covered knowledge points for each exam
    # (inf for an exam that covers none).
    exam['know_mean'] = 100 / n_covered
# For every exam, store the exam_id of the most similar *other* exam of the
# same course in 'sim_exam_id'.  Similarity is the cosine similarity of the
# exams' knowledge-point vectors.
for exam in [course1_exam, course2_exam, course3_exam, course4_exam,
             course5_exam, course6_exam, course7_exam, course8_exam]:
    # Use only the raw knowledge-point columns.  'know_mean' is a derived
    # column just like 'number_know' and must be excluded too; left in, it
    # would enter every similarity vector (and can be inf, which
    # cosine_distances rejects).
    non_feature_cols = ['course', 'exam_id', 'hard', 'hard_inverse',
                        'ration_know', 'number_know', 'know_mean']
    feature_cols = [col for col in exam.columns if col not in non_feature_cols]

    similarity = 1 - cosine_distances(exam[feature_cols])
    # An exam must never be reported as its own nearest neighbour.
    np.fill_diagonal(similarity, -np.inf)
    nearest_pos = np.argmax(similarity, axis=1)

    # argmax yields row *positions*, so look the ids up positionally rather
    # than by index label.
    exam['sim_exam_id'] = exam['exam_id'].values[nearest_pos]
# For each (course, section) pair listed in all_know, add a column named after
# the section to that course's exam table.  Its value is the row-wise sum of
# the exam's columns for the knowledge points belonging to that section.
exam_by_course = {
    'course1': course1_exam,
    'course2': course2_exam,
    'course3': course3_exam,
    'course4': course4_exam,
    'course5': course5_exam,
    'course6': course6_exam,
    'course7': course7_exam,
    'course8': course8_exam,
}
# Build the grouping once and iterate it directly; each group already holds
# the knowledge-point names, so no label-to-position lookup is needed.
for (course, section), points in all_know.groupby(['course', 'section'])['knowledge_point']:
    exam = exam_by_course.get(course)
    if exam is None:
        # Courses outside course1..course8 are ignored.
        continue
    exam[section] = exam[list(points)].sum(axis=1)
for key in all_know.groupby(['course','category'])['knowledge_point'].groups.keys():
course = key[0]
section = key[1]
if course == 'course1':
s0_inde = all_know.groupby(['course','category'])['knowledge_point'].groups[key]
tmp = course1_exam[list(all_know.iloc[s0_inde]['knowledge_point'])]
course1_exam[section] = tmp.apply(np.sum,axis = 1)
elif course == 'course2':
s0_inde = all_know.groupby(['course','category'])['knowledge_point'].groups[key]
tmp = course2_exam[list(all_know.iloc[s0_inde]['knowledge_point'])]
course2_exam[section] = tmp