数据比赛代码记录

数据比赛里用到的代码,可快速应用到其他比赛

使用库

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold #数据进行交叉验证的,模型选择时使用
from lightgbm import LGBMClassifier#lightgbm做分类的模型
from lightgbm import LGBMRegressor#lightgbm做回归预测
from sklearn.metrics import f1_score#模型评价指标函数 f1
from sklearn.metrics import mean_squared_error#均方误差
import matplotlib.pyplot as plt
from tqdm import tqdm
import catboost as cbt#模型catboost
from sklearn.cluster import KMeans#聚类算法
from sklearn.preprocessing import LabelEncoder#标签

数据处理

# Load the training scores and drop every row whose score is exactly 0.
df_train = pd.read_csv('/home/kesci/input/smart_edu7557/exam_score.csv')
df_train = df_train[df_train['score'] != 0].reset_index(drop=True)


def process_index(x):
    """Return the index labels of the values of *x* inside the 1.5*IQR fences.

    A value is kept when it lies in [Q1 - 1.5*IQR, Q3 + 1.5*IQR], where Q1 and
    Q3 are the 25th and 75th percentiles of *x* and IQR = Q3 - Q1.
    """
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    inside = (x >= (q1 - 1.5 * iqr)) & (x <= (q3 + 1.5 * iqr))
    return list(x[inside].index)


# One row per (student_id, course) holding the list of in-fence row labels.
# The dict-renaming form agg({'name': func}) on a single column is rejected by
# current pandas, so aggregate first and name the result column afterwards.
tmp_process = (
    df_train.groupby(by=['student_id', 'course'])['score']
    .agg(process_index)
    .rename('process_index')
    .reset_index()
)

# The submission template carries the target in 'pred'; rename it to 'score'
# so the test rows share the training schema.
df_test = pd.read_csv('/home/kesci/input/smart_edu7557/submission_s2.csv')
df_test.rename(columns={'pred': 'score'}, inplace=True)

course_class = pd.read_csv('/home/kesci/input/smart_edu7557/course.csv')
student = pd.read_csv('/home/kesci/input/smart_edu7557/student.csv')
all_know = pd.read_csv('/home/kesci/input/smart_edu7557/all_knowledge.csv')

# Stack train and test rows (original row labels kept), then attach the
# course and student attributes. DataFrame.append no longer exists in
# pandas 2.x; pd.concat is the equivalent call.
df_all = pd.concat([df_train, df_test])
df_all = df_all.merge(course_class, on='course', how='left')
df_all = df_all.merge(student, on='student_id', how='left')

特征工程

增加数据样本的特征维度,找到与目标相关性更强的特征

from sklearn.metrics.pairwise import cosine_distances
# Read the eight per-course exam tables, in course order, into their
# individual module-level names.
(
    course1_exam,
    course2_exam,
    course3_exam,
    course4_exam,
    course5_exam,
    course6_exam,
    course7_exam,
    course8_exam,
) = map(pd.read_csv, (
    '/home/kesci/input/smart_edu7557/course1_exams.csv',
    '/home/kesci/input/smart_edu7557/course2_exams.csv',
    '/home/kesci/input/smart_edu7557/course3_exams.csv',
    '/home/kesci/input/smart_edu7557/course4_exams.csv',
    '/home/kesci/input/smart_edu7557/course5_exams.csv',
    '/home/kesci/input/smart_edu7557/course6_exams.csv',
    '/home/kesci/input/smart_edu7557/course7_exams.csv',
    '/home/kesci/input/smart_edu7557/course8_exams.csv',
))
# Add difficulty and knowledge-coverage features to every per-course exam
# table, in place. The tables are visited in course order so the position in
# the list gives the course number.
for course_no, exam in enumerate(
        [course1_exam, course2_exam, course3_exam, course4_exam,
         course5_exam, course6_exam, course7_exam, course8_exam],
        start=1):
    course_name = 'course' + str(course_no)

    # Every column except the identifiers is used as a knowledge-point column.
    know_cols = [c for c in exam.columns if c not in ['course', 'exam_id']]
    exam['course'] = course_name

    # 'hard': dot product of each exam's knowledge-point row with the
    # 'complexity' values all_know lists for this course.
    # NOTE(review): this relies on the all_know rows of the course being in
    # the same order as the knowledge-point columns -- confirm.
    complexity = all_know.loc[all_know['course'] == course_name, 'complexity'].values
    know_values = exam[know_cols].values
    exam['hard'] = np.dot(know_values, complexity)
    # The small epsilon keeps the reciprocal finite when 'hard' is 0.
    exam['hard_inverse'] = 1 / (exam['hard'] + 1e-10)

    # A knowledge point counts as covered when its cell is non-zero.
    # (np.bool was removed from NumPy; the builtin bool is the same dtype.)
    covered = know_values.astype(bool)
    know_count = covered.sum(axis=1)
    know_ratio = know_count / len(know_cols)

    # Min-max scale the covered fraction within this course's exams.
    exam['ration_know'] = (know_ratio - know_ratio.min()) / (know_ratio.max() - know_ratio.min())
    exam['number_know'] = know_count
    # 100 divided by the number of covered knowledge points.
    exam['know_mean'] = 100 / know_count
    
# For every exam, record the exam_id of the most cosine-similar *other* exam
# of the same course in a new 'sim_exam_id' column.
for exam in [course1_exam, course2_exam, course3_exam, course4_exam,
             course5_exam, course6_exam, course7_exam, course8_exam]:
    # Similarity is computed on everything except the identifiers and the
    # derived features listed here.
    # NOTE(review): 'know_mean' is a derived column too but is not excluded,
    # so it takes part in the similarity -- confirm this is intended.
    feature_cols = [
        c for c in exam.columns
        if c not in ['course', 'exam_id', 'hard', 'hard_inverse', 'ration_know', 'number_know']
    ]

    similarity = 1 - cosine_distances(exam[feature_cols])
    # Exclude each exam's similarity with itself; -inf (rather than 0)
    # guarantees the exam is not picked when all other similarities are 0.
    np.fill_diagonal(similarity, -np.inf)
    nearest_pos = np.argmax(similarity, axis=1)

    # argmax yields row positions, so look the ids up positionally instead of
    # by index label, and assign straight to the frame (no write to a slice).
    exam['sim_exam_id'] = exam['exam_id'].values[nearest_pos]
    
# For every (course, section) pair in all_know, add a column named after the
# section to that course's exam table, holding the row-wise sum of the
# section's knowledge-point columns.
exam_by_course = {
    'course1': course1_exam,
    'course2': course2_exam,
    'course3': course3_exam,
    'course4': course4_exam,
    'course5': course5_exam,
    'course6': course6_exam,
    'course7': course7_exam,
    'course8': course8_exam,
}
# Group once; .groups maps each (course, section) key to the row labels of
# all_know that belong to it.
section_groups = all_know.groupby(['course', 'section'])['knowledge_point'].groups
for (course, section), row_labels in section_groups.items():
    exam = exam_by_course.get(course)
    if exam is None:
        # Courses without an exam table are ignored.
        continue
    # .groups holds index labels, so select with .loc rather than .iloc.
    points = list(all_know.loc[row_labels, 'knowledge_point'])
    exam[section] = exam[points].sum(axis=1)
# For every (course, category) pair in all_know, select that group's
# knowledge-point columns from the matching course exam table and store a
# per-exam value under a column named after the category.
# NOTE(review): this loop appears to be cut off after the 'course2' branch in
# this copy of the script -- confirm against the full source.
for key in all_know.groupby(['course','category'])['knowledge_point'].groups.keys():
    course = key[0]
    # Holds the category value here, despite the variable name.
    section = key[1]
    if course == 'course1':
        # Labels of the all_know rows belonging to this (course, category) group.
        s0_inde = all_know.groupby(['course','category'])['knowledge_point'].groups[key]
        tmp = course1_exam[list(all_know.iloc[s0_inde]['knowledge_point'])]
        # Row-wise sum of the selected knowledge-point columns.
        course1_exam[section] = tmp.apply(np.sum,axis = 1)
    elif course == 'course2':
        s0_inde = all_know.groupby(['course','category'])['knowledge_point'].groups[key]
        tmp = course2_exam[list(all_know.iloc[s0_inde]['knowledge_point'])]
        # NOTE(review): unlike the 'course1' branch this assigns the selected
        # frame without summing it -- the line looks truncated; verify.
        course2_exam[section] = tmp
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值