数据比赛代码记录

最新推荐文章于 2021-11-23 10:38:39 发布

龙海L

最新推荐文章于 2021-11-23 10:38:39 发布

阅读量201

点赞数

分类专栏： python 机器学习算法文章标签：算法机器学习 python

本文链接：https://blog.csdn.net/qq_36523203/article/details/106430374

版权

数据比赛里写的代码，可快速应用到其他比赛

使用库

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold #数据进行交叉验证的，模型选择时使用
from lightgbm import LGBMClassifier#lightgbm做分类的模型
from lightgbm import LGBMRegressor#lightgbm做回归预测
from sklearn.metrics import f1_score#模型评价指标函数 f1
from sklearn.metrics import mean_squared_error#均方误差
import matplotlib.pyplot as plt
from tqdm import tqdm
import catboost as cbt#模型catboost
from sklearn.cluster import KMeans#聚类算法
from sklearn.preprocessing import LabelEncoder#标签

数据处理

# Load the training scores and drop every row whose score is exactly 0.
df_train = pd.read_csv('/home/kesci/input/smart_edu7557/exam_score.csv')
df_train = df_train[df_train['score'] != 0].reset_index(drop=True)


def process_index(x):
    """Return the index labels of the entries of *x* that are not IQR outliers.

    An entry is kept when it lies inside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where Q1/Q3 are the 25th/75th
    percentiles of *x* and ``IQR = Q3 - Q1``.
    """
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    inside = (x >= q1 - 1.5 * iqr) & (x <= q3 + 1.5 * iqr)
    return list(x[inside].index)


# One row per (student_id, course) holding the list of non-outlier row labels.
# The dict-renaming form ``.agg({'process_index': func})`` is rejected by
# pandas >= 1.0 ("nested renamer is not supported"), so aggregate first and
# rename the resulting 'score' column afterwards.
tmp_process = (
    df_train.groupby(by=['student_id', 'course'], as_index=False)['score']
    .agg(process_index)
    .rename(columns={'score': 'process_index'})
)

# The submission template carries the target in 'pred'; align it with the
# training column name so both frames can be stacked.
df_test = pd.read_csv('/home/kesci/input/smart_edu7557/submission_s2.csv')
df_test = df_test.rename(columns={'pred': 'score'})

course_class = pd.read_csv('/home/kesci/input/smart_edu7557/course.csv')
student = pd.read_csv('/home/kesci/input/smart_edu7557/student.csv')
all_know = pd.read_csv('/home/kesci/input/smart_edu7557/all_knowledge.csv')

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (original row labels are kept, as append did).
df_all = pd.concat([df_train, df_test])
df_all = df_all.merge(course_class, on='course', how='left')
df_all = df_all.merge(student, on='student_id', how='left')

特征工程

增加数据样本的特征维度，找到与目标相关性更强的特征

from sklearn.metrics.pairwise import cosine_distances
# Read the eight per-course exam tables in one pass; each name is bound to the
# DataFrame loaded from the path in the same position.
(
    course1_exam,
    course2_exam,
    course3_exam,
    course4_exam,
    course5_exam,
    course6_exam,
    course7_exam,
    course8_exam,
) = map(
    pd.read_csv,
    (
        '/home/kesci/input/smart_edu7557/course1_exams.csv',
        '/home/kesci/input/smart_edu7557/course2_exams.csv',
        '/home/kesci/input/smart_edu7557/course3_exams.csv',
        '/home/kesci/input/smart_edu7557/course4_exams.csv',
        '/home/kesci/input/smart_edu7557/course5_exams.csv',
        '/home/kesci/input/smart_edu7557/course6_exams.csv',
        '/home/kesci/input/smart_edu7557/course7_exams.csv',
        '/home/kesci/input/smart_edu7557/course8_exams.csv',
    ),
)
# Add per-exam features to every course's exam table (the tables are modified
# in place): the course name, a complexity-weighted difficulty score and its
# inverse, and counts/ratios of the knowledge points each exam touches.
for course_no, exam in enumerate(
    [course1_exam, course2_exam, course3_exam, course4_exam,
     course5_exam, course6_exam, course7_exam, course8_exam],
    start=1,
):
    course_name = 'course' + str(course_no)
    # Every column except the two identifiers is treated as a knowledge point.
    know_cols = [c for c in exam.columns if c not in ['course', 'exam_id']]
    exam['course'] = course_name

    # NOTE(review): the dot product assumes all_know lists this course's
    # knowledge points in the same order as the exam table's columns - confirm.
    complexity = np.array(
        all_know.loc[all_know['course'] == course_name, :]['complexity'])
    know_matrix = exam[know_cols].values
    exam['hard'] = np.dot(know_matrix, complexity)
    # The small epsilon avoids dividing by an exact zero.
    exam['hard_inverse'] = 1 / (exam['hard'] + 1e-10)

    # A knowledge point counts as covered when its cell is non-zero.
    # (np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.)
    covered = know_matrix.astype(bool)
    number_know = np.sum(covered, axis=1)
    ration = number_know / len(know_cols)
    # Min-max scale the coverage ratio within the course.
    ration = (ration - ration.min()) / (ration.max() - ration.min())
    exam['ration_know'] = ration
    exam['number_know'] = number_know
    # Average score per covered knowledge point: 100 divided by the count.
    exam['know_mean'] = 100 / number_know
    
# For every exam, record the exam_id of the most similar other exam of the
# same course, using cosine similarity over the knowledge-point columns.
for exam in [course1_exam, course2_exam, course3_exam, course4_exam,
             course5_exam, course6_exam, course7_exam, course8_exam]:
    # Identifier and derived feature columns must not influence the
    # similarity. 'know_mean' is a derived column too and was previously left
    # in; 'sim_exam_id' is excluded so the loop can be re-run safely.
    non_feature_cols = ['course', 'exam_id', 'hard', 'hard_inverse',
                        'ration_know', 'number_know', 'know_mean',
                        'sim_exam_id']
    know_cols = [c for c in exam.columns if c not in non_feature_cols]

    similarity = 1 - cosine_distances(exam[know_cols])
    # Subtract the identity so an exam does not pick itself as its neighbour.
    similarity = similarity - np.eye(len(exam))
    nearest_pos = np.argmax(similarity, axis=1)

    # argmax yields row positions, so look the ids up positionally instead of
    # by label; this also avoids writing into a slice copy of the table.
    exam['sim_exam_id'] = exam['exam_id'].values[nearest_pos]
    
# For every (course, section) pair in all_know, add a column named after the
# section to that course's exam table, holding the row-wise sum of the
# section's knowledge-point columns.
exam_by_course = {
    'course1': course1_exam,
    'course2': course2_exam,
    'course3': course3_exam,
    'course4': course4_exam,
    'course5': course5_exam,
    'course6': course6_exam,
    'course7': course7_exam,
    'course8': course8_exam,
}
# Build the grouping once instead of once per key.
section_groups = all_know.groupby(['course', 'section'])['knowledge_point'].groups
for (course, section), row_labels in section_groups.items():
    exam = exam_by_course.get(course)
    if exam is None:
        # Courses without an exam table are skipped, as before.
        continue
    # .groups stores index labels, so select with .loc rather than .iloc.
    points = list(all_know.loc[row_labels, 'knowledge_point'])
    exam[section] = exam[points].sum(axis=1)
# For every (course, category) pair in all_know, add a column named after the
# category to the matching course's exam table, built from that category's
# knowledge-point columns.
# NOTE(review): this listing ends inside the course2 branch; the remaining
# branches are not visible here - confirm against the complete source.
for key in all_know.groupby(['course','category'])['knowledge_point'].groups.keys():
    course = key[0]
    # Despite its name, `section` holds the category value (second key part).
    section = key[1]
    if course == 'course1':
        # Index entries of the all_know rows that belong to this group.
        s0_inde = all_know.groupby(['course','category'])['knowledge_point'].groups[key]
        tmp = course1_exam[list(all_know.iloc[s0_inde]['knowledge_point'])]
        # Row-wise sum over the category's knowledge-point columns.
        course1_exam[section] = tmp.apply(np.sum,axis = 1)
    elif course == 'course2':
        s0_inde = all_know.groupby(['course','category'])['knowledge_point'].groups[key]
        tmp = course2_exam[list(all_know.iloc[s0_inde]['knowledge_point'])]
        # NOTE(review): unlike the course1 branch this assigns the selected
        # frame itself rather than its row sum; the line looks truncated.
        course2_exam[section] = tmp

最低0.47元/天解锁文章

龙海L

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
数据比赛代码记录

数据比赛里用写的代码，可快速应用到其他比赛使用库数据处理特征工程模型训练和预测使用库import pandas as pdimport numpy as npfrom sklearn.model_selection import StratifiedKFold #数据进行交叉验证的，模型选择时使用from lightgbm import LGBMClassifier#lightgbm做分类的模型from lightgbm import LGBMRegressor#lightgbm做回归预测fr
复制链接

扫一扫