import random
import time
import os
import datetime
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
# Configuration classes (training, model, and global settings).
class TrainConfig(object):
    """Hyper-parameters that control the training loop."""
    learning_rate = 0.01   # initial learning rate
    decay_rate = 0.92      # learning-rate decay factor
    epochs = 10            # number of passes over the training data
    max_grad_norm = 3.0    # gradient clipping threshold
    evaluate_every = 100   # evaluate on the test split every N steps
    checkpoint_every = 100 # save a checkpoint every N steps
class ModelConfig(object):
    """Hyper-parameters describing the network architecture."""
    hidden_layers = [200]    # sizes of the hidden layers
    dropout_keep_prob = 0.6  # keep probability for dropout
class Config(object):
    """Top-level configuration bundle consumed by DataGenerator and the model."""
    batch_size = 32
    num_skills = 124             # number of distinct knowledge points (skills)
    input_size = num_skills * 2  # input vector length is twice the skill count
    trainConfig = TrainConfig()
    modelConfig = ModelConfig()
# Instantiate the shared configuration object.
config = Config()
# Data generation.
class DataGenerator(object):
# The seqs consumed here are either train_seqs or test_seqs.
def __init__(self, fileName, config):
self.fileName = fileName
self.train_seqs = []
self.test_seqs = []
self.infer_seqs = []
self.batch_size = config.batch_size
self.pos = 0
self.end = False
self.num_skills = config.num_skills
self.skills_to_int = {
} # 知识点到索引的映射
self.int_to_skills = {
} # 索引到知识点的映射
def read_file(self):
# 从文件中读取数据,返回读取出来的数据和知识点个数
# 保存每个学生的做题信息 {学生id: [[知识点id,答题结果], [知识点id,答题结果], ...]},用一个二元列表来表示一个学生的答题信息
seqs_by_student = {
}
skills = [] # 统计知识点的数量,之后输入的向量长度就是两倍的知识点数量
count = 0
with open(self.fileName, 'r') as f:
for line in f:
fields = line.strip().split(" ") # 一个列表,[学生id,知识点id,答题结果]
student, skill, is_correct = int(fields[0]), int(fields[1]), int(fields[2])
skills.append(skill) # skill实际上是用该题所属知识点来表示的
seqs_by_student[student] = seqs_by_student.get(student, []) + [[skill, is_correct]] # 保存每个学生的做题信息
return seqs_by_student, list(set(skills))
def gen_dict(self, unique_skills):
sorted_skills = sorted(unique_skills)
skills_to_int = {
}
int_to_skills = {
}
for i in range(len(sorted_skills)):
skills_to_int[sorted_skills[i]] = i
int_to_skills[i] = sorted_skills[i]
self.skills_to_int = skills_to_int
self.int_to_skills = int_to_skills
def split_dataset(self, seqs_by_student, sample_rate=0.2, random_seed=1):
# 将数据分割成测试集和训练集
sorted_keys = sorted(seqs_by_student.keys()) # 得到排好序的学生id的列表
random.seed(random_seed)
# 随机抽取学生id,将这部分学生作为测试集
test_keys = set(random.sample(sorted_keys, int(len(sorted_keys) * sample_rate)))
# 此时是一个三层的列表来表示的,最外层的列表中的每一个列表表示一个学生的做题信息
test_seqs = [seqs_by_student[k] for k in seqs_by_student if k in test_keys]
train_seqs = [seqs_by_student[k] for k in seqs_by_student if k not in test_keys]
return train_seqs, test_seqs
def gen_attr(self, is_infer=False):
if is_infer:
seqs_by_students, skills = self.read_file()
self.infer_seqs = seqs_by_students
else:
seqs_by_students, skills = self.read_file()
train_seqs, test_seqs = self.split_dataset(seqs_by_students)
self.train_seqs = train_seqs
self.test_seqs = test_seqs
self.gen_dict(skills) # 生成知识点到索引的映射字典
def pad_sequences(self, sequences, maxlen=None, value=0.):
# 按每个batch中最长的序列进行补全, 传入的sequences是二层列表
# 统计一个batch中每个序列的长度,其实等于seqs_len
lengths = [len(s) for s in sequences]
# 统计下该batch中序列的数量
nb_samples = len(sequences)
# 如果没有传入maxlen参数就自动获取最大的序列长度
if maxlen is None:
maxlen = np.max(lengths)
# 构建x矩阵
x = (np.ones
DKT模型(旧)
最新推荐文章于 2022-11-27 16:52:38 发布
本文深入探讨了DKT(Deep Knowledge Tracing,深度知识追踪)模型,这是一种用于追踪学生学习过程的深度学习方法。DKT通过建模学生的回答序列,理解他们在特定概念上的掌握情况。内容包括模型的工作原理、优缺点以及在教育领域的应用实例。
摘要由CSDN通过智能技术生成