Dataset
Dataset format (taking assistments12 as an example): the raw dataset.
The post uses data that has already been preprocessed.
It mainly contains the user id, the question (item) id, the time the question was answered, whether the answer was correct, and the id of the skill the question involves.
# read the preprocessed file, splitting on tab characters
full_df = pd.read_csv(os.path.join('data', args.dataset, 'preprocessed_data.csv'), sep="\t")
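For reference, here is a minimal sketch of what a few rows of preprocessed_data.csv might look like. The column names user_id, item_id, correct and skill_id are taken from the code below; the timestamp column name is an assumption of mine.

import pandas as pd

# hypothetical toy rows illustrating the assumed schema of preprocessed_data.csv
toy_df = pd.DataFrame({
    "user_id":   [0, 0, 0, 1, 1],
    "item_id":   [12, 7, 12, 3, 9],
    "timestamp": [0, 54, 97, 0, 31],   # assumed column name
    "correct":   [1, 0, 1, 1, 0],
    "skill_id":  [2, 2, 5, 1, 1],
})
print(toy_df)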
Main idea
The paper proposes Self-Attentive Knowledge Tracing (SAKT), a knowledge tracing model based on the self-attention mechanism. In essence, it applies the encoder part of the Transformer to this sequence task: the attention mechanism measures the relevance between the question to be predicted and the questions answered previously, takes a weighted sum of the corresponding past interactions according to that relevance, and then makes the prediction. Link: PDF
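For reference, the weighting is the standard scaled dot-product attention of the Transformer, which also appears in the attention function later in this post. For queries Q, keys K, values V and key dimension d_k:

\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k}}\right) V

In SAKT the query is built from the question to be predicted, and the keys and values are built from the past interactions, so the softmax weights are exactly the relevance weights described above.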
Data processing
1. Read the training and test data separately, and split the training data into a training set and a validation set with an 8:2 ratio.
full_df = pd.read_csv(os.path.join('data', args.dataset, 'preprocessed_data.csv'), sep="\t")
train_df = pd.read_csv(os.path.join('data', args.dataset, 'preprocessed_data_train.csv'), sep="\t")
test_df = pd.read_csv(os.path.join('data', args.dataset, 'preprocessed_data_test.csv'), sep="\t")
train_data, val_data = get_data(train_df, args.max_length)
Detailed implementation of get_data
def get_data(df, max_length, train_split=0.8, randomize=True):
    """Extract sequences from dataframe.

    Arguments:
        df (pandas DataFrame): output by prepare_data.py
        max_length (int): maximum length of a sequence chunk
        train_split (float): proportion of data to use for training
        randomize (bool): if True, shuffle sequences before the train/validation split
    """
Group the item ids, skill ids, and correctness labels by user id (a standalone toy walk-through of these steps follows the function below):
    item_ids = [torch.tensor(u_df["item_id"].values, dtype=torch.long)   # group the data by user_id
                for _, u_df in df.groupby("user_id")]
    skill_ids = [torch.tensor(u_df["skill_id"].values, dtype=torch.long)
                 for _, u_df in df.groupby("user_id")]
    labels = [torch.tensor(u_df["correct"].values, dtype=torch.long)
              for _, u_df in df.groupby("user_id")]
For the item and skill sequences, add 1 to every element (reserving index 0 for padding), then prepend a 0 and drop the last element; the label sequences are shifted the same way but not incremented.
This produces the shifted inputs the paper relies on for prediction: at position t the model only sees the interactions up to position t-1, so it can predict the response at position t.
    item_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), i + 1))[:-1] for i in item_ids]
    skill_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), s + 1))[:-1] for s in skill_ids]
    label_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), l))[:-1] for l in labels]
Split each sequence into chunks of at most max_length (100 in this setup):
    def chunk(list):
        if list[0] is None:
            return list
        list = [torch.split(elem, max_length) for elem in list]
        return [elem for sublist in list for elem in sublist]   # flatten: a sequence longer than max_length becomes several chunks

    # Chunk sequences
    lists = (item_inputs, skill_inputs, label_inputs, item_ids, skill_ids, labels)
    chunked_lists = [chunk(l) for l in lists]
    data = list(zip(*chunked_lists))   # zip the chunked lists element-wise: one tuple per chunk
    if randomize:
        shuffle(data)

    # Train-validation split
    train_size = int(train_split * len(data))
    train_data, val_data = data[:train_size], data[train_size:]   # split into training and validation sets
    return train_data, val_data
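To make the grouping, shifting, and chunking above concrete, here is a standalone toy walk-through (hypothetical data, and max_length = 2 rather than 100, purely for brevity):

import pandas as pd
import torch

toy = pd.DataFrame({"user_id": [0, 0, 0, 1],
                    "item_id": [12, 7, 12, 3],
                    "correct": [1, 0, 1, 1]})

# Group by user: one tensor of item ids per user
item_ids = [torch.tensor(u_df["item_id"].values, dtype=torch.long)
            for _, u_df in toy.groupby("user_id")]
print(item_ids)          # [tensor([12,  7, 12]), tensor([3])]

# Shift: +1 to reserve 0 as padding, prepend 0, drop the last element
item_inputs = [torch.cat((torch.zeros(1, dtype=torch.long), i + 1))[:-1] for i in item_ids]
print(item_inputs)       # [tensor([ 0, 13,  8]), tensor([0])]

# Chunk: split sequences longer than max_length (= 2 here) into pieces
chunked = [c for i in item_inputs for c in torch.split(i, 2)]
print(chunked)           # [tensor([ 0, 13]), tensor([8]), tensor([0])]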
The resulting item_ids is a list in which each element is a tensor containing the item ids answered by one user (i.e. all rows sharing the same user_id); skill_ids and labels have the same structure.
The returned train_data and val_data are lists of tuples of the form (item_inputs, skill_inputs, label_inputs, item_ids, skill_ids, labels), one tuple per sequence chunk.
Model construction:
model = SAKT(num_items, num_skills, args.embed_size, args.num_attn_layers, args.num_heads,
args.encode_pos, args.max_pos, args.drop_prob).cuda()
class SAKT(nn.Module):
    def __init__(self, num_items, num_skills, embed_size, num_attn_layers, num_heads,
                 encode_pos, max_pos, drop_prob):
        """Self-attentive knowledge tracing.

        Arguments:
            num_items (int): number of items
            num_skills (int): number of skills
            embed_size (int): input embedding and attention dot-product dimension
            num_attn_layers (int): number of attention layers (one by default; configurable)
            num_heads (int): number of parallel attention heads
            encode_pos (bool): if True, use relative position embeddings
            max_pos (int): number of position embeddings to use
            drop_prob (float): dropout probability
        """
        super(SAKT, self).__init__()
        self.embed_size = embed_size
        self.encode_pos = encode_pos

        self.item_embeds = nn.Embedding(num_items + 1, embed_size // 2, padding_idx=0)
        self.skill_embeds = nn.Embedding(num_skills + 1, embed_size // 2, padding_idx=0)

        self.pos_key_embeds = nn.Embedding(max_pos, embed_size // num_heads)    # relative position embeddings for keys
        self.pos_value_embeds = nn.Embedding(max_pos, embed_size // num_heads)  # relative position embeddings for values

        self.lin_in = nn.Linear(2 * embed_size, embed_size)
        self.attn_layers = clone(MultiHeadedAttention(embed_size, num_heads, drop_prob), num_attn_layers)  # stack of multi-head attention layers
        self.dropout = nn.Dropout(p=drop_prob)
        self.lin_out = nn.Linear(embed_size, 1)   # output layer: one logit per position
    def get_inputs(self, item_inputs, skill_inputs, label_inputs):
        item_inputs = self.item_embeds(item_inputs)        # (bs, sl, embed_size // 2)
        skill_inputs = self.skill_embeds(skill_inputs)     # (bs, sl, embed_size // 2)
        label_inputs = label_inputs.unsqueeze(-1).float()  # (bs, sl, 1), add a trailing dimension

        inputs = torch.cat([item_inputs, skill_inputs, item_inputs, skill_inputs], dim=-1)  # (bs, sl, 2 * embed_size)
        inputs[..., :self.embed_size] *= label_inputs       # interaction embedding as described in the paper:
        inputs[..., self.embed_size:] *= 1 - label_inputs   # first half kept for correct answers, second half for incorrect ones
        return inputs   # (bs, sl, 2 * embed_size)
    def get_query(self, item_ids, skill_ids):
        item_ids = self.item_embeds(item_ids)     # (bs, sl, embed_size // 2)
        skill_ids = self.skill_embeds(skill_ids)  # (bs, sl, embed_size // 2)
        query = torch.cat([item_ids, skill_ids], dim=-1)  # (bs, sl, embed_size)
        return query
    def forward(self, item_inputs, skill_inputs, label_inputs, item_ids, skill_ids):
        inputs = self.get_inputs(item_inputs, skill_inputs, label_inputs)  # (bs, sl, 2 * embed_size)
        inputs = F.relu(self.lin_in(inputs))   # (bs, sl, embed_size), project back down to embed_size

        query = self.get_query(item_ids, skill_ids)   # (bs, sl, embed_size)

        mask = future_mask(inputs.size(-2))   # (1, sl, sl) causal mask over future positions
        if inputs.is_cuda:
            mask = mask.cuda()

        outputs = self.dropout(self.attn_layers[0](query, inputs, inputs, self.encode_pos,
                                                   self.pos_key_embeds, self.pos_value_embeds, mask))
        for l in self.attn_layers[1:]:
            residual = l(query, outputs, outputs, self.encode_pos, self.pos_key_embeds,
                         self.pos_value_embeds, mask)
            outputs = self.dropout(outputs + F.relu(residual))   # residual connection when stacking more than one attention layer
        return self.lin_out(outputs)
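The SAKT class uses two helpers that are not shown in this post, clone and future_mask. Here is a minimal sketch of what such helpers typically look like (the repository's exact implementation may differ):

import copy
import numpy as np
import torch
import torch.nn as nn

def clone(module, num):
    # num independent deep copies of a module, used to stack attention layers
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num)])

def future_mask(seq_length):
    # (1, sl, sl) boolean mask that is True above the diagonal,
    # so position t cannot attend to positions after t
    mask = np.triu(np.ones((1, seq_length, seq_length)), k=1).astype(bool)
    return torch.from_numpy(mask)

With these helpers (and the MultiHeadedAttention and attention definitions shown below), a dummy CPU forward pass could look like this; all sizes here are arbitrary assumptions:

bs, sl = 2, 5
model = SAKT(num_items=50, num_skills=10, embed_size=200, num_attn_layers=1,
             num_heads=5, encode_pos=False, max_pos=10, drop_prob=0.2)
item_inputs = torch.randint(0, 51, (bs, sl))
skill_inputs = torch.randint(0, 11, (bs, sl))
label_inputs = torch.randint(0, 2, (bs, sl))
item_ids = torch.randint(1, 51, (bs, sl))
skill_ids = torch.randint(1, 11, (bs, sl))
preds = model(item_inputs, skill_inputs, label_inputs, item_ids, skill_ids)
print(preds.shape)   # torch.Size([2, 5, 1]): one logit per position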
Multi-head attention code:
class MultiHeadedAttention(nn.Module):
    def __init__(self, total_size, num_heads, drop_prob):
        super(MultiHeadedAttention, self).__init__()
        assert total_size % num_heads == 0
        self.total_size = total_size
        self.head_size = total_size // num_heads
        self.num_heads = num_heads
        self.linear_layers = clone(nn.Linear(total_size, total_size), 3)
        self.dropout = nn.Dropout(p=drop_prob)
    def forward(self, query, key, value, encode_pos, pos_key_embeds, pos_value_embeds, mask=None):
        batch_size, seq_length = query.shape[:2]

        # Apply mask to all heads
        if mask is not None:
            mask = mask.unsqueeze(1)

        # Project inputs: produce Q, K, V and split them into num_heads heads
        query, key, value = [l(x).view(batch_size, seq_length, self.num_heads, self.head_size).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # Apply attention
        if encode_pos:
            out, self.prob_attn = relative_attention(
                query, key, value, pos_key_embeds, pos_value_embeds, mask, self.dropout)
        else:
            out, self.prob_attn = attention(query, key, value, mask, self.dropout)

        out = out.transpose(1, 2).contiguous().view(batch_size, seq_length, self.total_size)  # (bs, sl, total_size): concatenate the heads
        return out
Code for the attention mechanism:
def attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot-product attention."""
    scores = torch.matmul(query, key.transpose(-2, -1))
    scores = scores / math.sqrt(query.size(-1))
    if mask is not None:
        scores = scores.masked_fill(mask, -1e9)   # mask: the prediction for a question cannot attend to interactions that come after it
    prob_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        prob_attn = dropout(prob_attn)
    return torch.matmul(prob_attn, value), prob_attn
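A quick sanity check of the masking, using the attention function just defined (toy shapes, chosen only for illustration): with an upper-triangular boolean mask, the attention weights of each position are numerically zero on all future positions.

import torch

q = torch.randn(1, 1, 4, 8)                 # (batch, heads, seq_len, head_size)
k, v = torch.randn_like(q), torch.randn_like(q)
mask = torch.triu(torch.ones(4, 4, dtype=torch.bool), diagonal=1)
out, prob_attn = attention(q, k, v, mask)
print(prob_attn[0, 0])                      # lower-triangular: rows sum to 1, ~0 above the diagonal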
Compute the training loss and training AUC
preds = model(item_inputs, skill_inputs, label_inputs, item_ids, skill_ids)
loss = compute_loss(preds, labels.cuda(), criterion)
preds = torch.sigmoid(preds).detach().cpu()
train_auc = compute_auc(preds, labels)
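compute_loss and compute_auc are not shown in the post. Below is a minimal sketch of what they might do, under two assumptions of mine: criterion is nn.BCEWithLogitsLoss, and positions without a label (padding) are marked with -1 in the label tensor.

import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score

def compute_loss(preds, labels, criterion):
    # keep only positions that carry a real label (assumption: padding is marked as -1)
    mask = labels >= 0
    return criterion(preds[mask].flatten(), labels[mask].float().flatten())

def compute_auc(preds, labels):
    # preds are expected to be probabilities (sigmoid already applied, as in the loop above)
    mask = labels >= 0
    return roc_auc_score(labels[mask].flatten().numpy(),
                         preds[mask].flatten().numpy())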