SimCSE (Simple Contrastive Learning of Sentence Embeddings) is a method for learning sentence embeddings that can be trained either with or without supervision. In the unsupervised setting, two forward passes of the same sentence under different dropout masks form a positive pair; in the supervised setting, labeled sentence pairs (e.g., NLI entailment pairs) provide the positives. Below are two code templates showing unsupervised and supervised SimCSE training.
Unsupervised SimCSE Training Code Template
```python
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel

class SimCSE(nn.Module):
    def __init__(self, model_name):
        super(SimCSE, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # take the [CLS] token's output
        return cls_output

def simcse_unsup_loss(z1, z2, temperature=0.05):
    # z1 and z2 are embeddings of the same sentences under two different
    # dropout masks; row i of z2 is the positive example for row i of z1,
    # and every other row acts as an in-batch negative.
    z1 = nn.functional.normalize(z1, dim=1)
    z2 = nn.functional.normalize(z2, dim=1)
    similarity_matrix = torch.matmul(z1, z2.T) / temperature
    labels = torch.arange(z1.size(0), device=z1.device)
    loss = nn.CrossEntropyLoss()(similarity_matrix, labels)
    return loss

# Hyperparameters
model_name = 'bert-base-uncased'
learning_rate = 1e-5
num_epochs = 3
batch_size = 32

# Initialize the model, optimizer, and data loader
tokenizer = BertTokenizer.from_pretrained(model_name)
model = SimCSE(model_name).cuda()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Assume data_loader is already defined
for epoch in range(num_epochs):
    model.train()
    for batch in data_loader:
        input_ids = batch['input_ids'].cuda()
        attention_mask = batch['attention_mask'].cuda()
        # Two forward passes in train mode: dropout is active, so the same
        # input produces two different embeddings that form a positive pair.
        z1 = model(input_ids, attention_mask)
        z2 = model(input_ids, attention_mask)
        loss = simcse_unsup_loss(z1, z2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')
```
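The template above assumes data_loader is already defined. A minimal sketch of such a loader for the unsupervised case might look like the following; the sentences list is a placeholder for your own corpus, and the max_length of 64 is an arbitrary choice:

```python
from torch.utils.data import DataLoader

# Placeholder corpus; replace with your own unlabeled sentences.
sentences = ["A man is playing guitar.", "The weather is nice today."]

def collate_fn(batch):
    # Tokenize the raw sentences, padding to the longest one in the batch,
    # so each batch exposes 'input_ids' and 'attention_mask' tensors.
    return tokenizer(batch, padding=True, truncation=True,
                     max_length=64, return_tensors='pt')

data_loader = DataLoader(sentences, batch_size=batch_size,
                         shuffle=True, collate_fn=collate_fn)
```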
Supervised SimCSE Training Code Template
```python
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel

class SimCSE(nn.Module):
    def __init__(self, model_name):
        super(SimCSE, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # take the [CLS] token's output
        return cls_output

def simcse_sup_loss(anchor_emb, positive_emb, temperature=0.05):
    # Supervised SimCSE: the labeled positive of anchor i sits at row i of
    # positive_emb; the positives of all other anchors in the batch serve
    # as in-batch negatives.
    anchor = nn.functional.normalize(anchor_emb, dim=1)
    positive = nn.functional.normalize(positive_emb, dim=1)
    similarity_matrix = torch.matmul(anchor, positive.T) / temperature
    labels = torch.arange(anchor.size(0), device=anchor.device)
    loss = nn.CrossEntropyLoss()(similarity_matrix, labels)
    return loss

# Hyperparameters
model_name = 'bert-base-uncased'
learning_rate = 1e-5
num_epochs = 3
batch_size = 32

# Initialize the model, optimizer, and data loader
tokenizer = BertTokenizer.from_pretrained(model_name)
model = SimCSE(model_name).cuda()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Assume data_loader is already defined; each batch holds tokenized anchor
# sentences and their labeled positives (the key names are illustrative)
for epoch in range(num_epochs):
    model.train()
    for batch in data_loader:
        anchor_ids = batch['anchor_input_ids'].cuda()
        anchor_mask = batch['anchor_attention_mask'].cuda()
        positive_ids = batch['positive_input_ids'].cuda()
        positive_mask = batch['positive_attention_mask'].cuda()
        anchor_emb = model(anchor_ids, anchor_mask)
        positive_emb = model(positive_ids, positive_mask)
        loss = simcse_sup_loss(anchor_emb, positive_emb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')
```
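As with the unsupervised template, data_loader is assumed to exist. One possible sketch that produces the batch layout used above, built from (anchor, positive) sentence pairs; the pairs list and key names are illustrative:

```python
from torch.utils.data import DataLoader

# Placeholder pairs; in practice these might be premise/entailment pairs
# drawn from an NLI dataset.
pairs = [("A man is playing guitar.", "Someone is making music."),
         ("The weather is nice today.", "It is a pleasant day.")]

def collate_pairs(batch):
    anchors, positives = zip(*batch)
    a = tokenizer(list(anchors), padding=True, truncation=True,
                  max_length=64, return_tensors='pt')
    p = tokenizer(list(positives), padding=True, truncation=True,
                  max_length=64, return_tensors='pt')
    return {'anchor_input_ids': a['input_ids'],
            'anchor_attention_mask': a['attention_mask'],
            'positive_input_ids': p['input_ids'],
            'positive_attention_mask': p['attention_mask']}

data_loader = DataLoader(pairs, batch_size=batch_size,
                         shuffle=True, collate_fn=collate_pairs)
```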
- Model definition: the SimCSE class wraps a pretrained BERT model and uses the output of the [CLS] token as the sentence embedding.
- Unsupervised loss: simcse_unsup_loss takes two dropout-augmented views of the same batch, computes a temperature-scaled cosine similarity matrix between the normalized embeddings, and applies cross-entropy with the matching view (the diagonal) as the target.
- Supervised loss: simcse_sup_loss works the same way, except the positives come from labeled sentence pairs rather than dropout noise, with the other positives in the batch acting as in-batch negatives.
- Training loop: for each epoch and batch, move the inputs to the GPU, compute the embeddings, compute the loss, and take an optimizer step.
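After training, either model can be used to embed sentences and score their similarity. A minimal usage sketch follows (the example sentences are placeholders); note that eval mode disables the dropout that drives the unsupervised objective:

```python
model.eval()  # disable dropout for deterministic embeddings
with torch.no_grad():
    inputs = tokenizer(["A man is playing guitar.",
                        "Someone is making music."],
                       padding=True, truncation=True, return_tensors='pt')
    emb = model(inputs['input_ids'].cuda(), inputs['attention_mask'].cuda())
    emb = nn.functional.normalize(emb, dim=1)
    similarity = (emb[0] @ emb[1]).item()  # cosine similarity in [-1, 1]
print(f'Cosine similarity: {similarity:.4f}')
```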