BERT-based MRPC Semantic Similarity Detection (From 0 to -1 Series)
Introduction
BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained model developed by Google. It is a deep bidirectional model built on the Transformer architecture: it is pre-trained without supervision on large text corpora and then fine-tuned for individual NLP tasks. BERT has been highly successful across a wide range of natural language processing tasks and is a major milestone in the field.
Semantic similarity detection is an important NLP task: given two text segments, the model outputs a score describing how close they are in meaning. It underpins many applications such as information retrieval, question answering, and automatic summarization.
BERT works very well for this task. The two text segments are concatenated and fed into BERT, which produces a joint semantic representation; a fine-tuned output layer on top of that representation then decides how similar the segments are. Because BERT captures semantic information effectively, this setup achieves strong results on similarity detection.
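As a quick illustration of this pairing scheme (a minimal sketch, separate from the training code later in this post; the model name and example sentences are just placeholders), the tokenizer packs both segments into a single [CLS] s1 [SEP] s2 [SEP] sequence, and a classification head on top of BERT scores the pair:

from transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Encode the pair as one sequence: [CLS] sentence1 [SEP] sentence2 [SEP]
enc = tokenizer("The company posted strong earnings.",
                "Profits at the firm rose sharply.",
                return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits          # shape: [1, 2]
# Softmax over the two classes; the head is untrained here, so the scores only
# become meaningful after fine-tuning on MRPC as done below.
print(torch.softmax(logits, dim=-1))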
Data Preprocessing
Analyzing the basic structure of the data
Quality #1 ID #2 ID #1 String #2 String
1 702876 702977 Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence . Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
0 2108705 2108831 Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion . Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .
The data consists of five columns. The two ID columns are not useful for this task, so they are removed with a Python script, shown below:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: bruanlin
# datetime: 2025/4/23 13:44
# software: PyCharm
# project: [Bert-mrpc]-[data_trasform]
""" Code Describe :
Drop the ID columns from the MRPC TSV files, clean empty rows, and save the result.
"""
import pandas as pd
from typing import Tuple
import logging
import os
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_mrpc_data(file_path: str) -> Tuple[pd.DataFrame, int]:
    """
    Load the MRPC dataset.
    """
    try:
        # Specify column dtypes explicitly to avoid wrong automatic type inference
        dtype_spec = {
            '#1 ID': 'string',
            '#2 ID': 'string',
            '#1 String': 'string',
            '#2 String': 'string',
            'Quality': 'int8'
        }
        # Read large files in chunks (useful when memory is limited, e.g. on Colab)
        chunks = pd.read_csv(
            file_path,
            sep='\t',
            header=0,
            dtype=dtype_spec,
            usecols=['Quality', '#1 String', '#2 String'],  # keep only the columns we need
            chunksize=1000,
            on_bad_lines='warn'  # warn about (and skip) malformed lines
        )
        df = pd.concat(chunks)
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        raise
    except pd.errors.ParserError as e:
        logger.error(f"TSV parsing error: {str(e)}")
        raise
    # Record the initial number of rows
    original_rows = df.shape[0]
    # Data cleaning
    df = df.dropna(subset=['#1 String', '#2 String'])  # drop rows with missing sentences
    df = df.rename(columns={
        'Quality': 'label',
        '#1 String': 'sentence1',
        '#2 String': 'sentence2'
    })
    # Log the cleaning result
    cleaned_rows = df.shape[0]
    logger.info(f"Cleaning finished: original rows={original_rows}, valid rows={cleaned_rows}, "
                f"dropped rows={original_rows - cleaned_rows}")
    return df.reset_index(drop=True), (original_rows - cleaned_rows)


def save_processed_data(df: pd.DataFrame,
                        output_dir="processed_data",
                        file_name="mrpc_processed",
                        formats='tsv') -> None:
    """
    Save the processed data.
    """
    # Create the output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # Make sure the column order is correct
    df = df[['label', 'sentence1', 'sentence2']]
    # Save in the requested format
    try:
        full_path = os.path.join(output_dir, f"{file_name}.{formats}")
        if formats == 'tsv':
            # TSV (default format)
            df.to_csv(
                full_path,
                sep='\t',
                index=False,
                header=True,   # adjust as needed
                quoting=3,     # csv.QUOTE_NONE, consistent with the raw data
                escapechar='\\'
            )
        elif formats == 'csv':
            # CSV (with column names)
            df.to_csv(
                full_path,
                index=False,
                quotechar='"',
                escapechar='\\'
            )
        else:
            raise ValueError(f"Unsupported format: {formats}")
        print(f"Saved {full_path} ({os.path.getsize(full_path) / 1024:.1f}KB)")
    except Exception as e:
        print(f"Failed to save {formats} format: {str(e)}")


# Usage example
if __name__ == "__main__":
    try:
        train_df, train_dropped = load_mrpc_data("DATASET/MRPC/train.tsv")
        dev_df, dev_dropped = load_mrpc_data("DATASET/MRPC/dev.tsv")
        # Show a few samples
        print("\nTraining set samples:")
        print(train_df[['label', 'sentence1', 'sentence2']].head(3))
        # Label distribution
        print("\nLabel distribution:")
        print(train_df['label'].value_counts(normalize=True))
        # Save the processed data
        try:
            save_processed_data(train_df, "./DATASET/train_processed", "train_processed_mrpc", "tsv")
            save_processed_data(dev_df, "./DATASET/dev_processed", "dev_processed_mrpc", "tsv")
        except Exception as e:
            logger.error(f"Failed to save data: {e}")
    except Exception as e:
        logger.error(f"Failed to load data, please check the file path and format: {e}")
After removing the ID columns, the data looks like this:
label sentence1 sentence2
1 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . The foodservice pie business does not fit our long-term growth strategy .
0 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
Creating the Dataset and DataLoader
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# Hyperparameter settings
Max_length = 128
Batch_size = 32
Lr = 2e-5
Epochs = 10
Model_name = 'bert-base-uncased'
train_tsv_path = "/kaggle/input/mrpc-bert/train_processed_mrpc.tsv"
dev_tsv_path = "/kaggle/input/mrpc-bert/dev_processed_mrpc.tsv"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Loading
def load_mrpc_data(file_path):
    """Load and preprocess an MRPC TSV file."""
    try:
        chunks = pd.read_csv(
            file_path,
            sep='\t',
            header=0,
            dtype=str,
            usecols=['label', 'sentence1', 'sentence2'],  # the columns produced by the preprocessing script
            chunksize=1000,
            on_bad_lines='warn'
        )
        df = pd.concat(chunks)
        # Type conversion
        df['label'] = pd.to_numeric(df['label'], errors='coerce').astype('Int8')
        df = df.dropna().reset_index(drop=True)
        return df
    except Exception as e:
        print(f"Error: {e}")
        return None  # return an explicit None on failure


# Custom Dataset
class MRPCDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Fetch a single row
        row = self.data.iloc[index]
        sentence1 = str(row['sentence1'])
        sentence2 = str(row['sentence2'])
        label = int(row['label'])
        # Tokenize and encode the sentence pair
        encoding = self.tokenizer.encode_plus(
            text=sentence1,
            text_pair=sentence2,
            add_special_tokens=True,   # add [CLS] and [SEP]
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',       # return PyTorch tensors
            return_token_type_ids=True,
            return_attention_mask=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


# Initialize components
# Note: the `mirror` argument is deprecated in newer transformers releases and may be rejected there.
tokenizer = BertTokenizer.from_pretrained(
    Model_name,
    cache_dir="./huggingface_models",                # local cache directory
    mirror='https://mirror.sjtu.edu.cn/huggingface'  # SJTU mirror
)
df = load_mrpc_data(train_tsv_path)
dataset = MRPCDataset(df, tokenizer, max_len=Max_length)

# Training DataLoader
dataloader = DataLoader(
    dataset,
    batch_size=Batch_size,
    shuffle=True,
    num_workers=4,    # multi-process loading
    pin_memory=True   # faster host-to-GPU transfer
)

# Validation set
dev_df = load_mrpc_data(dev_tsv_path)
dev_dataset = MRPCDataset(dev_df, tokenizer, Max_length)
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=Batch_size,
    shuffle=False,
    num_workers=2
)
Training
Training setup

Training platform | Kaggle T4 GPU |
---|---|
Optimizer | AdamW |
Max_length | 128 |
Batch_size | 32 |
Lr | 2e-5 |
Epochs | 30 |
import pandas as pd
import torch
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

# ==================== Create the training components ====================
model = BertForSequenceClassification.from_pretrained(
    Model_name,
    num_labels=2,         # binary classification
    force_download=True,  # force a fresh download
    mirror='https://mirror.sjtu.edu.cn/huggingface',
    cache_dir="./huggingface_models"
)
model.to(device)

# ==================== Optimizer ====================
optimizer = AdamW(model.parameters(), lr=Lr, weight_decay=0.01)
total_steps = len(dataloader) * Epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
Training function
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels  # BertForSequenceClassification computes the loss internally
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
    return total_loss / len(dataloader)
Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    avg_loss = total_loss / len(dataloader)
    return avg_loss, accuracy, f1
if __name__ == "__main__":
    # Inspect the first batch
    sample_batch = next(iter(dataloader))
    print("Batch tensor shapes:")
    print(f"Input IDs: {sample_batch['input_ids'].shape}")
    print(f"Attention Mask: {sample_batch['attention_mask'].shape}")
    print(f"Token Type IDs: {sample_batch['token_type_ids'].shape}")
    print(f"Labels: {sample_batch['label'].shape}")
    # Decode one sample as a sanity check
    print("\nDecoded first sample:")
    print(tokenizer.decode(sample_batch['input_ids'][0]))

    print("======== training model ===========")
    # Main training loop
    # List for storing per-epoch metrics
    metrics_data = []
    best_f1 = 0
    for epoch in range(Epochs):
        print(f"\nEpoch {epoch + 1}/{Epochs}")
        print("-" * 40)
        # Training phase
        train_loss = train_epoch(model, dataloader, optimizer, scheduler, device)
        print(f"Train Loss: {train_loss:.4f}")
        # Validation phase
        val_loss, val_acc, val_f1 = evaluate(model, dev_dataloader, device)
        print(f"Val Loss: {val_loss:.4f} | Accuracy: {val_acc:.4f} | F1: {val_f1:.4f}")
        # Record the metrics
        metrics_data.append({
            'Epoch': epoch + 1,
            'Train Loss': round(train_loss, 4),
            'Val Loss': round(val_loss, 4),
            'Accuracy': round(val_acc, 4),
            'F1 Score': round(val_f1, 4)
        })
        metrics_df = pd.DataFrame([metrics_data[-1]])  # only the latest row
        if epoch == 0:
            metrics_df.to_excel("training_metrics.xlsx", index=False)
        else:
            # Append the new row below the ones already written
            with pd.ExcelWriter("training_metrics.xlsx", mode='a', engine='openpyxl',
                                if_sheet_exists='overlay') as writer:
                metrics_df.to_excel(writer, index=False, header=False, startrow=epoch + 1)
        # Save the best model
        if val_f1 > best_f1:
            best_f1 = val_f1
            model.save_pretrained("./best_model")
            tokenizer.save_pretrained("./best_model")
            print(f"New best model saved with F1: {val_f1:.4f}")
    print("\nTraining completed!")
    print(f"Best Validation F1: {best_f1:.4f}")
Overall results and summary
Fine-tuning the pre-trained bert-base-uncased model on the MRPC dataset gives the following results:
Train Loss | 0.0024 |
---|---|
Val Loss | 1.6600 |
Accuracy | 0.8247 |
F1 | 0.8811 |
Best Validation F1 | 0.8862 |
Next, we compare against existing methods, put together a short summary report, and look for ways to improve performance (the goal is not to beat every method, just to get close), then tabulate how our method compares with others.
Existing methods for similarity detection
Model comparison
The training results and the corresponding hyperparameter settings are given below.
Summary of the comparison
Optimizations and improvements
Data | Data augmentation |
---|---|
Training procedure | Layer-wise learning rates with AdamW |
Training procedure | Hyperparameter search |
Model | Model ensembling |
Data augmentation
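The augmentation code itself is not reproduced here. As a minimal sketch of one cheap option for MRPC (the helper name and the aug_prob default are illustrative, not from the original code): the paraphrase relation is symmetric, so swapping sentence1 and sentence2 yields an equally valid extra training pair.

import pandas as pd

def augment_pairs(df: pd.DataFrame, aug_prob: float = 0.2, seed: int = 42) -> pd.DataFrame:
    """Append swapped copies of a random fraction of the sentence pairs.
    Swapping the two sentences keeps the label valid because MRPC paraphrase
    labels are symmetric; aug_prob controls how much extra data is added."""
    swapped = df.sample(frac=aug_prob, random_state=seed).copy()
    swapped[['sentence1', 'sentence2']] = swapped[['sentence2', 'sentence1']].values
    return pd.concat([df, swapped], ignore_index=True)

# Usage: augment the training DataFrame before wrapping it in MRPCDataset
# train_df = augment_pairs(train_df, aug_prob=0.2)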
Layer-wise learning rates
# Extra imports used by the improvement code below
import numpy as np
import optuna
from transformers import AutoModelForSequenceClassification

def create_model(model_name, num_labels=2):
    """Create a model together with layer-wise optimizer parameter groups."""
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )
    # Parameter grouping
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {   # embedding weights: lower learning rate, no weight decay
            "params": [p for n, p in model.named_parameters()
                       if "embeddings" in n and not any(nd in n for nd in no_decay)],
            "lr": config['lr'] * 0.1,
            "weight_decay": 0.0
        },
        {   # classification head: higher learning rate
            "params": [p for n, p in model.named_parameters()
                       if "classifier" in n],
            "lr": config['lr'] * 10,
            "weight_decay": 0.01
        },
        {   # remaining weights (mainly the encoder layers): base learning rate
            # (added so the encoder is actually updated during fine-tuning)
            "params": [p for n, p in model.named_parameters()
                       if "embeddings" not in n and "classifier" not in n
                       and not any(nd in n for nd in no_decay)],
            "lr": config['lr'],
            "weight_decay": 0.01
        },
        {   # bias / LayerNorm parameters outside the classifier: no weight decay
            "params": [p for n, p in model.named_parameters()
                       if "classifier" not in n and any(nd in n for nd in no_decay)],
            "lr": config['lr'],
            "weight_decay": 0.0
        }
    ]
    return model, optimizer_grouped_parameters
● Grouping logic (applied by AdamW, as sketched below)
○ Parameters excluded from weight decay
■ Bias parameters: offsets do not need regularization (an overly strong L2 penalty reduces the model's flexibility)
■ LayerNorm parameters: normalization layers already carry their own scaling, so extra regularization can distort the learned distribution
○ Embedding layer: "lr": config['lr'] * 0.1, "weight_decay": 0.0
○ Classification head: "lr": config['lr'] * 10, "weight_decay": 0.01
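These groups are consumed directly by the AdamW constructor, which applies each group's lr and weight_decay in place of the optimizer defaults. A minimal usage sketch (mirroring what objective() below does):

model, grouped_params = create_model('bert-base-uncased')
model.to(device)
# Each dict in grouped_params carries its own lr / weight_decay,
# so the layer-wise settings are applied per group.
optimizer = AdamW(grouped_params)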
Hyperparameter search
def objective(trial):
    """Optuna objective for hyperparameter optimization."""
    # Suggested search ranges
    config.update({
        'lr': trial.suggest_float('lr', 1e-6, 5e-5, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
        'aug_prob': trial.suggest_float('aug_prob', 0.1, 0.3)
    })
    # Initialize the model ensemble
    models = []
    optimizers = []
    schedulers = []
    for model_name in config['model_names']:
        model, params = create_model(model_name)
        model.to(device)
        optimizer = AdamW(params)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=100,
            num_training_steps=len(train_loader) * config['epochs']
        )
        models.append(model)
        optimizers.append(optimizer)
        schedulers.append(scheduler)
    # Training loop
    best_f1 = 0
    for epoch in range(config['epochs']):
        for model, optimizer, scheduler in zip(models, optimizers, schedulers):
            train_epoch(model, train_loader, optimizer, scheduler, device)
        metrics = evaluate_ensemble(models, dev_loader, device)
        trial.report(metrics['f1'], epoch)
        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
        if trial.should_prune():
            raise optuna.TrialPruned()
    return best_f1
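For completeness, this objective would typically be driven by an Optuna study along the following lines (the pruner choice and trial count are illustrative):

import optuna

study = optuna.create_study(
    direction="maximize",                  # maximize the ensemble F1 returned by objective()
    pruner=optuna.pruners.MedianPruner()   # lets trial.should_prune() stop weak trials early
)
study.optimize(objective, n_trials=20)

print("Best F1:", study.best_value)
print("Best hyperparameters:", study.best_params)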
Model ensemble evaluation
"model_names": [
'bert-base-uncased',
'roberta-base',
'google/electra-small-discriminator'
],
"ensemble_weights": [0.4, 0.3, 0.3]
def evaluate_ensemble(models, dataloader, device):
    """Evaluate the weighted model ensemble."""
    all_logits = []
    true_labels = []
    for model_idx, model in enumerate(models):
        model.eval()
        model_logits = []
        with torch.no_grad():
            for batch in tqdm(dataloader):
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
                outputs = model(**inputs)
                model_logits.append(outputs.logits.cpu().numpy())
                # Collect the gold labels once, during the first model's pass
                if model_idx == 0:
                    true_labels.extend(batch['label'].cpu().numpy())
        all_logits.append(np.concatenate(model_logits))
    # Weighted ensembling of the logits
    weighted_logits = np.zeros_like(all_logits[0])
    for i, weight in enumerate(config['ensemble_weights']):
        weighted_logits += all_logits[i] * weight
    predictions = np.argmax(weighted_logits, axis=1)
    return {
        'accuracy': accuracy_score(true_labels, predictions),
        'f1': f1_score(true_labels, predictions)
    }
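To make the weighting step concrete, here is a tiny self-contained illustration of the logit averaging used above (the numbers are made up):

import numpy as np

# Three models' logits for two dev examples, combined with weights 0.4 / 0.3 / 0.3.
logits_per_model = [
    np.array([[2.0, 0.5], [0.2, 1.5]]),   # e.g. bert-base-uncased
    np.array([[1.5, 1.0], [0.5, 1.0]]),   # e.g. roberta-base
    np.array([[1.0, 1.2], [0.1, 2.0]]),   # e.g. electra-small-discriminator
]
weights = [0.4, 0.3, 0.3]

weighted = sum(w * l for w, l in zip(weights, logits_per_model))
print(np.argmax(weighted, axis=1))  # ensemble predictions -> [0 1]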