import torch
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import classification_report
from transformers import (
    BertForTokenClassification,
    BertTokenizer,
    BertTokenizerFast,
    Trainer,
    TrainingArguments,
)
# Toy training / evaluation data: pre-tokenized sentences paired with
# word-level BIO tags (CoNLL-2003 style).
train_texts = [
    ["EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "."],
    ["Peter", "Blackburn", "is", "born", "in", "New", "York", "."],
]
train_labels = [
    ["B-ORG", "O", "B-MISC", "O", "O", "O", "B-MISC", "O", "O"],
    ["B-PER", "I-PER", "O", "O", "O", "B-LOC", "I-LOC", "O"],
]
test_texts = [
    ["Peter", "Blackburn", "was", "born", "in", "New", "York", "."],
]
test_labels = [
    ["B-PER", "I-PER", "O", "O", "O", "B-LOC", "I-LOC", "O"],
]

# Map each BIO tag string to an integer class id.
label_map = {
    "O": 0,
    "B-PER": 1, "I-PER": 2,
    "B-ORG": 3, "I-ORG": 4,
    "B-MISC": 5, "I-MISC": 6,
    "B-LOC": 7, "I-LOC": 8,
}

# Replace every tag string with its integer id, sentence by sentence.
train_labels = [[label_map[tag] for tag in sentence] for sentence in train_labels]
test_labels = [[label_map[tag] for tag in sentence] for sentence in test_labels]
# Load the BERT tokenizer and token-classification head.
# NOTE: the fast tokenizer is required — only it exposes word_ids(), which
# we need to map word-level labels onto wordpiece tokens below.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_map))

# Convert the pre-split words into BERT input tensors (wordpieces + [CLS]/[SEP],
# padded to the longest sequence in the batch).
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)
test_encodings = tokenizer(test_texts, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)


def _align_labels_with_tokens(word_level_labels, encodings):
    """Expand word-level label ids to one label per wordpiece token.

    BERT splits words into subword pieces and adds special tokens
    ([CLS], [SEP], [PAD]); the original word-level label lists are
    therefore shorter than ``input_ids``. Special tokens and the 2nd+
    piece of a split word receive -100 so the cross-entropy loss
    ignores them.

    :param word_level_labels: list of per-sentence lists of int label ids
    :param encodings: BatchEncoding produced by a *fast* tokenizer
    :return: list of per-sentence label lists, same length as input_ids
    """
    aligned = []
    for i, sentence_labels in enumerate(word_level_labels):
        word_ids = encodings.word_ids(batch_index=i)
        sentence_aligned = []
        previous_word = None
        for word_id in word_ids:
            if word_id is None:
                sentence_aligned.append(-100)  # [CLS]/[SEP]/[PAD]
            elif word_id != previous_word:
                sentence_aligned.append(sentence_labels[word_id])  # first piece of a word
            else:
                sentence_aligned.append(-100)  # continuation piece of the same word
            previous_word = word_id
        aligned.append(sentence_aligned)
    return aligned


# Without this alignment the label lists do not match the tokenized
# sequence length and training fails.
train_labels = _align_labels_with_tokens(train_labels, train_encodings)
test_labels = _align_labels_with_tokens(test_labels, test_encodings)
# Dataset wrapper pairing tokenized encodings with per-token labels.
class NERDataset(Dataset):
    """Torch Dataset yielding one dict per sentence for the HF Trainer.

    Each item contains every tensor from the tokenizer output
    (input_ids, attention_mask, ...) sliced at ``idx``, plus a
    ``labels`` tensor built from the matching label list.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> batched tensor (BatchEncoding or dict)
        # labels: list of per-sentence label-id lists, parallel to the batch axis
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Slice every encoded field at idx and attach the label tensor.
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# Wrap the encoded splits in Dataset objects for the Trainer.
train_dataset = NERDataset(train_encodings, train_labels)
test_dataset = NERDataset(test_encodings, test_labels)

# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints land here
    logging_dir='./logs',            # tensorboard logs land here
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    warmup_steps=500,                # linear LR warm-up before decay
)

# Hugging Face Trainer drives the training / evaluation loops.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
# Fine-tune the model.
trainer.train()

# Run inference on the held-out set; logits have shape (batch, seq_len, num_labels).
predictions, labels, _ = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions, axis=2)


def _flatten_valid(true_2d, pred_2d):
    """Flatten parallel 2-D label arrays, dropping ignored positions.

    Positions labeled -100 (special tokens, subword continuations,
    padding) are excluded from the metrics — including them would
    corrupt precision/recall.

    :return: (true_flat, pred_flat) lists of ints of equal length
    """
    true_flat, pred_flat = [], []
    for true_row, pred_row in zip(true_2d, pred_2d):
        for t, p in zip(true_row, pred_row):
            if t != -100:
                true_flat.append(int(t))
                pred_flat.append(int(p))
    return true_flat, pred_flat


true_labels_flat, pred_labels_flat = _flatten_valid(labels, pred_labels)

# Per-class precision/recall/F1. Passing explicit `labels=` keeps the
# report from crashing when the tiny test set does not contain every
# class; zero_division=0 silences the resulting 0/0 warnings.
target_names = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-MISC", "I-MISC", "B-LOC", "I-LOC"]
print(classification_report(
    true_labels_flat,
    pred_labels_flat,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
))