提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
一、阿里天池赛题
官网链接:https://tianchi.aliyun.com/competition/entrance/231687/information
没用到CRF,运行环境Kaggle。
有需要数据集或者帮助的可以留下联系方式或私聊。
NER任务介绍
命名实体识别(NER)是自然语言处理领域的一个基本任务,其目标是从文本中识别出具有特定意义的实体,如人名、地名、组织名等。 这些实体在文本中扮演着关键角色,对于文本理解和信息抽取具有重要意义。随着深度学习技术的发展,NER任务取得了显著的进展,各种算法和模型不断涌现,为NLP领域的研究和应用提供了强大的支持。
本部分写的是与糖尿病相关的NER任务。
二、实验步骤
1.准备数据集
代码如下(示例):
# Label inventory and its string->id mapping.
# BIO scheme: 'O' plus a B-/I- pair for each of the 15 entity types
# used in the RuiJin diabetes NER task (31 labels total).
_ENTITY_TYPES = ['Disease', 'Reason', 'Symptom', 'Test', 'Test_Value', 'Drug',
                 'Frequency', 'Amount', 'Treatment', 'Operation', 'Method',
                 'SideEff', 'Anatomy', 'Level', 'Duration']
label_list = ['O'] + [f'{prefix}-{ent}' for ent in _ENTITY_TYPES for prefix in ('B', 'I')]
label_to_id = {label: idx for idx, label in enumerate(label_list)}
def load_data(file_path):
    """Read a whitespace-separated "token label" file and group it into sentences.

    A line whose token is the literal string "SPACE" marks a sentence
    boundary; blank lines are skipped.

    Returns:
        (sentences, labels): parallel lists where sentences[i] is a list of
        tokens and labels[i] the matching list of tag strings.
    """
    sentences, labels = [], []
    tokens, tags = [], []

    def _flush():
        # Close the sentence currently being built; ignore empty ones.
        if tokens:
            sentences.append(list(tokens))
            labels.append(list(tags))
            tokens.clear()
            tags.clear()

    with open(file_path, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if not stripped:  # skip blank lines
                continue
            token, tag = stripped.split()
            if token == "SPACE":
                _flush()
            else:
                tokens.append(token)
                tags.append(tag)

    _flush()  # the last sentence may not be followed by a SPACE marker
    return sentences, labels
def construct_dataframe(sentences, labels, label_to_id):
    """Build a DataFrame with space-joined sentences and numeric label lists.

    Args:
        sentences: list of token lists.
        labels: list of tag-string lists, parallel to `sentences`.
        label_to_id: mapping from tag string to integer id.

    Returns:
        DataFrame with columns 'Sentence' (str, tokens joined by spaces) and
        'Labels' (list[int] of label ids).

    Fix: the original serialized labels to a comma-joined string only to
    split it again in an .apply(); map the tags to ids directly instead.
    """
    df = pd.DataFrame({
        'Sentence': [' '.join(sentence) for sentence in sentences],
        'Labels': [[label_to_id[tag] for tag in tag_seq] for tag_seq in labels],
    })
    return df
# Load the RuiJin train/dev splits and convert them to DataFrames.
train_sentences, train_labels = load_data('/kaggle/input/crfmedicaldataset/ruijin_train.data')
dev_sentences, dev_labels = load_data('/kaggle/input/crfmedicaldataset/ruijin_dev.data')
train_df = construct_dataframe(train_sentences, train_labels, label_to_id)
test_df = construct_dataframe(dev_sentences, dev_labels, label_to_id)
2.转换成dataloader
代码如下(示例):
class NERDataset(Dataset):
    """Wraps a ('Sentence', 'Labels') DataFrame for token classification.

    Each item is a dict of flat tensors: the tokenizer's encodings plus a
    'labels' tensor padded/truncated to max_len. Label id 0 ('O') is used
    both as padding and as the stand-in for the leading [CLS] position.
    NOTE(review): labels are aligned 1:1 with whitespace-split words, which
    assumes the tokenizer does not split a word into several subwords —
    roughly true for char-level bert-base-chinese; confirm for others.
    """

    def __init__(self, df, tokenizer, max_len):
        self.sentences = df['Sentence'].values
        self.labels = df['Labels'].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        words = self.sentences[index].split()
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        # Prepend 0 for [CLS], right-pad with 0, then clip to max_len.
        padded = [0] + list(self.labels[index]) + [0] * self.max_len
        label_tensor = torch.tensor(padded[:self.max_len], dtype=torch.long)

        sample = {name: tensor.squeeze(0) for name, tensor in encoding.items()}
        sample['labels'] = label_tensor
        return sample
# Configuration
PRETRAINED_MODEL_NAME = "bert-base-chinese"
MAX_LEN = 64
BATCH_SIZE = 64

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Build datasets
train_dataset = NERDataset(train_df, tokenizer, MAX_LEN)
test_dataset = NERDataset(test_df, tokenizer, MAX_LEN)

# Build dataloaders. Only the training loader is shuffled; the evaluation
# loader should iterate deterministically so results are reproducible.
# (Fix: the original also passed shuffle=True for the test loader.)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
3.构建模型(最简单的bert)
代码如下(示例):
class BERT_NER(torch.nn.Module):
    """BERT encoder with a linear token-classification head (no CRF).

    Generalized: the label count and encoder hidden size were hard-coded
    (31 / 768); they are now parameters with the same values as defaults,
    so existing `BERT_NER(bertModel)` callers are unaffected.
    """

    def __init__(self, bertModel, num_labels=31, hidden_size=768):
        super(BERT_NER, self).__init__()
        self.bert = bertModel
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-token emission scores, shape (batch, seq_len, num_labels)."""
        outputs = self.bert(input_ids, attention_mask)
        # outputs[0] is the last hidden state of the encoder.
        sequence_output = self.dropout(outputs[0])
        emissions = self.linear(sequence_output)
        return emissions
# Instantiate the classifier and move it onto the target device.
model = BERT_NER(bertModel).to(device)
4.训练模型
代码如下(示例):
def reshape_and_remove_pad(outs, labels, attention_mask):
    """Flatten logits/labels and drop padding positions for loss computation.

    Args:
        outs: logits of shape (batch, seq_len, num_labels).
        labels: label ids of shape (batch, seq_len).
        attention_mask: 1 for real tokens, 0 for padding; shape (batch, seq_len).

    Returns:
        (outs, labels) with shapes (n_real_tokens, num_labels) and
        (n_real_tokens,) — only positions where attention_mask == 1.

    Generalized: the class count is taken from outs.shape[-1] instead of
    being hard-coded to 31, so any label-set size works.
    """
    num_labels = outs.shape[-1]
    # [b, lens, c] -> [b*lens, c]; [b, lens] -> [b*lens]
    outs = outs.reshape(-1, num_labels)
    labels = labels.reshape(-1)
    # Keep only non-pad positions: [b*lens] -> [b*lens - pad]
    keep = attention_mask.reshape(-1) == 1
    return outs[keep], labels[keep]
# All 31 label ids (0..30) for the sklearn metric calls below.
# Fix: replaced the hand-enumerated literal with the equivalent range.
your_labels_list = list(range(31))

optimizer = AdamW(model.parameters(), lr=3e-5)
epochs = 3
total_steps = len(train_dataloader) * epochs  # NOTE(review): computed but unused below
loss_function = torch.nn.CrossEntropyLoss()

# Running prediction/target buffers for metric reporting during training.
all_big_idx = np.array([])
all_targets = np.array([])
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for _, data in enumerate(train_dataloader, 0):
        batch_input_ids = data['input_ids'].to(device, dtype=torch.long)
        batch_input_mask = data['attention_mask'].to(device, dtype=torch.long)
        batch_labels = data['labels'].to(device, dtype=torch.long)

        out = model(batch_input_ids, attention_mask=batch_input_mask)
        # Flatten and drop padding positions before computing the loss.
        out, targets = reshape_and_remove_pad(out, batch_labels, batch_input_mask)
        big_val, big_idx = torch.max(out.data, dim=1)

        loss = loss_function(out, targets)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate predictions/targets for running metrics.
        # NOTE(review): the buffers are never reset, so metrics span all
        # batches (and epochs) seen so far — confirm that is intended.
        big_idx = big_idx.cpu().detach().numpy()
        all_big_idx = np.concatenate((all_big_idx, big_idx))
        targets = targets.cpu().detach().numpy()
        all_targets = np.concatenate((all_targets, targets))

        if (_ + 1) % 1000 == 0:
            # FIX: sklearn metrics take (y_true, y_pred); the original
            # passed the predictions first, swapping the roles.
            precision = precision_score(all_targets, all_big_idx, labels=your_labels_list, average='weighted')
            recall = recall_score(all_targets, all_big_idx, labels=your_labels_list, average='weighted')
            f = f1_score(all_targets, all_big_idx, labels=your_labels_list, average='weighted')
            print(f)  # report running weighted F1

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average train loss: {avg_train_loss}")

    if (_ + 1) % 1000 == 0:  # peek at the last batch's predictions vs. targets
        print(big_idx)
        print(targets)
5.验证模型
代码如下(示例):
model.eval()
total_loss = 0
all_big_idx = np.array([])
all_targets = np.array([])
with torch.no_grad():  # evaluation only — no gradients needed
    for _, data in enumerate(test_dataloader, 0):
        batch_input_ids = data['input_ids'].to(device, dtype=torch.long)
        batch_input_mask = data['attention_mask'].to(device, dtype=torch.long)
        batch_labels = data['labels'].to(device, dtype=torch.long)

        out = model(batch_input_ids, attention_mask=batch_input_mask)
        out, targets = reshape_and_remove_pad(out, batch_labels, batch_input_mask)
        big_val, big_idx = torch.max(out.data, dim=1)

        loss = loss_function(out, targets)
        total_loss += loss.item()

        big_idx = big_idx.cpu().detach().numpy()
        all_big_idx = np.concatenate((all_big_idx, big_idx))
        targets = targets.cpu().detach().numpy()
        all_targets = np.concatenate((all_targets, targets))

        if (_ + 1) % 1000 == 0:
            # FIX: sklearn metrics take (y_true, y_pred); the original
            # passed the predictions first, swapping the roles.
            precision = precision_score(all_targets, all_big_idx, labels=your_labels_list, average='weighted')
            recall = recall_score(all_targets, all_big_idx, labels=your_labels_list, average='weighted')
            f = f1_score(all_targets, all_big_idx, labels=your_labels_list, average='weighted')
            print(f)

# FIX: the original divided by len(train_dataloader) and printed
# "Average train loss" although this is the test split.
avg_test_loss = total_loss / len(test_dataloader)
print(f"Average test loss: {avg_test_loss}")

if (_ + 1) % 1000 == 0:  # peek at the last batch's predictions vs. targets
    print(big_idx)
    print(targets)
总结
相当于用Bert做分类任务,针对的是每个token分类,下一步准备加入CRF,并把模型bert换成更新的。