Simple NER code

Data preprocessing

from collections import defaultdict
from operator import itemgetter
from tqdm import tqdm
import numpy as np
import random
import torch 
import jieba
import json
import os

import pickle as pk

from transformers import WEIGHTS_NAME, BertConfig,get_linear_schedule_with_warmup,AdamW, BertTokenizer
tokenizer_class = BertTokenizer
tokenizer = tokenizer_class.from_pretrained("prev_trained_model")


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Training hyperparameters (the device above decides whether training runs on GPU or CPU)
parameter_copy = {
    # embedding dimension (BERT hidden size)
    'd_model':768,
    # hidden dimension of the RNN layer
    'hid_dim':300,
    # number of training epochs
    'epoch':2,
    # number of samples per training batch
    'batch_size':50,
    # two stacked LSTM layers (the original setup presumably uses one)
    'n_layers':2,
    # dropout rate, to reduce overfitting
    'dropout':0.1,
    # CPU / GPU device
    'device':device,
    # learning rate
    'lr':0.001,
    # optimizer momentum, mainly used with SGD-style optimizers
    'momentum':0.99,
}



def build_dataSet(parameter):
    data_name = ['train','dev']
    # Containers for the processed data and the frequency dictionaries
    data_set = {}
    key_table = defaultdict(int)
    vocab_table = defaultdict(int)
    # Reserve the special tokens up front
    vocab_table['<PAD>'] = 0
    vocab_table['<UNK>'] = 0
    # The data format is described in the README under the data folder; it is based on the CLUENER data set
    # There are two splits, dev and train, and both are built in the same pass
    for i in data_name: # dev and train
        data_set[i] = []
        data_src = open('data/'+i+'.json','r',encoding = 'utf-8').readlines()

        for data in data_src: # one JSON record per line

            print(f"data1---{data}")
            '''{"text": "浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为,对目前国内商业银行而言,", 
            "label": {"name": {"叶老桂": [[9, 11]]}, "company": {"浙商银行": [[0, 3]]}}}'''

            # Parse the JSON line
            data = json.loads(data)

            print(f"data2---{data}")
            '''{'text': '浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为,对目前国内商业银行而言,', 
            'label': {'name': {'叶老桂': [[9, 11]]}, 'company': {'浙商银行': [[0, 3]]}}}'''

            # Extract the text and the label annotation
            text = list(data['text'])
            label = data['label']

            print(f"text---{text}")
            '''['浙', '商', '银', '行', '企', '业', '信', '贷', '部', '叶', '老', '桂', '博', '士', '则', '从', '另', '一', '个', '角',
            '度', '对', '五', '道', '门', '槛', '进', '行', '了', '解', '读', '。', '叶', '老', '桂', '认', '为', ',', '对', '目',
            '前', '国', '内', '商', '业', '银', '行', '而', '言', ',']'''

            print(f"label---{label}")

            '''{'name': {'叶老桂': [[9, 11]]}, 'company': {'浙商银行': [[0, 3]]}}'''
            # Initialise the standard NER tag sequence with 'O' everywhere
            label_new = ['O']*len(text)
            # Touch 'O' so it is registered in the tag dictionary
            key_table['O']

            # Example label: {"company": {"工行": [[9, 10]], "农行": [[11, 13]]}, "position": {"大学生": [[15, 17]]}}
            # Extract the spans for every label type carried by this line, e.g. game or address
            for keys in label: # a single line can carry several label types, e.g. "company" and "position"
                inds = label[keys].values() # span lists for this label type, e.g. [[9, 10]], [[11, 13]]
                print(f"keys---{keys}") # "company"
                # Each entity of this type maps to a list of spans, because the same entity text
                # can occur several times; loop over them and use the indices to locate it in the text

                for id_list in inds: # all spans of one entity of this type
                    print(f"id_list---{id_list}") # [[9, 10]]
                    for ind in id_list: # an entity such as "工行" may occur more than once in the line
                        print(f"ind---{ind}") # [9, 10]

                        if ind[1] - ind[0] == 0:
                            # The two indices are equal, so the entity is a single character
                            # and its tag is 'S-' plus the label type
                            keys_list = ['S-'+keys]
                            label_new[ind[0]] = keys_list[0] # 'S-'+keys

                        if ind[1] - ind[0] == 1:
                            # The indices differ by exactly 1, so the entity has two characters,
                            # tagged 'B-*' and 'E-*' for its start and end positions
                            keys_list = ['B-'+keys,'E-'+keys]
                            label_new[ind[0]] = keys_list[0] # 'B-'+keys
                            label_new[ind[1]] = keys_list[1] # 'E-'+keys

                        if ind[1] - ind[0] > 1:
                            # The indices differ by more than 1, so the entity spans several characters:
                            # besides 'B-*' and 'E-*' for the start and end positions,
                            # the characters in between are tagged 'I-*'
                            keys_list = ['B-'+keys, 'I-'+keys, 'E-'+keys]

                            label_new[ind[0]] = keys_list[0] # 'B-'+keys
                            label_new[ind[0]+1:ind[1]] = [keys_list[1]]*(ind[1]-1-ind[0]) # ('I-'+keys) * n
                            label_new[ind[1]] = keys_list[2] # 'E-'+keys
                        for key in keys_list:
                            # Register every tag that was used, so the tag->id dictionary can be built later
                            key_table[key] += 1 # count tag frequencies
            # Build the character vocabulary from the text, text = list(data['text'])
            for j in text:
                vocab_table[j] += 1 # count character frequencies
            # Store the text together with its finished tag sequence, i in ['train','dev']
            print(f"text---{text}")
            print(f"label_new---{label_new}")
            data_set[i].append([text, label_new])

    # Turn the frequency tables into id dictionaries and store everything back in parameter;
    # the batch iterator and the model below expect these keys ('O' keeps index 0, matching the padding id)
    parameter['data_set'] = data_set
    parameter['key2ind'] = {key: ind for ind, key in enumerate(key_table)}
    parameter['word2ind'] = {word: ind for ind, word in enumerate(vocab_table)}
    parameter['output_size'] = len(parameter['key2ind'])
    return parameter
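To make the span-to-tag rule above easy to verify in isolation, here is a small standalone helper (hypothetical, not part of the original pipeline) that reproduces the same BIOES mapping for one inclusive [start, end] span:

```python
def span_to_bioes(start, end, key):
    """Mirror the rules in build_dataSet: spans use inclusive [start, end] character indices."""
    if end == start:                        # single-character entity -> S-*
        return {start: 'S-' + key}
    tags = {start: 'B-' + key}              # first character -> B-*
    for i in range(start + 1, end):         # interior characters -> I-*
        tags[i] = 'I-' + key
    tags[end] = 'E-' + key                  # last character -> E-*
    return tags

# '浙商银行' annotated as [[0, 3]] gives:
# {0: 'B-company', 1: 'I-company', 2: 'I-company', 3: 'E-company'}
print(span_to_bioes(0, 3, 'company'))
```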


def batch_yield_bert(parameter,shuffle = True,isTrain = True):
    # Batch iterator
    # Pick the train or dev split depending on the mode
    data_set = parameter['data_set']['train'] if isTrain else parameter['data_set']['dev']
    Epoch = parameter['epoch'] if isTrain else 1
    for epoch in range(Epoch):
        # Shuffle the raw data at the start of every epoch
        if shuffle:
            random.shuffle(data_set)
        inputs,targets = [],[]
        max_len = 0
        for items in tqdm(data_set):
            # Map the raw text and tags to ids for the multi-class tagging task;
            # unlike the BiLSTM version, BERT's own vocabulary is used here
            input = tokenizer.convert_tokens_to_ids(items[0]) # itemgetter(*items[0])(parameter['word2ind'])
            target = itemgetter(*items[1])(parameter['key2ind'])
            # itemgetter returns a scalar for a single-element sequence, so wrap it in a tuple
            target = target if isinstance(target, tuple) else (target,)
            if len(input) > max_len:
                max_len = len(input)
            inputs.append(list(input))
            targets.append(list(target))
            if len(inputs) >= parameter['batch_size']:
                # Pad every sequence in the batch to the same length
                inputs = [i+[0]*(max_len-len(i)) for i in inputs]
                targets = [i+[0]*(max_len-len(i)) for i in targets]
                yield list2torch(inputs),list2torch(targets),None,False
                inputs,targets = [],[]
                max_len = 0
        inputs = [i+[0]*(max_len-len(i)) for i in inputs]
        targets = [i+[0]*(max_len-len(i)) for i in targets]
        yield list2torch(inputs),list2torch(targets),epoch,False
        inputs,targets = [],[]
        max_len = 0
    yield None,None,None,True
            

def list2torch(ins):
    return torch.from_numpy(np.array(ins)).long().to(parameter['device'])
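As a quick sanity check, the iterator can be consumed directly once the dictionaries below have been built; every batch is a pair of zero-padded LongTensors of shape (batch_size, max_len). A minimal sketch, illustrative only:

```python
# Sketch: pull a single batch from the iterator and inspect its shape.
sample_yield = batch_yield_bert(parameter, shuffle=False)
inputs, targets, epoch, done = next(sample_yield)
print(inputs.shape, targets.shape)   # e.g. torch.Size([50, 52]) torch.Size([50, 52])
```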

# Build the training configuration up front and cache it,
# so it does not have to be regenerated on every run
if not os.path.exists('parameter.pkl'):
    parameter = parameter_copy
    # Build the dictionaries and the corresponding data set
    parameter = build_dataSet(parameter)
    pk.dump(parameter,open('parameter.pkl','wb'))
else:
    # Load the cached parameter; since the training hyperparameters may have changed,
    # overwrite them with the values from parameter_copy
    parameter = pk.load(open('parameter.pkl','rb'))
    for i in parameter_copy.keys():
        if i not in parameter:
            parameter[i] = parameter_copy[i]
            continue
        if parameter_copy[i] != parameter[i]:
            parameter[i] = parameter_copy[i]
    for i in parameter_copy.keys():
        print(i,':',parameter[i])
    pk.dump(parameter,open('parameter.pkl','wb'))
    del parameter_copy,i
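At this point `parameter` carries everything the rest of the pipeline relies on. A quick inspection sketch, illustrative only (the exact numbers depend on the data):

```python
# Sketch: inspect what build_dataSet produced.
print(len(parameter['data_set']['train']), len(parameter['data_set']['dev']))  # samples per split
print(parameter['output_size'])                # number of distinct BIOES tags
print(list(parameter['key2ind'].items())[:5])  # first few tag -> id entries; 'O' maps to 0
```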
    

Model based on a pretrained BERT

from transformers import WEIGHTS_NAME, BertConfig,get_linear_schedule_with_warmup,AdamW, BertTokenizer
from transformers import BertModel,BertPreTrainedModel
from torch.nn import CrossEntropyLoss
import torch.nn as nn
import torch
from torchcrf import CRF

import torch.nn.functional as F # PyTorch activation functions
from torch import nn,optim # model building blocks and optimizers

# Same approach as the plain BERT tagger, with a CRF layer added on top:
# a BERT + CRF model for NER
class bert_crf(BertPreTrainedModel):
    def __init__(self, config,parameter):
        super(bert_crf, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        embedding_dim = parameter['d_model']
        output_size = parameter['output_size']
        self.fc = nn.Linear(embedding_dim, output_size)
        self.init_weights()
        
        self.crf = CRF(output_size,batch_first=True)
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None,labels=None):
        outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.fc(sequence_output)
        return logits
    
config_class, bert_crf, tokenizer_class = BertConfig, bert_crf, BertTokenizer
config = config_class.from_pretrained("prev_trained_model")
tokenizer = tokenizer_class.from_pretrained("prev_trained_model")

Fine-tuning the pretrained model

import os
import shutil
import pickle as pk
from torch.utils.tensorboard import SummaryWriter

random.seed(2019)

# Build the model
model = bert_crf.from_pretrained("prev_trained_model",config=config,parameter = parameter).to(parameter['device'])
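Note that forward() returns only per-token emission logits; the CRF layer is applied outside the model, both for the loss below and for decoding. A quick shape-check sketch (dummy_ids is a hypothetical random batch, illustrative only):

```python
# Sketch: the forward pass yields emission logits of shape (batch, seq_len, output_size).
dummy_ids = torch.randint(0, config.vocab_size, (2, 16)).to(parameter['device'])
with torch.no_grad():
    print(model(dummy_ids).shape)   # (2, 16, parameter['output_size'])
```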

# Decide which weights to fine-tune
full_finetuning = True
if full_finetuning:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] # parameters excluded from weight decay
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.fc.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

# Choose the optimizer and the learning-rate schedule
# AdamW decouples weight decay from the Adam update, instead of folding it into the gradient as plain L2 regularization
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, correct_bias=False)
train_steps_per_epoch = 10748 // parameter['batch_size'] # 10748 is the number of training samples
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=train_steps_per_epoch, num_training_steps=parameter['epoch'] * train_steps_per_epoch)


# Put the model in training mode
model.train()

# The CRF layer provides the loss, so no separate criterion is needed
# criterion = nn.CrossEntropyLoss(ignore_index=-1)


# Prepare the batch iterator
train_yield = batch_yield_bert(parameter)

# Start training
loss_cal = []
min_loss = float('inf')
logging_steps = 0
while 1:
        inputs,targets,epoch,keys = next(train_yield)
        if keys:
            break
        out = model(inputs)
        # the CRF also supplies the loss (negative log-likelihood)
        loss = -model.crf(out, targets)
        optimizer.zero_grad()
        loss.backward()
        # clip gradients to keep training stable
        nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=5)
        # step the optimizer and the learning-rate scheduler
        optimizer.step()
        scheduler.step()
        
        loss_cal.append(loss.item())
        logging_steps += 1
        if logging_steps%100 == 0:
            print(sum(loss_cal)/len(loss_cal))
        if epoch is not None:
            if (epoch+1)%1 == 0:
                loss_cal = sum(loss_cal)/len(loss_cal)
                if loss_cal < min_loss:
                    min_loss = loss_cal
                    torch.save(model.state_dict(), 'bert_crf.h5')
                print(f'epoch [{epoch+1}/{parameter["epoch"]}], Loss: {loss_cal:.4f}')
            loss_cal = [loss.item()]
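After training, tags should be produced with the CRF's Viterbi decode rather than a plain argmax over the logits. A minimal inference sketch (assuming the 'bert_crf.h5' checkpoint saved above; ind2key is simply the inverse of parameter['key2ind']):

```python
# Inference sketch: load the saved weights and Viterbi-decode one sentence.
model.load_state_dict(torch.load('bert_crf.h5', map_location=parameter['device']))
model.eval()

ind2key = {v: k for k, v in parameter['key2ind'].items()}   # invert the tag dictionary

sentence = list("浙商银行企业信贷部叶老桂博士")
input_ids = list2torch([tokenizer.convert_tokens_to_ids(sentence)])
with torch.no_grad():
    emissions = model(input_ids)                 # (1, seq_len, output_size)
    best_path = model.crf.decode(emissions)[0]   # Viterbi path as a list of tag ids
print([(ch, ind2key[t]) for ch, t in zip(sentence, best_path)])
```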

As an alternative to the BERT + CRF model above, below is a PyTorch example for a NER task that uses a BiLSTM-CRF model (the `argmax` and `prepare_sequence` helpers it relies on were missing from the snippet and are added here so it runs as-is):

```python
import torch
import torch.nn as nn
import torch.optim as optim


def argmax(vec):
    # return the argmax of a 1 x N tensor as a python int (helper added for completeness)
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    # map a list of tokens to a LongTensor of ids (helper added for completeness)
    return torch.tensor([to_ix[w] for w in seq], dtype=torch.long)


class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # transitions[i, j] is the score of moving from tag j to tag i
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # forward algorithm: log partition function over all tag sequences
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.
        forward_var = init_alphas
        for feat in feats:
            alphas_t = []
            for next_tag in range(self.tagset_size):
                emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
                trans_score = self.transitions[next_tag].view(1, -1)
                next_tag_var = forward_var + trans_score + emit_score
                alphas_t.append(torch.logsumexp(next_tag_var, dim=1).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = torch.logsumexp(terminal_var, dim=1)[0]
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeddings(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # score of the gold tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []
            viterbivars_t = []
            for next_tag in range(self.tagset_size):
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        lstm_feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(lstm_feats)
        gold_score = self._score_sentence(lstm_feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        lstm_feats = self._get_lstm_features(sentence)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq


START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

training_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

for epoch in range(300):
    for sentence, tags in training_data:
        model.zero_grad()
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
        loss = model.neg_log_likelihood(sentence_in, targets)
        loss.backward()
        optimizer.step()

with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    print(model(precheck_sent))
```

This code implements training and prediction for a simple NER task: the BiLSTM-CRF model combines a BiLSTM encoder with a CRF layer to tag the entity type of every word. Performance can be improved by tuning the model hyperparameters and the optimizer settings.
