def read_data(file_path):
    """Read a CoNLL-style file of whitespace-separated ``token tag`` lines.

    Sentences (tweets) are separated by blank lines. URLs are normalized
    to the ``<URL>`` token and @-mentions to the ``<USR>`` token.

    Args:
        file_path: path to a UTF-8 encoded token/tag file.

    Returns:
        (tokens, tags): two parallel lists of lists, one inner list per tweet.
    """
    tokens = []
    tags = []
    tweet_tokens = []
    tweet_tags = []
    # 'with' guarantees the file handle is closed (original leaked it).
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: end of the current tweet.
                if tweet_tokens:
                    tokens.append(tweet_tokens)
                    tags.append(tweet_tags)
                tweet_tokens = []
                tweet_tags = []
            else:
                token, tag = line.split()
                # Replace all urls with <URL> token
                # Replace all users with <USR> token
                # BUGFIX: the original code had the two replacement
                # tokens swapped (URLs became <USR> and vice versa).
                if token.startswith('http://') or token.startswith('https://'):
                    token = '<URL>'
                elif token.startswith('@'):
                    token = '<USR>'
                tweet_tokens.append(token)
                tweet_tags.append(tag)
    # Flush the final tweet when the file does not end with a blank line
    # (the original silently dropped it).
    if tweet_tokens:
        tokens.append(tweet_tokens)
        tags.append(tweet_tags)
    return tokens, tags
# Build the token/tag dictionaries.
from collections import defaultdict
def build_dict(tokens_or_tags, special_tokens):
    """Build a bidirectional token<->index mapping.

    tokens_or_tags: a list of lists of tokens or tags
    special_tokens: some special tokens

    Returns:
        (tok2idx, idx2tok) where tok2idx is a defaultdict mapping any
        unseen token to index 0 (the first special token), and idx2tok
        is the inverse list.
    """
    # Unknown lookups fall back to index 0, i.e. the first special token.
    tok2idx = defaultdict(int)
    idx2tok = []

    # Special tokens claim the first indices, unconditionally.
    for special in special_tokens:
        tok2idx[special] = len(idx2tok)
        idx2tok.append(special)

    # Then every other token, in order of first appearance.
    for sentence in tokens_or_tags:
        for token in sentence:
            if token not in tok2idx:
                tok2idx[token] = len(idx2tok)
                idx2tok.append(token)

    return tok2idx, idx2tok
def words2idxs(tokens_list):
    """Map each token in *tokens_list* to its integer index."""
    return list(map(token2idx.__getitem__, tokens_list))
def tags2idxs(tags_list):
    """Map each tag in *tags_list* to its integer index."""
    return list(map(tag2idx.__getitem__, tags_list))
def idxs2words(idxs):
    """Map each integer index in *idxs* back to its token."""
    return list(map(idx2token.__getitem__, idxs))
def idxs2tags(idxs):
    """Map each integer index in *idxs* back to its tag."""
    return list(map(idx2tag.__getitem__, idxs))
# Training-data batch generator.
def batches_generator(batch_size, tokens, tags, shuffle=True, allow_smaller_last_batch=True):
    """Generates padded batches of tokens and tags.

    Yields (x, y, lengths): x and y are int32 arrays of shape
    (batch, max_len) padded with the <PAD> token index and the 'O' tag
    index respectively; lengths holds each sequence's true length.
    """
    n_samples = len(tokens)
    # Visit samples in random or natural order.
    order = np.random.permutation(n_samples) if shuffle else np.arange(n_samples)

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for batch_idx in range(n_batches):
        start = batch_idx * batch_size
        end = min(start + batch_size, n_samples)
        batch_indices = order[start:end]

        # Convert the selected samples to index sequences.
        x_rows = [words2idxs(tokens[i]) for i in batch_indices]
        y_rows = [tags2idxs(tags[i]) for i in batch_indices]
        longest = max(len(row) for row in y_rows)

        # Padding-filled arrays; real data is written over the prefix.
        rows = len(batch_indices)
        x = np.full((rows, longest), token2idx['<PAD>'], dtype=np.int32)
        y = np.full((rows, longest), tag2idx['O'], dtype=np.int32)
        lengths = np.zeros(rows, dtype=np.int32)

        for row, (x_seq, y_seq) in enumerate(zip(x_rows, y_rows)):
            seq_len = len(x_seq)
            x[row, :seq_len] = x_seq
            y[row, :seq_len] = y_seq
            lengths[row] = seq_len

        yield x, y, lengths