Overview
Model: BiLSTM + CRF
Framework: TensorFlow 1.14.0
Hardware: RTX 2080Ti (11 GB)
Other: BIO tagging scheme
I have no hands-on NER experience, so if anything in the code is off, corrections from readers are very welcome.
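To make the BIO scheme concrete: each character is tagged O (outside any entity), B-X (beginning of an entity of type X), or I-X (inside one). A minimal character-level sketch is below; the PER/LOC types are only placeholders, since the actual tag set is defined in labels.txt and is not reproduced here.

tokens = ['张', '三', '在', '北', '京', '工', '作']
labels = ['B-PER', 'I-PER', 'O', 'B-LOC', 'I-LOC', 'O', 'O']  # placeholder entity types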
Code
The directory layout is as follows:
- NER
  - dataset
    - labels.txt
    - train_data
  - models
  - datautil.py
  - ner.py
  - train.py
ner.py
# -*- coding:utf-8 -*-
import os
from tqdm import tqdm
import tensorflow as tf
from tensorflow.contrib import crf

# set the logging verbosity
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)


def get_key(kwargs: dict, key: str, default):
    """
    Look up key in kwargs, falling back to default.
    :param kwargs: keyword-argument dict
    :param key: option name
    :param default: value to return when key is missing
    :return: kwargs[key] if present, otherwise default
    """
    if key in kwargs.keys():
        return kwargs[key]
    else:
        return default
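
# Example: get_key({'use_crf': False}, 'use_crf', True) returns False, while
# get_key({}, 'use_crf', True) falls back to the default True.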

class BasicModel(object):
    def _build_model(self):
        ...

    def _load_model(self):
        ...

class NerModel(BasicModel):
    """
    NER model.
    Built around BiLSTM + CRF; the CRF layer can optionally be left out during training.
    """
    def __init__(self, **kwargs):
        """
        Initialize the NerModel.
        :param kwargs: configuration options, read via get_key
        """
        self._is_train = get_key(kwargs, 'is_train', False)  # whether this is a training run
        self._model_save_path = get_key(kwargs, 'model_save_path', './models/ner_model/nermodel.ckpt')  # checkpoint path
        self._allow_soft_placement = get_key(kwargs, 'allow_soft_placement', True)  # fall back to another device if an op cannot run on the requested one
        self._log_device_placement = get_key(kwargs, 'log_device_placement', False)  # log op/device placement
        self._vocab_size = get_key(kwargs, 'vocab_size', 16000)  # vocabulary size
        self._percent_gpu = get_key(kwargs, 'percent_gpu', 0.5)  # fraction of GPU memory to occupy
        config_proto = tf.ConfigProto(allow_soft_placement=self._allow_soft_placement,
                                      log_device_placement=self._log_device_placement)
        config_proto.gpu_options.per_process_gpu_memory_fraction = self._percent_gpu
        self._use_crf = get_key(kwargs, 'use_crf', True)  # whether to add a CRF layer
        self._sess = tf.Session(config=config_proto)
        if self._is_train:
            self._max_seq_len = get_key(kwargs, 'max_seq_len', 128)  # maximum sequence length
            self._layer_num = get_key(kwargs, 'layer_num', 2)  # number of LSTM layers
            self._layer_units = get_key(kwargs, 'layer_units', 256)  # LSTM hidden units
            self._label_num = get_key(kwargs, 'label_num', 17)  # number of label classes
            self._embedding_size = get_key(kwargs, 'embedding_size', 200)  # embedding dimension
            self._cell_type = get_key(kwargs, 'cell_type', 'lstm')  # RNN cell type
            self._build_model()
        else:
            # not training: restore the saved model instead
            self._load_model()
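
    # Example (illustrative): NerModel(is_train=True, vocab_size=16000, label_num=17, use_crf=True)
    # builds the training graph with the kwargs above, while NerModel(is_train=False)
    # restores a saved checkpoint via _load_model().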

    def _build_model(self):
        """
        Build the computation graph.
        :return:
        """
        # input tensors
        self._input_x = tf.placeholder(shape=[None, self._max_seq_len], dtype=tf.int32, name='input_x')
        self._input_y = tf.placeholder(shape=[None, self._max_seq_len], dtype=tf.int32, name='input_y')
        # true (unpadded) length of each sequence
        self._input_seq_len = tf.placeholder(shape=[None], dtype=tf.int32, name='input_seq_len')
        self._dropout_prob = tf.placeholder(shape=None, dtype=tf.float32, name='dropout')
        # embedding layer
        with tf.variable_scope('embedding'):
            embedding = tf.get_variable(initializer=tf.truncated_normal_initializer(stddev=0.1),
                                        shape=[self._vocab_size, self._embedding_size], name='W')
            embedding_x = tf.nn.embedding_lookup(embedding, self._input_x, name='x')
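            # embedding_x has shape [batch_size, max_seq_len, embedding_size]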
        with tf.variable_scope('feature'):
            fw_cells =