实体识别——识别交通工具、伴随人员、目标地点
哈工大LTP
对实体识别还属于初学时期,稍微做做记录。一开始使用的是哈工大训练好的模型LTP。LTP功能很多,能做分词,词性标注,命名实体识别(人名,地名,机构名),依存句法分析,语义角色标注。
参考网址1
参考网址2
在安装过程中,要先“$ pip install pyltp”,再下载训练好的pyltp的模型,详细见以上两个网址。在“$ pip install pyltp”过程中很可能出现缺少Visual C++ 9.0的问题。查阅资料发现pyltp仅支持python3.6以及以下的版本,笔者Anaconda中默认的python版本是3.7的,经过无数尝试,最终还是多装了一个python3.6版本,即双版本。安装python3.6 安装python3.6的spyder
可是任务目标需要识别特别的实体,需要自己为数据设置标签,再训练。于是在同学的建议下,决定构建lstm+crf框架训练。
lstm+crf
通俗理解crf
进一步理解crf
crf相关论文
BiLSTM+CRF参考项目1
BiLSTM+CRF参考项目2
数据处理
数据是自己生成的。。
标签见label2idx
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 12:29:20 2020
@author: Lenovo
"""
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import pickle
import platform
def increse_data2(path, pre_line):
    """Generate samples of the pattern "我<link><person><trans>到<loc>。".

    Each line of ``path`` must contain four comma-separated fields:
    link word, accompanying person, transport, and target location,
    e.g. ``和,我妈,地铁,公司``.

    Returns ``(i_datas, i_labels)``: per-sentence character lists and the
    BIO tag lists aligned 1:1 with them.  ``pre_line`` is unused and kept
    only for signature parity with the sibling generators.
    """
    i_datas = []
    i_labels = []
    # Explicit encoding: the data files are Chinese text.
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()
    for line in lines:
        line0 = ["我"]
        [link_, person, trans1, loc] = line.strip().split(',')
        person = [w for w in person]
        trans1 = [w for w in trans1]
        loc = [w for w in loc]
        person_num = len(person)
        trans_num = len(trans1)
        loc_num = len(loc)
        line0.append(link_)
        for w in person:
            line0.append(w)
        for w in trans1:
            line0.append(w)
        # BUG FIX: the original appended "" here, but the label sequence
        # below tags this position "O-LOCLINK" (the connector 到) — the
        # sibling generators emit the actual character, so do the same to
        # keep tokens and tags aligned.
        line0.append("到")
        for w in loc:
            line0.append(w)
        line0.append("。")
        i_datas.append(line0)
        # Tags aligned with line0: 我-O, link-O-LINK, person, trans, 到, loc, 。
        label0 = ["O", "O-LINK"]
        label0.append("B-OPER")
        for i in range(person_num - 1):
            label0.append("I-OPER")
        label0.append("B-TRN")
        for i in range(trans_num - 1):
            label0.append("I-TRN")
        label0.append("O-LOCLINK")  # 到
        label0.append("B-LOC")
        for i in range(loc_num - 1):
            label0.append("I-LOC")
        label0.append("O")  # 。
        i_labels.append(label0)
    return i_datas, i_labels
def increse_data0(path, pre_line):
    """Build "<pre_line><start>到<end>。" samples with aligned BIO labels.

    ``pre_line`` supplies the five leading tokens (e.g. 我坐火车从); each
    line of ``path`` holds "start_station,end_station".  Returns the
    character sequences and their tag sequences.
    """
    i_datas = []
    i_labels = []
    with open(path) as f:
        lines = f.readlines()
    for raw in lines:
        start_chars, end_chars = (list(part) for part in raw.strip().split(','))
        sentence = pre_line[0:5] + start_chars + ["到"] + end_chars + ["。"]
        i_datas.append(sentence)
        # e.g. 我坐火车从广州南到深圳北。
        tags = ["O", "B-TRN", "I-TRN", "I-TRN", "O-LOCLINK"]
        tags += ["B-LOC"] + ["I-LOC"] * (len(start_chars) - 1)
        tags.append("O-LOCLINK")  # 到
        tags += ["B-LOC"] + ["I-LOC"] * (len(end_chars) - 1)
        tags.append("O")  # 。
        i_labels.append(tags)
    return i_datas, i_labels
def increse_data1(path, pre_line):
    """Build "我<transport>到<place>。" samples with aligned BIO labels.

    Each line of ``path`` holds "transport,place" (e.g. 地铁,公司).
    ``pre_line`` is accepted for signature parity with the other
    generators but is not used.
    """
    i_datas = []
    i_labels = []
    with open(path) as f:
        lines = f.readlines()
    for raw in lines:
        trans_chars, place_chars = (list(part) for part in raw.strip().split(','))
        sentence = ["我"] + trans_chars + ["到"] + place_chars + ["。"]
        i_datas.append(sentence)
        # e.g. 我坐地铁到公司。 -> O, B-TRN I-TRN..., O-LOCLINK, B-LOC I-LOC..., O
        tags = ["O"]  # 我
        tags += ["B-TRN"] + ["I-TRN"] * (len(trans_chars) - 1)
        tags.append("O-LOCLINK")  # 到
        tags += ["B-LOC"] + ["I-LOC"] * (len(place_chars) - 1)
        tags.append("O")  # 。
        i_labels.append(tags)
    return i_datas, i_labels
# ---- Build the training corpus from the raw text files -------------------
line0=["我","坐","火","车","从"]  # prefix "I take the train from"
i_datas1,i_labels1=increse_data0('station.txt',line0)
line0=["我","坐","地","铁","从"]  # prefix "I take the subway from"
i_datas2,i_labels2=increse_data0('railway.txt',line0)
line0=["我","坐","飞","机","从"]  # prefix "I take the plane from"
i_datas3,i_labels3=increse_data0('airport.txt',line0)
line1=["我"]
i_datas4,i_labels4=increse_data1('increase4.txt',line1)
line2=["我"]  # pattern "me and <person>", e.g. 我和我妈
i_datas5,i_labels5=increse_data2('increase5.txt',line2)
line1=["我"]
# NOTE(review): 'increase4.txt' is loaded a second time here, which doubles
# the weight of that sentence pattern in the corpus — confirm this
# oversampling is intended.
i_datas41,i_labels41=increse_data1('increase4.txt',line1)
train_datas=i_datas1+i_datas2+i_datas3+i_datas4+i_datas5+i_datas41
train_labels =i_labels1+i_labels2+i_labels3+i_labels4+i_labels5+i_labels41
##### Persist the generated corpus ######################
import pandas as pd
df=pd.DataFrame(train_datas)
df.to_excel("train_data1.xlsx")
df=pd.DataFrame(train_labels)
df.to_excel("train_label1.xlsx")
#### Reload the saved corpus ###############
# Round-tripping through Excel pads ragged rows with NaN; the loops below
# rebuild the variable-length sequences by keeping only non-null cells.
data = pd.read_excel("train_data1.xlsx",header=0,encoding="utf-8")
label= pd.read_excel("train_label1.xlsx",header=0,encoding="utf-8")
train_datas=[]
train_labels=[]
for i in range(len(data)):
    line=data.iloc[i,]
    temp=[]
    for j in range(sum(line.notnull())-1):  # count non-NaN cells; the -1 skips the saved row-index column
        w=data.iloc[i,j+1]
        temp.append(w)
    train_datas.append(temp)
for i in range(len(label)):
    line=label.iloc[i,]
    temp=[]
    for j in range(sum(line.notnull())-1):
        w=label.iloc[i,j+1]
        temp.append(w)
    train_labels.append(temp)
### Build the character vocabulary
from collections import Counter
word_counts = Counter(row for sample in train_datas for row in sample)
# Keep only characters seen at least twice; rarer ones will map to <UNK>.
vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
np.save('vocab1.npy', vocab)
special_words = ['<PAD>', '<UNK>'] # special tokens occupy indices 0 and 1
char_vocabs = special_words + vocab
# "BIO" tag scheme: B = entity start, I = entity continuation, O = outside.
# The suffix after "-" is the entity type: LINK, OPER, LOCLINK, LOC, TRN.
label2idx = {"O": 0,"O-LINK":1,
"B-LINK": 2, "I-LINK": 3,
"B-OPER": 4 ,"I-OPER": 5,    # accompanying person, e.g. 和我妈
"O-LOCLINK":6,"B-LOCLINK":7, # location connector, e.g. 去 / 去了
"I-LOCLINK":8,"O-LOC":9,
"B-LOC": 10, "I-LOC": 11,
"O-TRN": 12,
"B-TRN": 13, "I-TRN": 14,    # transport, e.g. 开车
}
# index <-> BIO tag and character <-> index lookup tables
idx2label = {idx: label for label, idx in label2idx.items()}
idx2vocab = {idx: char for idx, char in enumerate(char_vocabs)}
vocab2idx = {char: idx for idx, char in idx2vocab.items()}
#### Encode characters and tags as integer ids
sent=[]
for line in train_datas:
    temp=[]
    for w in line:
        temp.append(vocab2idx[w] if w in vocab2idx else vocab2idx['<UNK>'])
    sent.append(temp)
train_datas=sent
label_sent=[]
for line in train_labels:
    temp=[]
    for w in line:
        temp.append(label2idx[w] if w in label2idx else 0)  # unknown tags fall back to 0 ("O")
    label_sent.append(temp)
train_labels= label_sent
### Train/test split
# NOTE(review): randint samples WITH replacement and the sampled rows are
# never removed from the training set (the np.delete lines stay commented
# out), so the "test" set overlaps the training data and evaluation scores
# will be optimistic.  Also note np.delete without axis= would flatten the
# array, so those commented lines are not a working fix either.
testn1=np.random.randint(len(train_datas),size=1000)
test_datas=(np.array(train_datas)[testn1]).tolist()
test_labels =(np.array(train_labels)[testn1]).tolist()
# train_datas=np.delete(np.array(train_datas),testn1).tolist()
# train_labels=np.delete(np.array(train_labels),testn1).tolist()
BiLSTM+CRF模型构建1
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 13:08:31 2020
@author: Lenovo
"""
import numpy as np
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import pickle
import platform
# special_words = ['<PAD>', '<UNK>'] # 特殊词表示0,1
# # "BIO"标记的标签
# #LINK OPER #和我妈
# #LOCLINK LOC #去医院
# label2idx = {"O": 0,"O-LINK":1,#7:走,船。8:和
# "B-LINK": 2, "I-LINK": 3,#9.10:还有
# "B-OPER": 4 ,"I-OPER": 5,#和我妈
# "O-LOCLINK":6,"B-LOCLINK":7,#去,去了
# "I-LOCLINK":8,"O-LOC":9,
# "B-LOC": 10, "I-LOC": 11,
# "O-TRN": 12,
# "B-TRN": 13, "I-TRN": 14,#开车
# }
# 索引和BIO标签对应
# idx2label = {idx: label for label, idx in label2idx.items()}
# # 字符词典文件
# with open('vocab1.txt', "r", encoding="utf8") as fo:
# char_vocabs = [line.strip() for line in fo]
# char_vocabs = special_words + char_vocabs
# # 字符和索引编号对应
# idx2vocab = {idx: char for idx, char in enumerate(char_vocabs)}
# vocab2idx = {char: idx for idx, char in idx2vocab.items()}
# testn1=np.random.randint(len(i_datas1),size=2000)
#####加载数据
##train_datas
#train_labels
#
#test_datas=(np.array(train_datas)[testn1]).tolist()
#test_labels =(np.array(train_labels)[testn1]).tolist()
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Masking, Embedding, Bidirectional, LSTM, Dense, Input, TimeDistributed, Activation
from keras.preprocessing import sequence
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
from keras import backend as K
K.clear_session()  # reset Keras global state before building the model
# Hyperparameters
EPOCHS = 20
BATCH_SIZE = 64
EMBED_DIM = 128  # character embedding size (128/300 are common choices)
HIDDEN_SIZE =256  # LSTM units per direction
MAX_LEN = 100  # every sequence is padded/truncated to this length
VOCAB_SIZE = len(vocab2idx)  # already includes the <PAD>/<UNK> special tokens
CLASS_NUMS = len(label2idx)
#print(VOCAB_SIZE, CLASS_NUMS)
# pad_sequences pads/truncates at the FRONT by default with value 0,
# which is exactly the <PAD> index.
print('padding sequences')
train_datas = sequence.pad_sequences(train_datas, maxlen=MAX_LEN)
train_labels = sequence.pad_sequences(train_labels, maxlen=MAX_LEN)
test_datas = sequence.pad_sequences(test_datas, maxlen=MAX_LEN)
test_labels = sequence.pad_sequences(test_labels, maxlen=MAX_LEN)
print('x_train shape:', train_datas.shape)
print('x_test shape:', test_datas.shape)
# One-hot encode the tag ids: labels become (n_samples, MAX_LEN, CLASS_NUMS)
train_labels = keras.utils.to_categorical(train_labels, CLASS_NUMS)
test_labels = keras.utils.to_categorical(test_labels, CLASS_NUMS)
print('trainlabels shape:', train_labels.shape)
print('testlabels shape:', test_labels.shape)
## BiLSTM+CRF model
# Input: a batch of fixed-length integer id sequences, shape (n_samples, MAX_LEN).
inputs = Input(shape=(MAX_LEN,), dtype='int32')
# NOTE(review): this Masking layer looks redundant — Embedding(mask_zero=True)
# below already masks the 0 (<PAD>) timesteps; confirm before removing.
mask_ = Masking(mask_value=0)(inputs)
# mask_zero=True treats index 0 as padding, so the vocabulary size must
# already count the special tokens (it does — char_vocabs starts with <PAD>).
Embedding_ = Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(mask_)
# return_sequences=True keeps one output per timestep, as sequence tagging needs
Bilstm_ = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True))(Embedding_)
Dense_ = TimeDistributed(Dense(CLASS_NUMS))(Bilstm_)  # per-timestep class scores
outputs = CRF(CLASS_NUMS)(Dense_)  # CRF decodes a globally consistent tag path
model = Model(inputs=inputs, outputs=outputs)
model.summary()
model.compile(loss=crf_loss, optimizer='adam', metrics=[crf_viterbi_accuracy])
model.fit(train_datas, train_labels, epochs=EPOCHS, verbose=1, validation_split=0.1)
score = model.evaluate(test_datas, test_labels, batch_size=BATCH_SIZE)
print(model.metrics_names)
print(score)
# save model
model.save("lstm_crf_model6.h5")
预训练的embedding词向量
大神同学建议使用预训练的embedding词向量,谢谢大神同学!
说明:由于现在要获取的是预训练的embedding层,也就是经过Embedding层后词语的词向量矩阵initW。
过程:从网上下载腾讯词向量(非常大),读取“指定词语的词向量”,也就是说,自己的词库有哪些词才读取该词的词向量,很多参考资料里的“word2id”指的是自己的词典。
参考资料
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 13 01:20:09 2020
@author: Lenovo

Pre-trained embeddings: word vectors are first trained on a large external
corpus (the Tencent vectors, 200-dimensional); we then pick out only the
vectors for the characters of the current task's vocabulary, giving a
(len(vocab)) x 200 matrix.  That matrix is plugged straight into the
Embedding layer with its weights frozen (trainable=False).
Advantages: the bottom layer is task-independent so it can be pre-trained,
and training on big data yields better vectors.
Embedding itself is just a linear projection: with a 5-character vocabulary,
the one-hot row [0,0,0,1,0] times a 5x3 weight matrix yields that
character's 3-dimensional vector — the same matrix is shared by all
characters, so an id sequence like [1,2] (2-D) is replaced by the vector
sequence [vec[1], vec[2]] (3-D).
"""
import pandas as pd
import numpy as np
#from tqdm import tqdm  # wrap an iterable with tqdm(...) to show a progress bar
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Masking, Embedding, Bidirectional, LSTM, Dense, Input, TimeDistributed, Activation
from keras.preprocessing import sequence
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy
from keras import backend as K
K.clear_session()  # reset Keras global state before building the model
def loadEmbedding(embeddingFile, word2id):
    """Load pre-trained vectors for the words of ``word2id`` only.

    ``embeddingFile`` is a word2vec-style text file: a "vocab_size
    vector_size" header line followed by one "word v1 v2 ... vn" line per
    word.  Words absent from the file keep a small random initialization
    (that covers '<UNK>' too); the '<PAD>' row is zeroed so masked
    timesteps contribute nothing.

    NOTE(review): assumes word2id maps words to the contiguous row ids
    0..len(word2id)-1 and contains '<PAD>' — confirm at the call site.

    Returns a (len(word2id), vector_size) numpy matrix.
    """
    with open(embeddingFile, "r", encoding='ISO-8859-1') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())  # Tencent vectors: vector_size == 200
        # Random init for every row; found words are overwritten below.
        initW = np.random.uniform(-0.25, 0.25, (len(word2id), vector_size))
        count = 0
        for i in range(vocab_size):
            line = f.readline()
            lists = line.split(' ')  # file format: word v1 v2 ... vn
            word = lists[0]
            # The file is UTF-8 but was read as latin-1; re-encoding recovers
            # the original bytes so they can be decoded properly.  Words that
            # are not valid UTF-8 keep their raw latin-1 form.
            try:
                word = word.encode('ISO-8859-1').decode('utf8')
            except UnicodeError:  # was a bare except: only swallow codec errors
                pass
            if word in word2id:
                count += 1
                vector = np.array(list(map(float, lists[1:])))
                initW[word2id[word]] = vector  # row index == the word's id in word2id
        print(count)  # how many vocabulary words were actually found in the file
    initW[word2id['<PAD>']] = np.zeros(vector_size)
    return initW
if __name__ == '__main__':
    ####### Load the pre-trained embedding matrix ###########################
    #### Reload the saved corpus (same preprocessing as the data script) ####
    # Excel round-trip pads ragged rows with NaN; rebuild variable-length
    # sequences by keeping only the non-null cells of each row.
    data = pd.read_excel("train_data1.xlsx",header=0,encoding="utf-8")
    label= pd.read_excel("train_label1.xlsx",header=0,encoding="utf-8")
    train_datas=[]
    train_labels=[]
    for i in range(len(data)):
        line=data.iloc[i,]
        temp=[]
        for j in range(sum(line.notnull())-1):  # count non-NaN cells; -1 skips the saved index column
            w=data.iloc[i,j+1]
            temp.append(w)
        train_datas.append(temp)
    for i in range(len(label)):
        line=label.iloc[i,]
        temp=[]
        for j in range(sum(line.notnull())-1):
            w=label.iloc[i,j+1]
            temp.append(w)
        train_labels.append(temp)
    ### Build the character vocabulary (must match the data script exactly)
    from collections import Counter
    word_counts = Counter(row for sample in train_datas for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]  # keep chars seen at least twice
    #np.save('vocab1.npy', vocab)
    special_words = ['<PAD>', '<UNK>'] # special tokens occupy indices 0 and 1
    char_vocabs = special_words + vocab
    idx2vocab = {idx: char for idx, char in enumerate(char_vocabs)}
    vocab2idx = {char: idx for idx, char in idx2vocab.items()}
    # Tencent pre-trained Chinese vectors (a very large download)
    file = r'E:\Tencent_AILab_ChineseEmbedding.txt'
    embedding_matrix=loadEmbedding(file, vocab2idx )  # vocabulary already includes '<PAD>', '<UNK>'
    ######### Model training ###########################
    # BIO tag scheme; the "-" suffix is the entity type (LINK/OPER/LOCLINK/LOC/TRN)
    label2idx = {"O": 0,"O-LINK":1,
    "B-LINK": 2, "I-LINK": 3,
    "B-OPER": 4 ,"I-OPER": 5,    # accompanying person, e.g. 和我妈
    "O-LOCLINK":6,"B-LOCLINK":7, # location connector, e.g. 去 / 去了
    "I-LOCLINK":8,"O-LOC":9,
    "B-LOC": 10, "I-LOC": 11,
    "O-TRN": 12,
    "B-TRN": 13, "I-TRN": 14,    # transport, e.g. 开车
    }
    idx2label = {idx: label for label, idx in label2idx.items()}
    ########## Encode characters and tags as integer ids
    sent=[]
    for line in train_datas:
        temp=[]
        for w in line:
            temp.append(vocab2idx[w] if w in vocab2idx else vocab2idx['<UNK>'])
        sent.append(temp)
    train_datas=sent
    label_sent=[]
    for line in train_labels:
        temp=[]
        for w in line:
            temp.append(label2idx[w] if w in label2idx else 0)  # unknown tags fall back to 0 ("O")
        label_sent.append(temp)
    train_labels= label_sent
    ### Train/test split
    # NOTE(review): as in the data script, randint samples with replacement
    # and the rows stay in the training set, so test scores are optimistic.
    testn1=np.random.randint(len(train_datas),size=1000)
    test_datas=(np.array(train_datas)[testn1]).tolist()
    test_labels =(np.array(train_labels)[testn1]).tolist()
    # Hyperparameters
    EPOCHS = 20
    BATCH_SIZE = 64
    #EMBED_DIM = 128  # unused here: the embedding size is fixed at 200 by the pre-trained vectors
    HIDDEN_SIZE =128  # LSTM units per direction
    MAX_LEN = 100
    VOCAB_SIZE = len(vocab2idx)  # equals embedding_matrix's row count by construction
    CLASS_NUMS = len(label2idx)
    # pad_sequences pads/truncates at the front with value 0 (= <PAD>)
    print('padding sequences')
    train_datas = sequence.pad_sequences(train_datas, maxlen=MAX_LEN)
    train_labels = sequence.pad_sequences(train_labels, maxlen=MAX_LEN)
    test_datas = sequence.pad_sequences(test_datas, maxlen=MAX_LEN)
    test_labels = sequence.pad_sequences(test_labels, maxlen=MAX_LEN)
    print('x_train shape:', train_datas.shape)
    print('x_test shape:', test_datas.shape)
    # One-hot encode the tag ids: labels become (n_samples, MAX_LEN, CLASS_NUMS)
    train_labels = keras.utils.to_categorical(train_labels, CLASS_NUMS)
    test_labels = keras.utils.to_categorical(test_labels, CLASS_NUMS)
    print('trainlabels shape:', train_labels.shape)
    print('testlabels shape:', test_labels.shape)
    ## BiLSTM+CRF with frozen pre-trained embeddings
    inputs = Input(shape=(MAX_LEN,), dtype='int32')
    # NOTE(review): Masking looks redundant with Embedding(mask_zero=True) below.
    mask_ = Masking(mask_value=0)(inputs)
    # 200 = Tencent vector size; trainable=False freezes the pre-trained weights
    Embedding_ = Embedding(VOCAB_SIZE, 200, weights=[embedding_matrix],input_length=MAX_LEN,
    mask_zero=True,trainable=False)(mask_)
    Bilstm_ = Bidirectional(LSTM(HIDDEN_SIZE, return_sequences=True))(Embedding_)
    Dense_ = TimeDistributed(Dense(CLASS_NUMS))(Bilstm_)  # per-timestep class scores
    outputs = CRF(CLASS_NUMS)(Dense_)  # CRF decodes a globally consistent tag path
    model = Model(inputs=inputs, outputs=outputs)
    model.summary()
    model.compile(loss=crf_loss, optimizer='adam', metrics=[crf_viterbi_accuracy])
    model.fit(train_datas, train_labels, epochs=EPOCHS, verbose=1, validation_split=0.1)
    score = model.evaluate(test_datas, test_labels, batch_size=BATCH_SIZE)
    print(model.metrics_names)
    print(score)
    # save model
    model.save("lstm_crf_model8.h5")