摘要:最近有一笔任务,命名实体识别数据太少,需要自己标注数据,结果网上一搜,CRF识别模型代码遍地,但关于数据标注的寥寥,于是从头将NER过程走了一遍,序列标注及NER模型训练。(当练练代码能力了...)
任务:将已有的实体反标回源文件,并标记出B-character,I-character,作为训练数据
主要过程:1.原文本预处理(句子切分-去除标点等)
2.存words,labels两个list,分别存单个字及对应标签(B-I-O)(example:张 B-character)
3.words,labels两个list打包存成tuple形式[(张,B-character)]
4.写入txt,做训练数据
详细实现步骤见下完整代码
import tqdm
import glob
from collections import defaultdict
import json
import re
# Holder for recognized entities.
# NOTE(review): entities_dict appears unused in this script — confirm before removing.
entities_dict = defaultdict()

# Chinese entity-type name -> English BIO tag suffix used in the training data.
# Note: '行业' and '业务' deliberately share the 'industry' tag.
type_dic = {
    '人物': 'character',
    '行业': 'industry',
    '业务': 'industry',
    '产品': 'product',
    '研报': 'report',
    '机构': 'institutions',
    '风险': 'risk',
    '文章': 'articles',
    '指标': 'indicators',
    '品牌': 'brand',
}
# KMP
def KMP_algorithm(string, substring):
    """Return the index of the first occurrence of *substring* in *string*,
    or -1 when there is no match (Knuth-Morris-Pratt matching).

    An empty *substring* matches at index 0, mirroring ``str.find``.
    """
    # Build the failure table inline (same values gen_pnext() produces):
    # failure[i] = length of the longest proper prefix of substring[:i+1]
    # that is also a suffix of it.
    pattern_len = len(substring)
    failure = [0] * pattern_len
    border = 0
    for pos in range(1, pattern_len):
        while border and substring[pos] != substring[border]:
            border = failure[border - 1]
        if substring[pos] == substring[border]:
            border += 1
        failure[pos] = border

    # Scan the text, falling back via the failure table on mismatch.
    text_len = len(string)
    ti = mi = 0  # cursors into string / substring
    while ti < text_len and mi < pattern_len:
        if string[ti] == substring[mi]:
            ti += 1
            mi += 1
        elif mi:
            mi = failure[mi - 1]
        else:
            ti += 1
    return ti - mi if mi == pattern_len else -1
def gen_pnext(substring):
    """Build the KMP failure table for *substring*.

    table[i] is the length of the longest proper prefix of
    substring[:i+1] that is also a suffix of it.
    """
    length = len(substring)
    table = [0] * length
    border = 0  # length of the current matched prefix
    pos = 1
    while pos < length:
        if substring[pos] == substring[border]:
            border += 1
            table[pos] = border
            pos += 1
        elif border:
            # Fall back to the next-shorter border and retry this position.
            border = table[border - 1]
        else:
            table[pos] = 0
            pos += 1
    return table
# ---------------------------------------------------------------------------
# Load every entity recognized upstream.  The file holds a JSON object
# mapping a Chinese type name to a list of entity strings,
# e.g. {"人物": ["薇娅", ...], ...}.
# Fix: the original opened this file in 'r+' mode via open(...).read() and
# never closed the handle; read-only mode in a context manager suffices.
# ---------------------------------------------------------------------------
with open('../hanlp_rules/entities_hanlp_rules_entity.txt', 'r', encoding='utf8') as entities_file:
    entities_json = json.loads(entities_file.read())
print(type(entities_json))
# print('***', entities_json)

words, labels = [], []  # parallel lists: one character / one BIO tag per element

# ---- Split each source report into rough "sentences" ----------------------
for file_path in tqdm.tqdm(glob.glob('../yanbao_txt/*.txt')):
    # Files should be saved as UTF-8; a leading BOM (\ufeff) is harmless.
    # Fix: the original also leaked this handle.
    with open(file_path, encoding='utf-8') as src:
        text = src.read()

    # First pass: remove tagged punctuation such as "?/s"
    # (punctuation before the slash, tag letter after it).
    sentences = re.split('[,。!?、‘’“”]/[bems]', text)
    # Second pass: split the *string repr* of the list on punctuation and
    # repr artifacts.  NOTE(review): round-tripping through str(sentences)
    # is odd but kept as-is to reproduce the original output exactly.
    sentences = re.split(r'[\\n:\[\],。!\' ?;/s]', str(sentences))
    # Drop empty/whitespace-only fragments, then strip the remainder.
    sentences = list(filter(lambda x: x.strip(), sentences))
    sentences = list(map(lambda x: x.strip(), sentences))

    # One character per element; a ' ' marker separates sentences.
    for sentence in sentences:
        for alpha in sentence:
            words.append(alpha)
        words.append(' ')

    # Label each sentence.  Assumes at most one entity per sentence.
    for sentence in sentences:
        # Fix: initialize the search state — the original raised NameError
        # on `ind`/`typ` below whenever entities_json was empty.
        ind = -1
        typ = ''
        entity = ''
        typ_temp = None
        for typ, entities in entities_json.items():
            for entity in entities:
                ind = KMP_algorithm(sentence, entity)
                if ind != -1:
                    typ_temp = typ
                    break
            # '品牌' is the last key in the entities file, so this also ends
            # the scan once every type has been tried (fragile, but kept).
            if ind != -1 or typ == '品牌':
                break
        for i in range(len(sentence)):
            if ind == -1:
                labels.append('O')
                continue
            if i < ind:
                labels.append('O')
            elif i == ind:
                labels.append('B-' + type_dic[typ_temp])
            elif ind < i < ind + len(entity):
                labels.append('I-' + type_dic[typ_temp])
            else:
                labels.append('O')
        labels.append(' ')

print('Words Length', len(words), 'Labels Length', len(labels))
print('Words Example', words[:1000])
print('Labels Example', labels[:1000])

# Pair every character with its tag and write "char tag" lines as training data.
txt_tuple = list(zip(words, labels))
with open('../new_note.txt', 'w', encoding='utf8') as note_txt:
    for one_tuple in txt_tuple:
        note_txt.write(' '.join(one_tuple) + '\n')
entities_hanlp_rules_entity.txt这个文件就是实体:
格式是这样的,供大家作为参考
# 数据格式大概是这样,十个类型,不全,只是给个样例,可以按照样例构造自己的数据代入
{
"人物":[
"薇娅",
"黄海",
"林平",
"汤普森"
],
"行业":[
"电子设备",
"服饰",
"电影",
"电商"
],
"业务":[
"电子设备",
"服饰",
"电影",
"电商"
],
"产品":[
"电子设备",
"服饰",
"电影",
"电商"
]
......
# 共十个类型,对应代码中字典中10个类型,不枚举了
}
-----------------------------------
下篇见NER(BERT+CRF)模型训练过程~