LTP
LTP是哈工大社会计算与信息检索研究中心历时多年研发的一整套高效、高精度的中文自然语言处理开源基础技术平台。该平台集词法分析(分词、词性标注、命名实体识别)、句法分析(依存句法分析)和语义分析(语义角色标注、语义依存分析)等多项自然语言处理技术于一体。笔者尝试用LTP提取时间、人物、地点以及言论观点句。
LTP使用
# Locations supplied by the caller: the LTP model directory and the
# user-defined segmentation lexicon.
ltp_data_path = args["ltp_data_path"]
lexicon_path = args["lexicon_path"]

# Model files inside the LTP data directory.
cws_model_path = os.path.join(ltp_data_path, 'cws.model')     # word segmentation
pos_model_path = os.path.join(ltp_data_path, 'pos.model')     # part-of-speech tagging
ner_model_path = os.path.join(ltp_data_path, 'ner.model')     # named entity recognition
par_model_path = os.path.join(ltp_data_path, 'parser.model')  # dependency parsing

# Word segmentation, loaded together with the custom lexicon.
segmentor = Segmentor()
segmentor.load_with_lexicon(cws_model_path, lexicon_path)

# Part-of-speech tagging.
postagger = Postagger()
postagger.load(pos_model_path)

# Named entity recognition.
recognizer = NamedEntityRecognizer()
recognizer.load(ner_model_path)

# Dependency parsing.
parser = Parser()
parser.load(par_model_path)
LTP-- 提取时间
利用分词和词性标注提取时间,代码如下
def extract_time(text, segmentor, postagger):
    """Extract time expressions from ``text``.

    The text is segmented and POS-tagged. Each run of consecutive tokens
    tagged ``'nt'`` is merged into a single expression. The connectors
    '至' and '到' may appear inside a run so that ranges such as
    "<time>到<time>" stay together, but a connector that is not followed by
    another ``'nt'`` token is not kept at the end of the expression.

    Args:
        text: Raw input string.
        segmentor: Object whose ``segment(text)`` yields the tokens.
        postagger: Object whose ``postag(words)`` yields one tag per token.

    Returns:
        A list of unique expressions in order of first appearance. An
        expression that is a proper substring of another extracted
        expression is dropped.
    """
    connectors = ('至', '到')
    words = list(segmentor.segment(text))
    postags = list(postagger.postag(words))
    # Guard against a tagger returning a different number of tags.
    total = min(len(words), len(postags))

    spans = []
    start = 0
    while start < total:
        if postags[start] != 'nt':
            start += 1
            continue
        # Extend the run; the bounds check prevents running off the end when
        # the time expression is the last thing in the text.
        end = start
        while end < total and (postags[end] == 'nt' or words[end] in connectors):
            end += 1
        # Every non-'nt' token inside the run is a connector; trim the ones
        # left dangling at the tail. The run starts on an 'nt' token, so this
        # always leaves at least one token.
        stop = end
        while postags[stop - 1] != 'nt':
            stop -= 1
        spans.append(''.join(words[start:stop]))
        start = end

    # De-duplicate while keeping first-appearance order, then drop any
    # expression contained in a longer one.
    unique = list(dict.fromkeys(spans))
    return [
        span for span in unique
        if not any(span != other and span in other for other in unique)
    ]
LTP-- 提取人物
def extract_person_entity(context, segmentor, postagger, recognizer):
    """Extract person names from ``context``.

    The text is segmented, POS-tagged and passed through the named entity
    recognizer. A token whose entity tag contains ``'Nh'`` and starts with
    ``'S'`` is taken as a complete name. A tag starting with ``'B'`` opens a
    multi-token name, which is joined with the following ``'I-Nh'`` tokens
    up to and including the closing ``'E-Nh'`` token.

    Args:
        context: Raw input string.
        segmentor: Object whose ``segment(text)`` yields the tokens.
        postagger: Object whose ``postag(tokens)`` yields one tag per token.
        recognizer: Object whose ``recognize(tokens, postags)`` yields one
            entity tag per token.

    Returns:
        A list of unique names longer than one character, in order of first
        appearance.
    """
    token = segmentor.segment(context)
    postags = postagger.postag(token)
    netags = [str(tag) for tag in recognizer.recognize(token, postags)]
    words = list(token)
    total = min(len(words), len(netags))

    found = []
    for idx, (tag, word) in enumerate(zip(netags, words)):
        if 'Nh' not in tag:
            continue
        if tag.startswith('S'):
            found.append(word)
        elif tag.startswith('B'):
            # Collect the continuation tokens that follow THIS token. The
            # scan is bounded so that a name without a closing 'E-Nh' tag
            # yields the tokens gathered so far instead of an IndexError.
            parts = [word]
            nxt = idx + 1
            while nxt < total and netags[nxt] in ('I-Nh', 'E-Nh'):
                parts.append(words[nxt])
                if netags[nxt] == 'E-Nh':
                    break
                nxt += 1
            found.append(''.join(parts))

    # Single-character results are discarded, as before.
    return [name for name in dict.fromkeys(found) if len(name) > 1]
LTP-- 提取地点
def extract_location_entity(sentences, segmentor, postagger, recognizer):
    """Extract place names from the first sentence in ``sentences``.

    Only ``sentences[0]`` is analysed. It is segmented, POS-tagged and
    passed through the named entity recognizer. A token whose entity tag
    contains ``'Ns'`` and starts with ``'S'`` is taken as a complete place
    name. A tag starting with ``'B'`` opens a multi-token name, which is
    joined with the following ``'I-Ns'`` tokens up to and including the
    closing ``'E-Ns'`` token.

    Args:
        sentences: List of strings; only the first element is used.
        segmentor: Object whose ``segment(text)`` yields the tokens.
        postagger: Object whose ``postag(tokens)`` yields one tag per token.
        recognizer: Object whose ``recognize(tokens, postags)`` yields one
            entity tag per token.

    Returns:
        A list of unique place names with 2 to 5 characters, in order of
        first appearance. A name containing "市" is dropped when the same
        name without "市" was also extracted. An empty ``sentences`` gives
        an empty list.
    """
    if not sentences:
        return []

    token = segmentor.segment(sentences[0])
    postags = postagger.postag(token)
    netags = [str(tag) for tag in recognizer.recognize(token, postags)]
    words = list(token)
    total = min(len(words), len(netags))

    found = []
    for idx, (tag, word) in enumerate(zip(netags, words)):
        if 'Ns' not in tag:
            continue
        if tag.startswith('S'):
            found.append(word)
        elif tag.startswith('B'):
            # Collect the continuation tokens that follow THIS token. The
            # scan is bounded so that a name without a closing 'E-Ns' tag
            # yields the tokens gathered so far instead of an IndexError.
            parts = [word]
            nxt = idx + 1
            while nxt < total and netags[nxt] in ('I-Ns', 'E-Ns'):
                parts.append(words[nxt])
                if netags[nxt] == 'E-Ns':
                    break
                nxt += 1
            found.append(''.join(parts))

    # Keep names of plausible length only (more than 1, fewer than 6 chars).
    candidates = [place for place in dict.fromkeys(found) if 1 < len(place) < 6]

    # When both "X市" and "X" were extracted, keep only the short form.
    present = set(candidates)
    return [
        place for place in candidates
        if not ("市" in place and place.replace("市", '') in present)
    ]
笔者尝试使用LTP抽取时间人物地点,其中时间抽取涉及到了字典,抽取不是很准确。如有错误,欢迎大家指正。