LTP--提取时间人物地点

最新推荐文章于 2024-12-16 14:21:49 发布

xuanningmeng

最新推荐文章于 2024-12-16 14:21:49 发布

阅读量2k

点赞数 2

分类专栏： NLP 文章标签：自然语言处理 python

本文链接：https://blog.csdn.net/weixin_42223207/article/details/116110532

版权

NLP 专栏收录该内容

25 篇文章

订阅专栏

本文介绍了如何利用哈工大LTP平台进行高效的中文文本处理，详细展示了如何通过分词、词性标注和命名实体识别技术，精准抽取文本中的时间、人物和地点信息。通过示例代码，读者可以学习如何改进时间实体提取并优化人物和地点的识别精度。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

LTP

LTP是哈工大社会计算与信息检索研究中心历时多年研发的一整套高效、高精度的中文自然语言处理开源基础技术平台。该平台集词法分析（分词、词性标注、命名实体识别）、句法分析（依存句法分析）和语义分析（语义角色标注、语义依存分析）等多项自然语言处理技术于一体。笔者尝试用LTP提取时间、人物、地点以及言论观点句。

LTP使用

# Directory holding the LTP model files, taken from the `args` mapping.
ltp_data_path = args["ltp_data_path"]
# Named-entity-recognition model path
ner_model_path = os.path.join(ltp_data_path, "ner.model")
# Word-segmentation (tokenizer) model path
cws_model_path = os.path.join(ltp_data_path, 'cws.model')
# Part-of-speech tagging model path
pos_model_path = os.path.join(ltp_data_path, 'pos.model')
# Dependency-parsing model path
par_model_path = os.path.join(ltp_data_path, 'parser.model')
# Path of the user-defined lexicon, also taken from `args`
lexicon_path = args["lexicon_path"]

# Word segmentation: the segmenter is loaded together with the custom lexicon
segmentor = Segmentor()
segmentor.load_with_lexicon(cws_model_path, lexicon_path)

# Part-of-speech tagging
postagger = Postagger()
postagger.load(pos_model_path)

# Named-entity recognition
recognizer = NamedEntityRecognizer()
recognizer.load(ner_model_path)

# Dependency parsing
parser = Parser()   # create the dependency-parser instance
parser.load(par_model_path)

LTP-- 提取时间

利用分词和词性标注提取时间（词性标注为 nt 的词视为时间词，连续的时间词合并为一个时间实体），代码如下：

def extract_time(text, segmentor, postagger):
    """Extract time expressions from ``text``.

    The text is segmented and POS-tagged; every maximal run of tokens tagged
    ``'nt'`` (optionally joined by the range connectors '至' / '到') is merged
    into a single time string.

    :param text: raw input text
    :param segmentor: object whose ``segment(text)`` yields the tokens
    :param postagger: object whose ``postag(words)`` yields one tag per token
    :return: list of unique time strings in order of first appearance; a
        string that is a proper substring of another extracted string is
        dropped
    """
    connectors = ('至', '到')
    words = list(segmentor.segment(text))
    postags = list(postagger.postag(words))
    # Guard against a tagger that returns fewer/more tags than tokens.
    total = min(len(words), len(postags))

    spans = []
    i = 0
    while i < total:
        if postags[i] != 'nt':
            i += 1
            continue
        # Extend the run while we stay inside the text; the bounds check
        # prevents an IndexError when the time run ends the sentence.
        j = i
        while j < total and (postags[j] == 'nt' or words[j] in connectors):
            j += 1
        # A connector is only meaningful between two time words, so drop
        # trailing connectors (e.g. "昨天 到 北京" must yield "昨天").
        end = j
        while end > i and postags[end - 1] != 'nt':
            end -= 1
        spans.append(''.join(words[i:end]))
        # Continue after the run; its suffixes would only be substrings of
        # the span just collected and would be discarded below anyway.
        i = j

    # De-duplicate while keeping first-appearance order (deterministic,
    # unlike a plain set), then drop spans contained in a longer span.
    unique = list(dict.fromkeys(spans))
    return [
        span for span in unique
        if not any(span != other and span in other for other in unique)
    ]

LTP-- 提取人物

def extract_person_entity(context, segmentor, postagger, recognizer):
    """Extract person names from ``context`` using NER tags.

    Tokens tagged ``S-Nh`` are taken as single-token names; a ``B-Nh`` token
    starts a multi-token name that is joined with the following ``I-Nh``
    tokens and the closing ``E-Nh`` token.

    :param context: raw input text
    :param segmentor: object whose ``segment(text)`` yields the tokens
    :param postagger: object whose ``postag(tokens)`` yields the POS tags
    :param recognizer: object whose ``recognize(tokens, postags)`` yields one
        NER tag per token
    :return: list of unique names longer than one character, in order of
        first appearance
    """
    # Segment the text
    token = segmentor.segment(context)
    # POS-tag the tokens
    postags = postagger.postag(token)
    # NER tags
    netags = list(recognizer.recognize(token, postags))
    words = list(token)
    total = min(len(words), len(netags))

    persons = []
    for start in range(total):
        tag = str(netags[start])
        if 'Nh' not in tag:
            continue
        if tag.startswith('S'):
            persons.append(words[start])
        elif tag.startswith('B'):
            # Scan forward from the B- token itself (not from the start of
            # the sentence) and never run past the end of the tag list, so a
            # missing E-Nh cannot raise IndexError.
            end = start + 1
            while end < total and str(netags[end]) == 'I-Nh':
                end += 1
            if end < total and str(netags[end]) == 'E-Nh':
                end += 1
            persons.append(''.join(words[start:end]))

    # dict.fromkeys de-duplicates while keeping a deterministic order;
    # single-character results are discarded.
    return [name for name in dict.fromkeys(persons) if len(name) > 1]

LTP-- 提取地点

def extract_location_entity(sentences, segmentor, postagger, recognizer):
    """Extract place names from the first sentence using NER tags.

    Only ``sentences[0]`` is analysed. Tokens tagged ``S-Ns`` are taken as
    single-token places; a ``B-Ns`` token starts a multi-token place that is
    joined with the following ``I-Ns`` tokens and the closing ``E-Ns`` token.

    :param sentences: list of sentence strings; an empty list yields ``[]``
    :param segmentor: object whose ``segment(text)`` yields the tokens
    :param postagger: object whose ``postag(tokens)`` yields the POS tags
    :param recognizer: object whose ``recognize(tokens, postags)`` yields one
        NER tag per token
    :return: list of unique place names of 2 to 5 characters, in order of
        first appearance; a name containing "市" is dropped when the same
        name without "市" is also present
    """
    if not sentences:
        return []

    # Segment the first sentence
    token = segmentor.segment(sentences[0])
    # POS-tag the tokens
    postags = postagger.postag(token)
    # NER tags
    netags = list(recognizer.recognize(token, postags))
    words = list(token)
    total = min(len(words), len(netags))

    places = []
    for start in range(total):
        tag = str(netags[start])
        if 'Ns' not in tag:
            continue
        if tag.startswith('S'):
            places.append(words[start])
        elif tag.startswith('B'):
            # Scan forward from the B- token itself (not from the start of
            # the sentence) and never run past the end of the tag list, so a
            # missing E-Ns cannot raise IndexError.
            end = start + 1
            while end < total and str(netags[end]) == 'I-Ns':
                end += 1
            if end < total and str(netags[end]) == 'E-Ns':
                end += 1
            places.append(''.join(words[start:end]))

    # De-duplicate in a deterministic order and keep names of 2-5 characters.
    final_location = [p for p in dict.fromkeys(places) if 1 < len(p) < 6]

    # Drop "X市" whenever plain "X" was also found. Every city entry is
    # checked, so the outcome does not depend on which one happens to be
    # listed first.
    for city in [p for p in final_location if "市" in p]:
        city_name = city.replace("市", '')
        if city_name in final_location:
            final_location.remove(city)
    return final_location