切分算法

最新推荐文章于 2023-08-06 15:47:31 发布

Mr_cfl

最新推荐文章于 2023-08-06 15:47:31 发布

阅读量464

点赞数 2

文章标签： python 自然语言处理

本文链接：https://blog.csdn.net/Mr_cfl/article/details/110654671

版权

def fully_segment(text, dic):
    """Return every substring of ``text`` that is found in ``dic``.

    Substrings are enumerated by start offset and, for each start offset, by
    increasing length, so the result is ordered by start position first and
    by length second. A word that occurs at several positions is reported
    once per occurrence.

    Args:
        text: The string to scan.
        dic: Container of known words; only ``in`` membership tests are used.

    Returns:
        A list of all matching substrings (empty if nothing matches).
    """
    size = len(text)
    return [
        text[start:end]
        for start in range(size)
        for end in range(start + 1, size + 1)
        if text[start:end] in dic
    ]



def forward_segment(text, dic):
    """Segment ``text`` left to right, taking the longest match each time.

    At each position the longest substring starting there that is found in
    ``dic`` is emitted. If no dictionary word starts at that position, the
    single character at that position is emitted instead, so the output
    always concatenates back to ``text``.

    Args:
        text: The string to segment.
        dic: Container of known words; only ``in`` membership tests are used.

    Returns:
        The list of segments in text order (empty for empty ``text``).
    """
    tokens = []
    total = len(text)
    pos = 0
    while pos < total:
        # Candidates grow in length, so the last hit is the longest one.
        hits = [
            text[pos:end]
            for end in range(pos + 1, total + 1)
            if text[pos:end] in dic
        ]
        token = hits[-1] if hits else text[pos]
        tokens.append(token)
        pos += len(token)
    return tokens




def backward_segment(text, dic):
    """Segment ``text`` right to left, taking the longest match each time.

    Starting from the end of ``text``, the longest substring (of at least two
    characters) ending at the current position that is found in ``dic`` is
    taken. If there is none, the single character at that position is taken.
    Segments are returned in their original left-to-right order.

    Args:
        text: The string to segment.
        dic: Container of known words; only ``in`` membership tests are used.

    Returns:
        The list of segments in text order (empty for empty ``text``).
    """
    pieces = []
    end = len(text)  # exclusive right edge of the part not yet segmented
    while end > 0:
        # Start offsets run from 0 upwards, i.e. longest candidate first, so
        # the first dictionary hit is the longest match ending at ``end``.
        token = next(
            (
                text[start:end]
                for start in range(end - 1)
                if text[start:end] in dic
            ),
            text[end - 1],
        )
        pieces.append(token)
        end -= len(token)
    # Segments were collected from the right; restore reading order.
    pieces.reverse()
    return pieces




def count_single_char(word_list:list):
    """Return how many entries of ``word_list`` have length exactly 1."""
    singles = 0
    for token in word_list:
        if len(token) == 1:
            singles += 1
    return singles
def bidirectional_segment(text, dic):
    """Choose between the forward and backward longest-match segmentations.

    Selection rules, applied in order:
      1. The segmentation with fewer segments wins.
      2. On equal segment counts, the forward result wins only if it has
         strictly fewer single-character segments.
      3. Otherwise the backward result is returned.

    Args:
        text: The string to segment.
        dic: Container of known words, passed through to both segmenters.

    Returns:
        The list produced by ``forward_segment`` or ``backward_segment``.
    """
    forward = forward_segment(text, dic)
    backward = backward_segment(text, dic)

    # Rule 1: a different number of segments decides immediately.
    if len(forward) != len(backward):
        return forward if len(forward) < len(backward) else backward

    # Rule 2: same length, forward needs strictly fewer single characters.
    if count_single_char(forward) < count_single_char(backward):
        return forward

    # Rule 3: every remaining tie goes to the backward segmentation.
    return backward




# Small demo dictionary used by the interactive loop below. It stays a list
# (the segmenters only need ``in`` membership tests); the accidental second
# '研究' entry of the original listing has been dropped.
dic=['商','商品','品','和','和服','服','服务','务',
     '就','就读','读','北','北京','北京大学','京','大','大学','学',
     '研究','研究生','生','生命','起源',
     '欢','欢迎','新','迎新','老','老师','师生','生前','前来','来','就餐',
     '项','项目','目的','的',
     '当','当下','下雨天','雨天','地面','积水',
     '结婚','和尚','尚未','未']

# Run the interactive demo only when executed as a script, so importing this
# module for its functions does not block on input().
if __name__ == "__main__":
    while True:
        print('请输入句子：')
        try:
            text = input()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user pressed Ctrl-C: this is the only
            # way out of the loop, so exit quietly instead of with a traceback.
            break
        print('完全切分:\n',fully_segment(text,dic))
        print('最长匹配:\n',forward_segment(text,dic))
        print('逆向最长匹配:\n',backward_segment(text,dic))
        print('双向最长匹配:\n',bidirectional_segment(text,dic))