最大匹配算法与双向最大匹配算法实现-CSDN博客

本文链接：https://blog.csdn.net/qq_57063581/article/details/123715585

第一关

def cutA(sentence, dictA):
    """Segment ``sentence`` with forward maximum matching and print the result.

    Scanning from the left, the longest prefix of the remaining text that is
    found in ``dictA`` becomes the next word.  Candidates are never longer
    than the longest dictionary entry.  A single character that matches
    nothing is emitted as a word of its own.

    Args:
        sentence: The string to segment.
        dictA: The machine dictionary; it is iterated once to find the
            longest entry and then used for membership tests.

    Returns:
        The list of words in sentence order.  The same list is printed
        with ``print`` before returning.
    """
    result = []  # words found so far

    # Length of the longest dictionary entry.  Clamped to at least 1 so an
    # empty dictionary does not raise and a zero-length "longest" entry
    # cannot stop the scan from advancing.
    max_word_len = max(1, max((len(word) for word in dictA), default=1))

    start = 0
    total = len(sentence)
    while start < total:
        # Never try a candidate longer than the text that is left.
        cut_len = min(max_word_len, total - start)

        # Drop the last character until the candidate is a dictionary word
        # or only one character remains.
        while cut_len > 1 and sentence[start:start + cut_len] not in dictA:
            cut_len -= 1

        result.append(sentence[start:start + cut_len])
        start += cut_len

    print(result)  # output the segmentation
    return result

第二关

def cutB(sentence, dictB):
    """Segment ``sentence`` with backward maximum matching and print the result.

    Scanning from the right, the longest suffix of the remaining text that
    is found in ``dictB`` becomes the next word.  Candidates are never longer
    than the longest dictionary entry.  A single character that matches
    nothing is emitted as a word of its own.

    Args:
        sentence: The string to segment.
        dictB: The machine dictionary; it is iterated once to find the
            longest entry and then used for membership tests.

    Returns:
        The list of words in sentence order.  The same list is printed
        with ``print(..., end="")`` before returning.
    """
    result = []  # words found so far, collected right-to-left

    # Clamped to at least 1: an empty dictionary must not raise, and a
    # zero-length window would make ``sentence[-0:]`` select everything
    # while consuming nothing.
    max_word_len = max(1, max((len(word) for word in dictB), default=1))

    end = len(sentence)
    while end > 0:
        # Never try a candidate longer than the text that is left.
        cut_len = min(max_word_len, end)

        # Drop the first character until the candidate is a dictionary word
        # or only one character remains.
        while cut_len > 1 and sentence[end - cut_len:end] not in dictB:
            cut_len -= 1

        result.append(sentence[end - cut_len:end])
        end -= cut_len

    # Words were gathered from the tail, so restore reading order.
    result.reverse()
    print(result, end="")
    return result

第三关

class BiMM():
    """Bidirectional maximum matching (BiMM) word segmenter.

    ``MMseg`` and ``RMMseg`` segment a text forwards and backwards against a
    dictionary; ``main`` picks one of the two segmentations.
    """

    def __init__(self, window_size=3):
        """Create a segmenter.

        Args:
            window_size: Maximum candidate length in characters, i.e. the
                length of the longest dictionary word.  Defaults to 3.
        """
        self.window_size = window_size  # length of the longest dictionary word

    def MMseg(self, text, dict):  # forward maximum matching
        """Segment ``text`` left-to-right, preferring the longest match.

        Args:
            text: The string to segment.
            dict: Collection of known words, used only for membership
                tests.  (The name shadows the builtin but is kept because
                it is part of the public signature.)

        Returns:
            The list of words in text order; unmatched characters appear as
            single-character words.
        """
        result = []
        # A window below 1 could never consume a character.
        window = max(self.window_size, 1)
        index = 0
        text_length = len(text)
        while index < text_length:
            end = min(index + window, text_length)
            # Shrink from the right until a dictionary word or one character.
            while end - index > 1 and text[index:end] not in dict:
                end -= 1
            result.append(text[index:end])
            index = end
        return result

    def RMMseg(self, text, dict):  # backward maximum matching
        """Segment ``text`` right-to-left, preferring the longest match.

        Args:
            text: The string to segment.
            dict: Collection of known words, used only for membership tests.

        Returns:
            The list of words in text order; unmatched characters appear as
            single-character words.
        """
        result = []
        window = max(self.window_size, 1)
        index = len(text)
        while index > 0:
            # Clamp at 0: a negative slice start would wrap around to the
            # end of the string and select the wrong characters.
            start = max(index - window, 0)
            # Shrink from the left until a dictionary word or one character.
            while index - start > 1 and text[start:index] not in dict:
                start += 1
            result.append(text[start:index])
            index = start
        # Words were gathered from the tail, so restore reading order.
        result.reverse()
        return result

    def main(self, text, r1, r2):
        """Choose between the forward and backward segmentations and print it.

        The segmentation with fewer words wins.  When both have the same
        number of words, the one with fewer single-character words wins,
        and a complete tie goes to the backward result.

        Args:
            text: The original text (not used by the selection itself).
            r1: Word list from forward maximum matching.
            r2: Word list from backward maximum matching.

        Returns:
            The chosen word list, which is also printed with
            ``print(..., end="")``.
        """
        if len(r1) > len(r2):
            chosen = r2
        elif len(r1) < len(r2):
            chosen = r1
        else:
            # NOTE(review): the original source stops right after computing
            # num1, so this equal-length branch produced no output.  The
            # comparison below follows the usual bidirectional-matching
            # rule; confirm the tie-break against the intended exercise.
            num1 = sum(1 for s in r1 if isinstance(s, str) and len(s) == 1)
            num2 = sum(1 for s in r2 if isinstance(s, str) and len(s) == 1)
            chosen = r1 if num1 < num2 else r2
        print(chosen, end="")
        return chosen

规则分词法