Python---正向、逆向和双向最大匹配算法

使用 Python 实现正向、逆向和双向最大匹配分词算法
正向最大匹配

class leftMax(object):
    """Forward (left-to-right) maximum-matching word segmenter.

    Starting at the beginning of the text, the longest dictionary word
    that begins at the current position is emitted; when no dictionary
    word matches, a single character is emitted instead.
    """

    def __init__(self, dict_path):
        """Load the word dictionary from a UTF-8 text file.

        Blank lines are skipped. Every other line is split on tab
        characters and its second field is stored as a word.

        Args:
            dict_path: Path of the dictionary file.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If a non-blank line contains no tab character.
        """
        self.dictionary = set()  # known words
        self.maximum = 0  # length of the longest known word

        with open(dict_path, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                word = line.split('\t')[1]
                self.dictionary.add(word)
                # Measure the word itself rather than the whole line: the
                # other tab-separated columns would otherwise widen the
                # matching window and cause pointless lookups in cut().
                if len(word) > self.maximum:
                    self.maximum = len(word)

    def cut(self, text):
        """Segment ``text`` from left to right.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens in text order; joining them reproduces
            ``text``. An empty string yields an empty list.
        """
        result = []
        total = len(text)
        index = 0
        while index < total:
            # Never let the candidate window run past the end of the text.
            longest = min(self.maximum, total - index)
            for size in range(longest, 0, -1):
                piece = text[index:index + size]
                if piece in self.dictionary:
                    break
            else:
                # No dictionary word starts here: emit one character.
                piece = text[index]
            result.append(piece)
            index += len(piece)
        return result
def main():
    """Segment a sample sentence with forward maximum matching and print it."""
    sentence = "北京大学生前来应聘算法工程师岗位"
    segmenter = leftMax('XXX/ChineseDic.txt')
    tokens = segmenter.cut(sentence)
    print(tokens)


main()
['北京大学', '生前', '来', '应聘', '算法', '工程师', '岗位']

逆向最大匹配

class rightMax(object):
    """Backward (right-to-left) maximum-matching word segmenter.

    Starting at the end of the text, the longest dictionary word that
    ends at the current position is emitted; when no dictionary word
    matches, a single character is emitted instead.
    """

    def __init__(self, dict_path):
        """Load the word dictionary from a UTF-8 text file.

        Blank lines are skipped. Every other line is split on tab
        characters and its second field is stored as a word.

        Args:
            dict_path: Path of the dictionary file.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If a non-blank line contains no tab character.
        """
        self.dictionary = set()  # known words
        self.maximum = 0  # length of the longest known word

        with open(dict_path, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                word = line.split('\t')[1]
                self.dictionary.add(word)
                # Measure the word itself rather than the whole line: the
                # other tab-separated columns would otherwise widen the
                # matching window and cause pointless lookups in cut().
                if len(word) > self.maximum:
                    self.maximum = len(word)

    def cut(self, text):
        """Segment ``text`` from right to left.

        Args:
            text: The string to segment.

        Returns:
            A list of tokens in text order; joining them reproduces
            ``text``. An empty string yields an empty list.
        """
        result = []
        index = len(text)  # exclusive end of the not-yet-segmented prefix
        while index > 0:
            # Never let the candidate window start before the text does.
            longest = min(self.maximum, index)
            for size in range(longest, 0, -1):
                piece = text[index - size:index]
                if piece in self.dictionary:
                    break
            else:
                # No dictionary word ends here: emit one character.
                piece = text[index - 1:index]
            result.append(piece)
            index -= len(piece)
        # Tokens were collected back to front; restore text order.
        return result[::-1]
    
def main():
    """Segment a sample sentence with backward maximum matching and print it."""
    sentence = "北京大学生前来应聘算法工程师岗位"
    segmenter = rightMax('XXX/ChineseDic.txt')
    tokens = segmenter.cut(sentence)
    print(tokens)


main()
['北京', '大学生', '前来', '应聘', '算法', '工程师', '岗位']

双向最大匹配

def doubleMax(text, path):
    """Segment ``text`` with bidirectional maximum matching.

    The text is segmented with both ``leftMax`` (forward) and
    ``rightMax`` (backward), and one of the two results is chosen:

    1. If the token counts differ, the result with fewer tokens wins.
    2. If both results are identical, the forward result is returned.
    3. Otherwise the result with fewer single-character tokens wins;
       on a tie the backward result is returned.

    Args:
        text: The string to segment.
        path: Path of the dictionary file passed to both segmenters.

    Returns:
        The chosen list of tokens.
    """
    left = leftMax(path)
    right = rightMax(path)

    left_match = left.cut(text)
    right_match = right.cut(text)

    # Prefer the segmentation with fewer tokens.
    if len(left_match) != len(right_match):
        if len(left_match) < len(right_match):
            return left_match
        return right_match

    # Same token count and same tokens: nothing to choose between.
    if left_match == right_match:
        return left_match

    # Same token count but different tokens: prefer fewer single characters.
    left_single = sum(1 for word in left_match if len(word) == 1)
    right_single = sum(1 for word in right_match if len(word) == 1)
    if left_single < right_single:
        return left_match
    return right_match

# Demo: segment a sample sentence with bidirectional maximum matching
# and print the chosen token list.
text = "北京大学生前来应聘算法工程师岗位"
print(doubleMax(text,'XXX/ChineseDic.txt'))
['北京', '大学生', '前来', '应聘', '算法', '工程师', '岗位']

ChineseDic.txt词典可从该处下载:中文分词 词库
完!

评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值