python正向最大匹配分词和逆向最大匹配分词

34 篇文章 6 订阅

正向最大匹配

# -*- coding:utf-8 -*-

# Codec name constant (UTF-8).
CODEC='utf-8'

def u(s, encoding):
    """Return *s* as a text (unicode) string.

    Works on both Python 2 and Python 3: the text type is ``unicode`` where
    that builtin exists and ``str`` otherwise.

    Args:
        s: A text string (returned unchanged) or an encoded byte string.
        encoding: Name of the codec used to decode *s* when it is not
            already text, e.g. ``'utf-8'``.

    Returns:
        The decoded text string.

    Raises:
        UnicodeDecodeError: If *s* is not valid in *encoding*.
        TypeError: If *s* is neither text nor a bytes-like object.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3 has no separate ``unicode`` builtin
    if isinstance(s, text_type):
        return s
    # Both unicode(bytes, enc) on Python 2 and str(bytes, enc) on Python 3
    # decode the byte string with the given codec.
    return text_type(s, encoding)

def fwd_mm_seg(wordDict, maxLen, str):
    """Segment text with forward (left-to-right) maximum matching.

    Starting at the left end of the text, the longest candidate of at most
    ``maxLen`` characters is looked up in ``wordDict``; the candidate is
    shortened from its right end one character at a time until it is found
    in the dictionary or only a single character is left. That piece is
    emitted and scanning continues right after it.

    The dictionary contents, every candidate tried and the final pieces are
    printed to stdout as a trace.

    Args:
        wordDict: Container of known words supporting ``in`` and iteration.
        maxLen: Maximum candidate length in characters; must be >= 1.
        str: The text to segment (the parameter name shadows the builtin
            and is kept only for backward compatibility).

    Returns:
        List of segments in text order.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1 (previously this made
            the loop run forever).
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1, got %r" % (maxLen,))

    text = str
    # Single-argument print(...) with %-formatting produces the same output
    # under the Python 2 print statement and the Python 3 print function.
    for word in wordDict:
        print("word:  %s" % (word,))
    print("\n")

    wordList = []
    pos = 0
    total = len(text)
    while pos < total:
        wordLen = min(maxLen, total - pos)
        subStr = text[pos:pos + wordLen]
        print("subStr:  %s" % (subStr,))
        # Shrink from the right until a dictionary hit; a lone character is
        # always accepted so the scan is guaranteed to advance.
        while wordLen > 1:
            if subStr in wordDict:
                print("subStr1: %r" % (subStr,))
                break
            print("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[:wordLen]
        wordList.append(subStr)
        pos += wordLen

    for wordstr in wordList:
        print("wordstr:  %s" % (wordstr,))
    return wordList
        
            
def main():
    """Load the dictionary from ``words.dic`` and print a forward segmentation.

    Each line of ``words.dic`` is stripped, decoded with ``CODEC`` and stored
    as a dictionary word. The sample sentence is then segmented with a
    maximum word length of 10 and the pieces are printed joined by ``==``.
    """
    wordDict = {}
    # Binary mode yields byte strings on both Python 2 and 3, so u() always
    # does the decoding; ``with`` guarantees the file handle is closed.
    with open('words.dic', 'rb') as fp_dict:
        for eachWord in fp_dict:
            wordDict[u(eachWord.strip(), CODEC)] = 1
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))
    

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
    
    
逆向最大匹配
# -*- coding:utf-8 -*-


def u(s, encoding):
    """Return *s* as a text (unicode) string.

    Works on both Python 2 and Python 3: the text type is ``unicode`` where
    that builtin exists and ``str`` otherwise.

    Args:
        s: A text string (returned unchanged) or an encoded byte string.
        encoding: Name of the codec used to decode *s* when it is not
            already text, e.g. ``'utf-8'``.

    Returns:
        The decoded text string.

    Raises:
        UnicodeDecodeError: If *s* is not valid in *encoding*.
        TypeError: If *s* is neither text nor a bytes-like object.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3 has no separate ``unicode`` builtin
    if isinstance(s, text_type):
        return s
    # Both unicode(bytes, enc) on Python 2 and str(bytes, enc) on Python 3
    # decode the byte string with the given codec.
    return text_type(s, encoding)

# Codec name constant (UTF-8).
CODEC='utf-8'

def bwd_mm_seg(wordDict, maxLen, str):
    """Segment text with backward (right-to-left) maximum matching.

    Starting at the right end of the text, the longest candidate of at most
    ``maxLen`` characters is looked up in ``wordDict``; the candidate is
    shortened from its left end one character at a time until it is found
    in the dictionary or only a single character is left. That piece is
    emitted and scanning continues just before it. The collected pieces are
    reversed at the end so the result reads in text order.

    The dictionary contents, every candidate tried and the final pieces are
    printed to stdout as a trace.

    Args:
        wordDict: Container of known words supporting ``in`` and iteration.
        maxLen: Maximum candidate length in characters; must be >= 1.
        str: The text to segment (the parameter name shadows the builtin
            and is kept only for backward compatibility).

    Returns:
        List of segments in text order.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1 (previously this made
            the loop run forever).
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1, got %r" % (maxLen,))

    text = str
    # Single-argument print(...) with %-formatting produces the same output
    # under the Python 2 print statement and the Python 3 print function.
    for word in wordDict:
        print("word:  %s" % (word,))
    print("\n")

    wordList = []
    end = len(text)
    while end > 0:
        wordLen = min(maxLen, end)
        subStr = text[end - wordLen:end]
        print("subStr:  %s" % (subStr,))
        # Shrink from the left until a dictionary hit; a lone character is
        # always accepted so the scan is guaranteed to advance.
        while wordLen > 1:
            if subStr in wordDict:
                print("subStr1: %r" % (subStr,))
                break
            print("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[-wordLen:]
        wordList.append(subStr)
        end -= wordLen

    # Pieces were collected right-to-left; restore reading order.
    wordList.reverse()
    for wordstr in wordList:
        print("wordstr:  %s" % (wordstr,))
    return wordList
        
            
def main():
    """Load the dictionary from ``words.dic`` and print a backward segmentation.

    Each line of ``words.dic`` is stripped, decoded with ``CODEC`` and stored
    as a dictionary word. The sample sentence is then segmented with a
    maximum word length of 10 and the pieces are printed joined by ``==``.
    """
    wordDict = {}
    # Binary mode yields byte strings on both Python 2 and 3, so u() always
    # does the decoding; ``with`` guarantees the file handle is closed.
    with open('words.dic', 'rb') as fp_dict:
        for eachWord in fp_dict:
            wordDict[u(eachWord.strip(), CODEC)] = 1
    # The literal has no backslashes, so u'' equals the former ur'' literal
    # while also being valid Python 3 syntax.
    segStr = u'你好世界hello world'
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
    
    


Python中,我们可以使用动态规划(DP)来实现正向最大匹配和逆向最大匹配算法。这里我们假设`wordsdic.txt`是一个文本文件,包含单词列表。为了简化,我们将仅展示核心代码,并未包括读取文件的时间计算部分,因为这通常会依赖于实际文件操作。

注意:KMP 是单模式子串查找算法,Manacher 是最长回文子串算法,二者都不是基于词典的最大匹配分词;真正的正向/逆向最大匹配实现请参见上文的 `fwd_mm_seg` 与 `bwd_mm_seg`。

**正向最大匹配 (KMP 算法)**:

```python
def build_lps(pattern):
    lps = [0] * len(pattern)
    j = -1
    for i in range(1, len(pattern)):
        while j > -1 and pattern[i] != pattern[j + 1]:
            j = lps[j]
        if pattern[i] == pattern[j + 1]:
            j += 1
        lps[i] = j + 1
    return lps

def kmp_search(words, pattern):
    words_list = open('wordsdic.txt', 'r').read().splitlines()
    lps = build_lps(pattern)
    max_match = 0
    matched_words = []
    for word in words_list:
        i, j = 0, 0
        while i < len(word) and j < len(pattern):
            if word[i] == pattern[j]:
                i += 1
                j += 1
            elif j > 0:
                j = lps[j - 1]
            else:
                i += 1
        if j == len(pattern):
            max_match += 1
            matched_words.append((word, max_match))
        # 如果当前word不足以匹配pattern,则移动到下一个位置
        else:
            i -= 1
    print("Matched Words:", matched_words)

# 使用示例
kmp_search(words=words_list, pattern='example')
```

**逆向最大匹配 (Manacher's Algorithm)**:

由于Manacher's Algorithm更适用于处理字符串上的最长回文子串问题,而此处的需求是匹配单词,直接应用可能不是最优化的选择。但对于好奇者,它的基本原理可以这样描述:

```python
def manacher(s):
    # ... 实现Manacher's Algorithm的代码 ...

kmp_search(words=words_list, pattern='example', method='manacher')  # 示例,需自行实现Manacher's Algorithm部分
```

对于时间计算,你可以使用`timeit`库来测量函数运行时间。例如,在每次匹配之后加上一行:

```python
import timeit

start_time = timeit.default_timer()
# ... 执行匹配算法 ...
end_time = timeit.default_timer()
print(f"Time elapsed: {end_time - start_time} seconds")
```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值