正向最大匹配
# -*- coding:utf-8 -*-
# Encoding name constant for this script (UTF-8).
CODEC = "utf-8"
def u(s, encoding):
    """Return ``s`` as a text (unicode) string.

    Args:
        s: A text string, or an encoded byte string to decode.
        encoding: Name of the codec used to decode ``s`` when it is not
            already text.

    Returns:
        ``s`` itself when it is already text, otherwise ``s`` decoded with
        ``encoding``.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text
    # type and ``str(bytes_obj, encoding)`` performs the same decoding.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
def fwd_mm_seg(wordDict, maxLen, str):
    """Segment text with the forward maximum matching algorithm.

    Scanning from left to right, the longest prefix of the remaining text
    (at most ``maxLen`` characters) that is found in ``wordDict`` is emitted
    as a segment. When no candidate of two or more characters matches, a
    single character is emitted, so the scan always advances.

    Args:
        wordDict: Container of known words; only ``in`` membership is used.
        maxLen: Maximum candidate length in characters; must be at least 1.
        str: Text to segment. (The name shadows the builtin but is kept so
            existing keyword callers continue to work.)

    Returns:
        A list of segments in text order; joining them reproduces the input.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1, which would otherwise
            prevent the scan from ever advancing.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1")

    text = str
    textLen = len(text)
    wordList = []
    pos = 0
    while pos < textLen:
        wordLen = min(maxLen, textLen - pos)
        # Shrink the candidate from the right until it is a known word; a
        # single character is always accepted as the fallback.
        while wordLen > 1 and text[pos:pos + wordLen] not in wordDict:
            wordLen -= 1
        wordList.append(text[pos:pos + wordLen])
        pos += wordLen
    return wordList
def main():
    """Segment a sample sentence with the words listed in ``words.dic``.

    Reads one word per line from ``words.dic`` (decoded as UTF-8), runs
    forward maximum matching on a fixed sample sentence with a maximum word
    length of 10, and prints the sentence followed by the segments joined
    with ``==``.
    """
    import io  # local import so this function does not rely on file-level imports

    # io.open decodes to text on both Python 2 and 3, so no manual decoding
    # step is needed; the with-block guarantees the file is closed.
    with io.open("words.dic", encoding="utf-8") as fp_dict:
        wordDict = {}
        for eachWord in fp_dict:
            wordDict[eachWord.strip()] = 1
    segStr = u"你好世界hello world"
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))
# Run the demo only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
逆向最大匹配
# -*- coding:utf-8 -*-
def u(s, encoding):
    """Return ``s`` as a text (unicode) string.

    Args:
        s: A text string, or an encoded byte string to decode.
        encoding: Name of the codec used to decode ``s`` when it is not
            already text.

    Returns:
        ``s`` itself when it is already text, otherwise ``s`` decoded with
        ``encoding``.
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is the text
    # type and ``str(bytes_obj, encoding)`` performs the same decoding.
    try:
        text_type = unicode  # noqa: F821 - Python 2 builtin
    except NameError:
        text_type = str
    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)
# Encoding name constant for this script (UTF-8).
CODEC = "utf-8"
def bwd_mm_seg(wordDict, maxLen, str):
    """Segment text with the backward maximum matching algorithm.

    Scanning from right to left, the longest suffix of the remaining text
    (at most ``maxLen`` characters) that is found in ``wordDict`` is emitted
    as a segment. When no candidate of two or more characters matches, a
    single character is emitted, so the scan always advances.

    Args:
        wordDict: Container of known words; only ``in`` membership is used.
        maxLen: Maximum candidate length in characters; must be at least 1.
        str: Text to segment. (The name shadows the builtin but is kept so
            existing keyword callers continue to work.)

    Returns:
        A list of segments in text order; joining them reproduces the input.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1, which would otherwise
            prevent the scan from ever advancing.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1")

    text = str
    wordList = []
    end = len(text)
    while end > 0:
        wordLen = min(maxLen, end)
        # Shrink the candidate from the left until it is a known word; a
        # single character is always accepted as the fallback.
        while wordLen > 1 and text[end - wordLen:end] not in wordDict:
            wordLen -= 1
        wordList.append(text[end - wordLen:end])
        end -= wordLen
    # Segments were collected right to left; restore reading order.
    wordList.reverse()
    return wordList
def main():
    """Segment a sample sentence with the words listed in ``words.dic``.

    Reads one word per line from ``words.dic`` (decoded as UTF-8), runs
    backward maximum matching on a fixed sample sentence with a maximum word
    length of 10, and prints the sentence followed by the segments joined
    with ``==``.
    """
    import io  # local import so this function does not rely on file-level imports

    # io.open decodes to text on both Python 2 and 3, so no manual decoding
    # step is needed; the with-block guarantees the file is closed.
    with io.open("words.dic", encoding="utf-8") as fp_dict:
        wordDict = {}
        for eachWord in fp_dict:
            wordDict[eachWord.strip()] = 1
    # The ``ur`` prefix is rejected by Python 3; the literal contains no
    # backslashes, so a plain ``u`` literal has the same value.
    segStr = u"你好世界hello world"
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))
# Run the demo only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
以上这篇 Python 正向最大匹配分词和逆向最大匹配分词的实例就是小编分享给大家的全部内容了，希望能给大家一个参考，也希望大家多多支持脚本之家。