Python 100 例 · 分词:Python 正向最大匹配分词和逆向最大匹配分词的实例

正向最大匹配

# -*- coding:utf-8 -*-

# Name of a text encoding, kept as a module-level constant.
# NOTE(review): main() in this listing passes the literal "utf-8" instead of
# referencing this constant -- confirm whether it is meant to be used.
CODEC="utf-8"

def u(s, encoding):
    """Return *s* as a text (unicode) string.

    Args:
        s: A text string, or a byte string to be decoded.
        encoding: Name of the codec used to decode *s* when it is not
            already text (for example ``"utf-8"``).

    Returns:
        *s* itself when it is already text, otherwise *s* decoded with
        *encoding*.
    """
    # ``unicode`` only exists on Python 2; on Python 3 the text type is ``str``.
    try:
        text_type = unicode  # noqa: F821 -- Python 2 builtin
    except NameError:
        text_type = str

    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)

def fwd_mm_seg(wordDict, maxLen, str):
    """Segment text with forward maximum matching.

    Scanning from the left end of the text, the longest prefix of at most
    ``maxLen`` characters that is present in ``wordDict`` becomes the next
    segment.  Only candidates of length >= 2 are looked up; when none of them
    matches, the single leading character is emitted so the scan always
    advances.

    A progress trace (dictionary contents, every candidate tried, and the
    final segments) is printed to stdout.

    Args:
        wordDict: Container of known words; it is iterated once for the trace
            and otherwise only used for membership tests.
        maxLen: Maximum candidate length in characters; must be >= 1.
        str: Text to segment.  (The name shadows the builtin but is kept so
            existing keyword callers continue to work.)

    Returns:
        The list of segments in text order; joining them reproduces the input.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1.  Such a value would never
            consume any input and the loop below would not terminate.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be >= 1, got %r" % (maxLen,))

    wordList = []
    segStr = str

    # Trace: dump the dictionary before segmenting.
    for word in wordDict:
        print("word:  %s" % (word,))
    print(" ")

    while segStr:
        # Start with the longest allowed prefix and shrink from the right.
        wordLen = min(maxLen, len(segStr))
        subStr = segStr[:wordLen]
        print("subStr:  %s" % (subStr,))

        while wordLen > 1:
            if subStr in wordDict:
                print("subStr1: %r" % (subStr,))
                break
            print("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[:wordLen]

        # Either a dictionary hit or a single fallback character.
        wordList.append(subStr)
        segStr = segStr[wordLen:]

    for wordstr in wordList:
        print("wordstr:  %s" % (wordstr,))

    return wordList

def main():
    """Segment a sample sentence with the words listed in ``words.dic``.

    ``words.dic`` is read from the current directory as UTF-8 text, one word
    per line.  The sample sentence and the resulting segments (joined with
    ``==``) are printed to stdout.
    """
    # Local import keeps this block self-contained; io.open decodes to text
    # identically on Python 2 and Python 3.
    import io

    wordDict = {}
    # The context manager guarantees the dictionary file is closed.
    with io.open("words.dic", encoding="utf-8") as fp_dict:
        for eachWord in fp_dict:
            word = eachWord.strip()
            if word:  # ignore blank lines instead of storing an empty word
                wordDict[word] = 1

    segStr = u"你好世界hello world"
    print(segStr)

    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == "__main__":
    main()

逆向最大匹配

# -*- coding:utf-8 -*-

def u(s, encoding):
    """Return *s* as a text (unicode) string.

    Args:
        s: A text string, or a byte string to be decoded.
        encoding: Name of the codec used to decode *s* when it is not
            already text (for example ``"utf-8"``).

    Returns:
        *s* itself when it is already text, otherwise *s* decoded with
        *encoding*.
    """
    # ``unicode`` only exists on Python 2; on Python 3 the text type is ``str``.
    try:
        text_type = unicode  # noqa: F821 -- Python 2 builtin
    except NameError:
        text_type = str

    if isinstance(s, text_type):
        return s
    return text_type(s, encoding)

# Name of a text encoding, kept as a module-level constant.
# NOTE(review): main() in this listing passes the literal "utf-8" instead of
# referencing this constant -- confirm whether it is meant to be used.
CODEC="utf-8"

def bwd_mm_seg(wordDict, maxLen, str):
    """Segment text with backward (reverse) maximum matching.

    Scanning from the right end of the text, the longest suffix of at most
    ``maxLen`` characters that is present in ``wordDict`` becomes the next
    segment.  Only candidates of length >= 2 are looked up; when none of them
    matches, the single trailing character is emitted so the scan always
    advances.  Segments are collected right-to-left and reversed before being
    returned.

    A progress trace (dictionary contents, every candidate tried, and the
    final segments) is printed to stdout.

    Args:
        wordDict: Container of known words; it is iterated once for the trace
            and otherwise only used for membership tests.
        maxLen: Maximum candidate length in characters; must be >= 1.
        str: Text to segment.  (The name shadows the builtin but is kept so
            existing keyword callers continue to work.)

    Returns:
        The list of segments in text order; joining them reproduces the input.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1.  Such a value would never
            consume any input and the loop below would not terminate.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be >= 1, got %r" % (maxLen,))

    wordList = []
    segStr = str

    # Trace: dump the dictionary before segmenting.
    for word in wordDict:
        print("word:  %s" % (word,))
    print(" ")

    while segStr:
        # Start with the longest allowed suffix and shrink from the left.
        wordLen = min(maxLen, len(segStr))
        subStr = segStr[-wordLen:]
        print("subStr:  %s" % (subStr,))

        while wordLen > 1:
            if subStr in wordDict:
                print("subStr1: %r" % (subStr,))
                break
            print("subStr2: %r" % (subStr,))
            wordLen -= 1
            subStr = subStr[-wordLen:]

        # Either a dictionary hit or a single fallback character.
        wordList.append(subStr)
        segStr = segStr[:-wordLen]

    # Segments were gathered from the end of the text; restore reading order.
    wordList.reverse()

    for wordstr in wordList:
        print("wordstr:  %s" % (wordstr,))

    return wordList

def main():
    """Segment a sample sentence with the words listed in ``words.dic``.

    ``words.dic`` is read from the current directory as UTF-8 text, one word
    per line.  The sample sentence and the resulting segments (joined with
    ``==``) are printed to stdout.
    """
    # Local import keeps this block self-contained; io.open decodes to text
    # identically on Python 2 and Python 3.
    import io

    wordDict = {}
    # The context manager guarantees the dictionary file is closed.
    with io.open("words.dic", encoding="utf-8") as fp_dict:
        for eachWord in fp_dict:
            word = eachWord.strip()
            if word:  # ignore blank lines instead of storing an empty word
                wordDict[word] = 1

    # Plain u"" literal: the original ur"" prefix is a syntax error on
    # Python 3 and the text contains no backslashes, so the value is the same.
    segStr = u"你好世界hello world"
    print(segStr)

    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == "__main__":
    main()

以上这篇《Python 正向最大匹配分词和逆向最大匹配分词的实例》就是小编分享给大家的全部内容了，希望能给大家一个参考，也希望大家多多支持脚本之家。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值