FMM(Forward Maximum Matching,正向最大匹配)算法最简单的思想是使用贪心策略:从当前位置向前取n个字,如果这n个字组成的词在词典中出现,就将其切分出来;如果没有出现,就改取n-1个字……依此类推。假如这n个字组成的词在词典中出现,就从第n+1个位置继续找下去,直到句子结束。
代码片段(3)
[代码] [Python]代码
01 | import re |
# Punctuation that PreProcess turns into a space: CJK marks plus the
# full-width and ASCII forms of , ! : ? ; and < > " ' |.  Written with \u
# escapes so the pattern does not depend on the encoding of this source file.
_PUNCTUATION_RE = re.compile(
    u"[\u3002\uff0c,\uff01!\u2026\u300a\u300b<>\"'\uff1a:\uff1f?\u3001|"
    u"\u201c\u201d\u2018\u2019\uff1b;]"
)


def PreProcess(sentence, edcode="utf-8"):
    """Decode *sentence* and replace every punctuation mark with a space.

    Args:
        sentence: Byte string encoded with ``edcode``.  Text that is already
            decoded is accepted as well and used as is.
        edcode: Name of the encoding used to decode ``sentence``.

    Returns:
        The decoded text, same length as the decoded input, with each
        punctuation character replaced by a single space.
    """
    if isinstance(sentence, bytes):
        sentence = sentence.decode(edcode)
    return _PUNCTUATION_RE.sub(u" ", sentence)
06 | |
# A run of ASCII "word" characters: letters, digits and - + # @ _ .
_ASCII_RUN_RE = re.compile(r"[0-9A-Za-z\-+#@_.]+")


def FMM(sentence, diction, result=None, maxwordLength=4, edcode="utf-8"):
    """Segment *sentence* with forward maximum matching (FMM).

    The sentence is first passed through ``PreProcess``.  It is then scanned
    left to right: a run of ASCII word characters is emitted whole in lower
    case; anywhere else the longest window (at most ``maxwordLength``
    characters) that ``LookUp`` finds in ``diction`` is emitted, falling back
    to a single character when no window matches.  Blank characters are
    skipped rather than emitted.

    Args:
        sentence: The sentence to segment, encoded with ``edcode``.
        diction: Dictionary whose keys are words encoded with ``edcode``.
        result: Optional list to append the tokens to.  A new list is created
            when omitted.
        maxwordLength: Longest dictionary word to try, in characters.  Values
            below 1 are treated as 1.
        edcode: Encoding of ``sentence``, of the dictionary keys and of the
            returned tokens.

    Returns:
        ``result`` (or the new list) with the tokens appended, each encoded
        with ``edcode``.
    """
    if result is None:
        # Create the list per call: a ``[]`` default is evaluated only once
        # and would accumulate tokens from every earlier call.
        result = []
    sentence = PreProcess(sentence, edcode)
    length = len(sentence)
    i = 0
    while i < length:
        # Step 1: an ASCII run starting here is one token, lower-cased.
        ascii_run = _ASCII_RUN_RE.match(sentence, i)
        if ascii_run:
            result.append(ascii_run.group().lower().encode(edcode))
            i = ascii_run.end()

        left = length - i
        if left == 0:
            break
        if left == 1:
            # Last character of the sentence: emit it unless it is a blank.
            if sentence[i] != u" ":
                result.append(sentence[i:].encode(edcode))
            return result

        # Step 2: try the longest window first and shrink until a hit.
        # max(1, ...) keeps the scan advancing even if maxwordLength < 1.
        for j in range(max(1, min(left, maxwordLength)), 0, -1):
            leftword = sentence[i:i + j].encode(edcode)
            if LookUp(leftword, diction):
                result.append(leftword)
                i += j
                break
            if j == 1:
                # No dictionary word starts here: emit the single character
                # (unless it is a blank) and move on by one.
                if sentence[i] != u" ":
                    result.append(leftword)
                i += 1
    return result
def LookUp(word, dictionary):
    """Return True if *word* is a key of *dictionary*, otherwise False.

    Args:
        word: The candidate word.
        dictionary: Mapping (or any container supporting ``in``) of known
            words.
    """
    # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
    return word in dictionary
def ConvertGBKtoUTF(sentence):
    """Re-encode a GBK byte string as UTF-8.

    Args:
        sentence: Byte string encoded as GBK.

    Returns:
        The same text as a UTF-8 encoded byte string.

    Raises:
        UnicodeDecodeError: If ``sentence`` is not valid GBK.
    """
    text = sentence.decode("gbk")
    return text.encode("utf-8")
[代码] [Python]代码
# Demo: build a small dictionary and segment one mixed ASCII/Chinese sentence.
# NOTE(review): the string literals below are passed to ConvertGBKtoUTF, so
# they are presumably GBK byte strings (Python 2, source file saved as GBK
# with a matching coding declaration) -- confirm.
dictions = {}
dictions["ab"] = 1
dictions["cd"] = 2
dictions["abc"] = 1
dictions["ss"] = 1
dictions[ConvertGBKtoUTF("好的")] = 1
dictions[ConvertGBKtoUTF("真的")] = 1
sentence = "asdfa好的是这样吗vasdiw呀真的daf dasfiw asid是吗?"
s = FMM(ConvertGBKtoUTF(sentence), dictions)
for i in s:
    # Tokens come back UTF-8 encoded; decode them for display.
    print(i.decode("utf-8"))
[代码] [Python]代码
# Segment every line of test.txt (read as GBK) and print one token per line.
# The ``with`` block closes the file even if segmentation raises.
with open("test.txt", "r") as test:
    for line in test:
        # Give each line its own result list so tokens never carry over
        # from one line to the next.
        s = FMM(ConvertGBKtoUTF(line), dictions, [])
        for i in s:
            print(i.decode("utf-8"))