FMM(正向最大匹配)算法的基本思想是贪心:从当前位置向前取n个字,如果这n个字组成的词在词典中出现,就切分出这个词;如果没有出现,就改取n-1个字再查,依此类推。假如这n个字组成的词在词典中出现,就从第n+1个字的位置继续向后查找,直到句子结束。
代码片段(3)
[代码][Python]代码
01 | import re |
def PreProcess(sentence, edcode="utf-8"):
    """Decode *sentence* and blank out punctuation.

    Args:
        sentence: Text to clean. A byte string is decoded with ``edcode``;
            text that is already decoded is used unchanged.
        edcode: Codec used to decode byte-string input.

    Returns:
        The decoded text in which every punctuation mark listed in the
        pattern below (Chinese and ASCII) is replaced by a single space.
    """
    # Decode only real byte strings: calling .decode() on text that is
    # already decoded fails (implicit ASCII round-trip on Python 2, no such
    # method on Python 3).
    if isinstance(sentence, bytes):
        sentence = sentence.decode(edcode)
    return re.sub(u"[。,,!……!《》<>\"'::?\\?、\\|“”‘’;]", " ", sentence)
06 | |
def FMM(sentence, diction, result=None, maxwordLength=4, edcode="utf-8"):
    """Segment *sentence* with greedy forward maximum matching.

    The sentence is first cleaned by ``PreProcess``. Scanning left to right,
    a run of ASCII word characters at the scan position is emitted whole
    (lower-cased, without a dictionary lookup). Otherwise the longest
    candidate of at most ``maxwordLength`` characters that ``LookUp`` finds
    in ``diction`` is emitted; if none matches, the single character at the
    scan position is emitted unless it is a space.

    Args:
        sentence: Input passed to ``PreProcess`` together with ``edcode``.
        diction: Dictionary handed to ``LookUp``. Candidates are looked up
            as byte strings encoded with ``edcode``.
        result: Optional list to append tokens to. A new list is created
            when omitted, so separate calls never share tokens.
        maxwordLength: Longest candidate word, in characters. Values below
            1 are treated as 1 so that the scan always advances.
        edcode: Codec used to encode the emitted tokens.

    Returns:
        ``result`` with the tokens appended, each a byte string encoded
        with ``edcode``.
    """
    if result is None:
        result = []
    ascii_char = re.compile(r"[0-9A-Za-z\-\+#@_\.]{1}")
    # A window shorter than one character could never consume input.
    max_window = max(1, maxwordLength)

    sentence = PreProcess(sentence, edcode)
    length = len(sentence)
    i = 0
    while i < length:
        # Consume a maximal run of ASCII word characters as one token.
        start = i
        while ascii_char.search(sentence[i:i + 1]) is not None:
            i += 1
        if i > start:
            result.append(sentence[start:i].lower().encode(edcode))

        left = length - i
        if left == 0:
            break
        if left == 1:
            # Final character: keep it unless it is blank, then finish.
            if sentence[i:] != " ":
                result.append(sentence[i:].encode(edcode))
            return result

        # Try the longest candidate first and shrink one character at a time.
        for j in range(min(left, max_window), 0, -1):
            leftword = sentence[i:i + j].encode(edcode)
            if LookUp(leftword, diction):
                result.append(leftword)
                i += j
                break
            if j == 1:
                # Nothing matched: emit the single character unless blank.
                if sentence[i] != " ":
                    result.append(leftword)
                i += 1
    return result
def LookUp(word, dictionary):
    """Return True if *word* is contained in *dictionary*, else False.

    Uses the ``in`` operator rather than ``dict.has_key`` (which no longer
    exists on Python 3), so any container that supports membership tests
    is accepted, not only dicts.
    """
    return word in dictionary
def ConvertGBKtoUTF(sentence):
    """Return *sentence* as a UTF-8 encoded byte string.

    Args:
        sentence: A GBK-encoded byte string, or text that is already
            decoded.

    Returns:
        The same text encoded as UTF-8 bytes.
    """
    # Only byte strings need the GBK decoding step; decoded text is simply
    # encoded, instead of failing on .decode().
    if isinstance(sentence, bytes):
        sentence = sentence.decode("gbk")
    return sentence.encode("utf-8")
[代码][Python]代码
# Demo: build a small dictionary and segment one mixed Chinese/ASCII sentence.
# NOTE: the Chinese literals go through ConvertGBKtoUTF, so when they are
# byte strings (Python 2) this source file must be saved as GBK.
dictions = {
    "ab": 1,
    "cd": 2,
    "abc": 1,
    "ss": 1,
    ConvertGBKtoUTF("好的"): 1,
    ConvertGBKtoUTF("真的"): 1,
}
sentence = "asdfa好的是这样吗vasdiw呀真的daf dasfiw asid是吗?"
s = FMM(ConvertGBKtoUTF(sentence), dictions)
for word in s:
    # Tokens come back UTF-8 encoded; decode them for display.
    print(word.decode("utf-8"))
[代码][Python]代码
# Segment every line of test.txt with the module-level ``dictions`` and
# print one token per output line.
with open("test.txt", "r") as test:
    for line in test:
        # Drop the line terminator so it is not segmented as a token, and
        # pass a fresh list so tokens from earlier lines are not repeated.
        s = FMM(ConvertGBKtoUTF(line.rstrip("\r\n")), dictions, [])
        for word in s:
            print(word.decode("utf-8"))