关于中文分词,网上的资料很多,大家可以自己去了解,今天这里只关注代码怎么写。
中文分词主要可以归纳为“规则分词”、“统计分词”以及“规则+统计”三个主要派别,今天主要了解“规则分词”中常见的正向、逆向和双向最大化匹配。这三种方法都是基于现有词典做的,所以得先准备一个中文词典,一行一个词。
一.正向最大化匹配
描述:
1.找到词典中最长的词,记下长度L
2.从“左向右”取长度为L的字符串,查找词典进行匹配,若匹配成功,则将这个词切分出来。若匹配失败,将这个字符串的最后一个字符去掉,将剩下的串作为新的匹配串进行匹配。如此重复下去,直到切完。
二.逆向最大化匹配
描述:
1.找到词典中最长的词,记下长度L
2.从“右向左”取长度为L的字符串,查找词典进行匹配,若匹配成功,则将这个词切分出来。若匹配失败,将这个字符串的最前面一个字符去掉,将剩下的串作为新的匹配串进行匹配。如此重复下去,直到切完。
三.双向最大化匹配
描述:
1.分别做正向和逆向最大化匹配,比较两者的结果:优先取切分词数较少的作为结果;若词数相同,则取单字较少的结果。
四.代码采用python
1.load 词典
1 #!/usr/bin/env python
2 # -\*- coding:utf-8 -\*-
3
# Path to the dictionary file (one entry per line).
dictPath = '../resource/dict.txt'


def loadDict(path=None):
    """Load the word dictionary used by the rule-based segmenters.

    Each non-blank line is split on single spaces. The first field is the
    word; the third field is stored as the word's value (presumably its
    part-of-speech tag -- confirm against the dictionary file).

    Args:
        path: Dictionary file to read. When omitted, the module-level
            ``dictPath`` is used.

    Returns:
        A ``(dictionary, maximum)`` tuple. ``dictionary`` maps each word to
        the third field of its line, or ``None`` when the line has fewer
        than three fields. ``maximum`` is the length of the longest word.
    """
    if path is None:
        path = dictPath
    print('load dict...')
    dictionary = {}
    maximum = 0
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split(' ')
            word = fields[0]
            # Tolerate short lines instead of failing with IndexError.
            dictionary[word] = fields[2] if len(fields) > 2 else None
            # Measure the word itself, not the whole line: the extra fields
            # would otherwise inflate the matching window.
            maximum = max(maximum, len(word))
    return dictionary, maximum
View Code
2.核心方法
1 #!/usr/bin/env python
2 # -\*- coding:utf-8 -\*-
3
4 from word\_segmentation.regulation.ReverseMaximumMatchMethod import RMM 5 from word\_segmentation.regulation.MaximumMatchMethod import MM 6 from word\_segmentation.regulation.BiDirectctionMatchMethod import BDMM 7 from word\_segmentation.util.LoadDict import loadDict 8
class RegulationMatch(object):
    """Entry point that loads the dictionary once and dispatches to a segmenter."""

    def __init__(self):
        # loadDict() returns the word dictionary and the longest word length.
        self.dictionary, self.maximum = loadDict()

    def cut(self, text, method):
        """Segment ``text`` with the requested matching strategy.

        Args:
            text: The string to segment.
            method: ``'MM'`` (forward), ``'RMM'`` (reverse) or ``'BDMM'``
                (bidirectional).

        Returns:
            Whatever the selected segmenter's ``cut`` returns.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        # Fail loudly on a misspelled method instead of returning None.
        if method not in ('MM', 'RMM', 'BDMM'):
            raise ValueError(
                "unknown method %r; expected 'MM', 'RMM' or 'BDMM'" % (method,))
        if method == 'RMM':
            return RMM.cut(text, self.dictionary, self.maximum)
        if method == 'MM':
            return MM.cut(text, self.dictionary, self.maximum)
        return BDMM.cut(text, self.dictionary, self.maximum)
View Code
1 #!/usr/bin/env python
2 # -\*- coding:utf-8 -\*-
3
"""Word and part of speech."""


class Word(object):
    """Pairs a segmented token with the property stored for it in the dictionary."""

    def __init__(self, token, property):
        # Both values are kept private and exposed through the getters below.
        self.__token, self.__property = token, property

    def getToken(self):
        """Return the token text."""
        return self.__token

    def getProperty(self):
        """Return the token's property (its part of speech)."""
        return self.__property
View Code
1 #!/usr/bin/env python
2 # -\*- coding:utf-8 -\*-
3
4 from word\_segmentation.regulation.Word import Word 5
class MM(object):
    """Forward maximum matching (Maximum Match Method)."""

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` left to right, taking the longest dictionary word.

        Args:
            text: The string to segment.
            dictionary: Mapping from word to its property.
            maximum: Length of the longest candidate to try at each position.

        Returns:
            A list of ``Word`` objects in text order. Characters that start
            no dictionary word are skipped and produce no entry.
        """
        result = []
        textLength = len(text)
        start = 0
        while start < textLength:
            # Never try a candidate that runs past the end of the text.
            longest = min(maximum, textLength - start)
            for size in range(longest, 0, -1):
                piece = text[start:start + size]
                if piece in dictionary:
                    result.append(Word(piece, dictionary.get(piece)))
                    start += size
                    break
            else:
                # No word starts here: step over this one character so the
                # rest of the text is still segmented.
                start += 1
        return result
View Code
1 #!/usr/bin/env python
2 # -\*- coding:utf-8 -\*-
3
4 from word\_segmentation.regulation.Word import Word 5
class RMM(object):
    """Reverse maximum matching (Reverse Maximum Match Method)."""

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` right to left, taking the longest dictionary word.

        Args:
            text: The string to segment.
            dictionary: Mapping from word to its property.
            maximum: Length of the longest candidate to try at each position.

        Returns:
            A list of ``Word`` objects in text order. Characters that end
            no dictionary word are skipped and produce no entry.
        """
        tokens = []
        end = len(text)  # exclusive right edge of the unprocessed prefix
        while end > 0:
            for width in range(min(maximum, end), 0, -1):
                candidate = text[end - width:end]
                if candidate in dictionary:
                    tokens.append(Word(candidate, dictionary.get(candidate)))
                    end -= width
                    break
            else:
                # Nothing matched at this edge: drop one character and retry.
                end -= 1
        # Tokens were collected from the tail; restore text order.
        tokens.reverse()
        return tokens
View Code
1 # -\*- coding:utf-8 -\*-
2
3 from word\_segmentation.regulation.MaximumMatchMethod import MM 4 from word\_segmentation.regulation.ReverseMaximumMatchMethod import RMM 5
class BDMM(object):
    """Bidirectional maximum matching.

    Runs forward and reverse maximum matching and picks one result:

    1. If the token counts differ, the result with fewer tokens wins.
    2. If the token counts are equal:
       a. identical segmentations: the forward result is returned;
       b. different segmentations: the one with fewer single-character
          tokens wins;
       c. still tied: the reverse result is returned.
    """

    def __init__(self):
        pass

    @staticmethod
    def cut(text, dictionary, maximum):
        """Segment ``text`` with both directions and return the preferred result.

        Args:
            text: The string to segment.
            dictionary: Mapping from word to its property.
            maximum: Length of the longest candidate to try at each position.

        Returns:
            The list returned by ``MM.cut`` or by ``RMM.cut``, chosen by the
            rules in the class docstring.
        """
        mmResult = MM.cut(text, dictionary, maximum)
        rmmResult = RMM.cut(text, dictionary, maximum)

        if len(mmResult) != len(rmmResult):
            return mmResult if len(mmResult) < len(rmmResult) else rmmResult

        mmSegment = [word.getToken() for word in mmResult]
        rmmSegment = [word.getToken() for word in rmmResult]

        # Compare position by position: a membership test would treat
        # different segmentations built from the same tokens as identical.
        if mmSegment == rmmSegment:
            return mmResult

        mmSingleWords = sum(1 for token in mmSegment if len(token) == 1)
        rmmSingleWords = sum(1 for token in rmmSegment if len(token) == 1)
        # Ties go to the reverse result.
        return mmResult if mmSingleWords < rmmSingleWords else rmmResult
View Code
1 #!/usr/bin/env python
2 # -\*- coding:utf-8 -\*-
3
4 from word\_segmentation.regulation.ReverseMaximumMatchMethod import RMM 5 import word\_segmentation.regulation.MaximumMatchMethod 6 import word\_segmentation.regulation.BiDirectctionMatchMethod 7 from word\_segmentation.regulation.RegulationMatchMthod import RegulationMatch 8
def test():
    """Placeholder; does nothing."""
    pass


if __name__ == '__main__':
    # Demo: segment one sentence with each strategy and print the tokens.
    text = '各国有各国的困难…'
    print('分词:')
    print('各国有各国的困难…')
    regulation = RegulationMatch()
    mmResult, rmmResult, bdmmResult = (
        regulation.cut(text, strategy) for strategy in ('MM', 'RMM', 'BDMM')
    )
    mmSegment = [item.getToken() for item in mmResult]
    rmmSegment = [item.getToken() for item in rmmResult]
    bdmmSegment = [item.getToken() for item in bdmmResult]

    print('正向匹配: %s' % mmSegment)
    print('逆向匹配: %s' % rmmSegment)
    print('双向匹配: %s' % bdmmSegment)
View Code