参考文章
参考:分词算法;英文特定领域词组切分-借鉴中文的前向,后向,双向最大匹配;
算法原理
参考:中文分词引擎 java 实现 — 正向最大、逆向最大、双向最大匹配法;
可以将英文按照空格切分后,把每个词当成中文中的一个字,借助构建好的词典实现最大匹配分词。
字典准备(一小部分国家词汇和安全组织词汇)
United Kingdom
United States
United States Minor Outlying Islands
Uruguay
Yemen
Zambia
Zimbabwe
Behaviosec
Berkeley Varitronics Systems
Beyond Identity
BeyondTrust
BigID
i-Sprint Innovations
iboss
lastline
neXus
root9B
vArmour
wolfSSL
1. 前向最大匹配
实现:
class ForwardMaxMatch:
    """Forward (left-to-right) maximum-matching segmenter for English phrases.

    Each whitespace-separated word is treated like a single Chinese character;
    at every position the longest dictionary phrase starting there is matched
    greedily, otherwise the single word is emitted.
    """

    def __init__(self, dic_path='dic.txt', max_len=5):
        # max_len: longest phrase (in words) the matcher will try.
        self.max_len = max_len
        self.dic_file = dic_path
        self.dic = self.load_dic()
        self.tokens = []

    def load_dic(self):
        """Load the phrase dictionary (one phrase per line).

        Returns a set so the `in` test inside the matching loop is O(1)
        instead of O(len(dic)).
        """
        with open(self.dic_file, 'r', encoding='utf-8') as f:
            return {line.strip() for line in f}

    def segment(self, word_obj):
        """Segment a sentence (str) or a pre-split word list.

        Returns a list of tokens; multi-word dictionary phrases stay as one
        space-joined token. Unsupported input types yield an empty list.
        """
        if isinstance(word_obj, str):
            # Pad punctuation with spaces so each mark becomes its own token.
            character_map = {".": ' . ',
                            ",": ' , ',
                            "?": ' ? '}
            for origin, new in character_map.items():
                word_obj = word_obj.replace(origin, new)
            # Case matters for named entities, so no lower-casing here.
            self.words_list = word_obj.split()
        elif isinstance(word_obj, list):
            self.words_list = word_obj
        else:
            # Fix: the original fell through and reused stale/undefined
            # self.words_list; return an empty result instead.
            print("Not support object for segmentation!")
            return []
        i = 0
        n = len(self.words_list)
        tokens = []
        while i < n:
            matched = []
            # Try the longest window starting at i first, shrinking by one
            # word each time until a dictionary phrase is found.
            for end in range(min(i + self.max_len, n), i, -1):
                grams = self.words_list[i:end]
                if ' '.join(grams) in self.dic:
                    matched = grams
                    break
            if matched:
                tokens.append(' '.join(matched))
                i += len(matched)
            else:
                # No phrase starts here; emit the single word.
                tokens.append(self.words_list[i])
                i += 1
        self.tokens = tokens
        return tokens

    def __call__(self, word_obj):
        return self.segment(word_obj)

    def __repr__(self):
        # Fix: __repr__ must return a str; the original returned a list,
        # which raises TypeError whenever repr() is invoked.
        return repr(self.tokens)
if __name__ == '__main__':
    # Demo: a question containing several multi-word dictionary entries.
    question = ('Is the headquarters of the Berkeley Varitronics Systems '
                'organization in the United States or the United Kingdom?')
    tokenizer = ForwardMaxMatch()
    print(tokenizer.segment(question))
'''
['Is', 'the', 'headquarters', 'of', 'the', 'Berkeley Varitronics Systems', 'organization', 'in', 'the', 'United States', 'or', 'the', 'United Kingdom', '?']
'''
2. 后向最大匹配
实现:
class BackwardMaxMatch:
    """Backward (right-to-left) maximum-matching segmenter for English phrases.

    Mirrors ForwardMaxMatch, but matching starts from the end of the sentence:
    at every position the longest dictionary phrase ENDING there is matched
    greedily, otherwise the single word is emitted.

    Bug fixed versus the original: the old implementation used a negative
    *exclusive* end index starting at -1, so the last word of the input was
    never emitted; a special-case hack re-appended it only when it was '.',
    silently dropping a trailing '?', ',' or any other final token.
    """

    def __init__(self, dic='dic.txt', max_len=5):
        # max_len: longest phrase (in words) the matcher will try.
        self.max_len = max_len
        self.dic_file = dic
        self.dic = self.load_dic()
        self.tokens = []

    def load_dic(self):
        """Load the phrase dictionary (one phrase per line) as a set for
        O(1) membership tests."""
        with open(self.dic_file, 'r', encoding='utf-8') as f:
            return {line.strip() for line in f}

    def segment(self, word_obj):
        """Segment a sentence (str) or a pre-split word list.

        Returns a list of tokens in reading order; multi-word dictionary
        phrases stay as one space-joined token. Unsupported input types
        yield an empty list.
        """
        if isinstance(word_obj, str):
            # Pad punctuation with spaces so each mark becomes its own token.
            character_map = {".": ' . ',
                            ",": ' , ',
                            "?": ' ? '}
            for origin, new in character_map.items():
                word_obj = word_obj.replace(origin, new)
            self.words_list = word_obj.split()
        elif isinstance(word_obj, list):
            self.words_list = word_obj
        else:
            # Fix: the original fell through and reused stale/undefined
            # self.words_list; return an empty result instead.
            print("Not support object for segmentation!")
            return []
        tokens = []
        end = len(self.words_list)  # exclusive end of the still-unmatched prefix
        while end > 0:
            matched = []
            # Try the longest window ending at `end` first (smallest start).
            for start in range(max(0, end - self.max_len), end):
                grams = self.words_list[start:end]
                if ' '.join(grams) in self.dic:
                    matched = grams
                    break
            if matched:
                tokens.append(' '.join(matched))
                end -= len(matched)
            else:
                # No phrase ends here; emit the single last word.
                tokens.append(self.words_list[end - 1])
                end -= 1
        # Tokens were collected right-to-left; restore reading order.
        self.tokens = list(reversed(tokens))
        return self.tokens

    def __call__(self, word_obj):
        return self.segment(word_obj)

    def __repr__(self):
        # Fix: __repr__ must return a str; the original returned a list,
        # which raises TypeError whenever repr() is invoked.
        return repr(self.tokens)
3 双向最大匹配
将前向最大匹配和后向最大匹配算法进行比较,从而确定正确的分词方法。
算法流程:
- 比较正向最大匹配和逆向最大匹配结果
- 如果分词数量结果不同,那么取分词数量较少的那个
- 如果分词数量结果相同
- 分词结果相同,可以返回任何一个
- 分词结果不同,返回单字数比较少的那个
实现
def BidirectMaxMatch(string):
    """Segment with both directions and return the better result.

    Preference rules:
      - different token counts: the result with fewer tokens wins;
      - same count and identical content: return it;
      - same count, different content: the result with fewer single-word
        tokens wins (forward result on a further tie).
    """
    backward_tokens = BackwardMaxMatch().segment(string)
    forward_tokens = ForwardMaxMatch().segment(string)

    if len(backward_tokens) != len(forward_tokens):
        # Fewer tokens means better phrase coverage.
        return min(backward_tokens, forward_tokens, key=len)

    if backward_tokens == forward_tokens:
        return backward_tokens

    def single_word_count(tokens):
        # Number of tokens that are a single word (no internal space).
        return sum(1 for token in tokens if len(token.split(' ')) == 1)

    if single_word_count(backward_tokens) < single_word_count(forward_tokens):
        return backward_tokens
    return forward_tokens