- 自己写的分组 Trie 树(前缀树,代码中类名写作 Tire)匹配算法。该算法用于云南省人工智能重点实验室与云南电网的合作项目(云南电网敏感信息识别系统),用于快速匹配文本;现将该算法从项目中抽离出来,特此分享!
- 可以实现动态的插入、删除操作
class TireNode:
    """One node of the grouped trie.

    A node holds the outgoing edges to its children and the ids of the
    groups for which a word terminates at this node.
    """

    def __init__(self):
        # Ids of the groups that registered a word ending exactly here;
        # stays empty for nodes that are only a prefix of longer words.
        self.group_ids = set()
        # Maps the next character to the child node reached through it.
        self.children = dict()
class Tire:
    """Prefix tree (trie) in which every stored word is tagged with group ids.

    The same word may be registered under several group ids; the node at
    the end of the word keeps the set of ids that registered it.
    """

    def __init__(self):
        """Create an empty trie consisting of a single root node."""
        self.root = TireNode()

    def insert(self, word, group_id):
        """Register ``word`` under ``group_id``.

        Args:
            word: Iterable of characters (normally a ``str``) to store.
            group_id: Hashable id of the group the word belongs to.

        Returns:
            ``True`` if the (word, group) pair was newly added, ``False``
            if it was already present.
        """
        node = self.root
        for char in word:
            child = node.children.get(char)
            if child is None:
                child = TireNode()
                node.children[char] = child
            node = child
        if group_id in node.group_ids:
            return False
        node.group_ids.add(group_id)
        return True

    def search(self, word):
        """Look up ``word`` as an exact, complete entry.

        Args:
            word: The candidate word.

        Returns:
            A ``(group_ids, word)`` tuple. ``group_ids`` is the terminal
            node's own set (not a copy) when at least one group registered
            the word, otherwise ``None``.
        """
        node = self.root
        for char in word:
            node = node.children.get(char)
            if node is None:
                return None, word
        if node.group_ids:
            return node.group_ids, word
        return None, word

    def delete(self, group_id, word):
        """Remove ``word`` from group ``group_id``.

        Nodes that no longer lead to any stored word are pruned so that
        repeated insert/delete cycles do not leave dead branches behind.

        Args:
            group_id: Id of the group to remove the word from.
            word: The word to remove.

        Returns:
            ``True`` if the (word, group) pair existed and was removed,
            ``False`` otherwise.
        """
        node = self.root
        # (parent, char) for every edge walked, so we can prune bottom-up.
        path = []
        for char in word:
            child = node.children.get(char)
            if child is None:
                return False
            path.append((node, char))
            node = child
        if group_id not in node.group_ids:
            return False
        node.group_ids.remove(group_id)

        # Walk back towards the root, dropping nodes that have become
        # useless; stop at the first node still needed by another word.
        for parent, char in reversed(path):
            child = parent.children[char]
            if child.children or child.group_ids:
                break
            del parent.children[char]
        return True
class KeyWords(object):
    """Grouped keyword matcher built on top of :class:`Tire`.

    Keywords are organised in groups. A text matches when it contains
    every keyword of at least one group.
    """

    def __init__(self, keyword_groups=None):
        """Build the matcher and load the initial keyword groups.

        Args:
            keyword_groups: Optional iterable of keyword iterables; the
                position of each inner iterable is used as its group id.
                When omitted, the built-in default groups are loaded.
        """
        if keyword_groups is None:
            keyword_groups = [["电网信息", "电网"], []]
        self.tire = Tire()
        # group id -> number of distinct keywords currently in that group.
        self.tire_group_ids = {}
        self.gjc_lists = [list(keywords) for keywords in keyword_groups]
        for group_id, keywords in enumerate(self.gjc_lists):
            for keyword in keywords:
                # Reuse insert() so the per-group counter is maintained in
                # exactly one place.
                self.insert(group_id, keyword)

    def match(self, text):
        """Tell whether ``text`` contains all keywords of some group.

        Each keyword is counted once per group no matter how many times
        it occurs in ``text``.

        Args:
            text: The string to scan.

        Returns:
            ``True`` as soon as one group has all of its keywords found
            in ``text``; ``False`` if no group is fully covered.
        """
        found = {}  # group id -> distinct keywords of that group seen so far
        root = self.tire.root
        length = len(text)
        for start in range(length):
            node = root
            # Follow the trie from this start position; abandon the walk
            # as soon as no stored keyword continues with the next char.
            for end in range(start, length):
                node = node.children.get(text[end])
                if node is None:
                    break
                if not node.group_ids:
                    continue
                word = text[start:end + 1]
                for group_id in node.group_ids:
                    seen = found.setdefault(group_id, set())
                    seen.add(word)
                    # .get(): a group id with no counter entry (e.g. added
                    # to the trie directly) can never be satisfied.
                    if len(seen) == self.tire_group_ids.get(group_id):
                        return True
        return False

    def delete(self, group_id, word):
        """Remove ``word`` from group ``group_id``.

        Returns:
            ``True`` if the keyword was present in that group and has
            been removed, ``False`` otherwise.
        """
        success = self.tire.delete(group_id, word)
        if success and group_id in self.tire_group_ids:
            self.tire_group_ids[group_id] -= 1
        return success

    def insert(self, group_id, word):
        """Add ``word`` to group ``group_id``.

        Returns:
            ``True`` if the keyword was newly added to the group,
            ``False`` if the group already contained it.
        """
        success = self.tire.insert(word, group_id)
        if success:
            self.tire_group_ids[group_id] = self.tire_group_ids.get(group_id, 0) + 1
        return success