python esmre库实现word查找
前言:
在文本中匹配特定的字符串,一般可以使用普通的字符串匹配算法,例如KMP算法;
Python中提供了一个库esmre:预先将待匹配的关键词存入esm.Index对象,再用这些关键词在候选字符串中进行匹配,返回每个匹配的位置,并支持同一个词语的多次匹配,效率比正则表达式更高。
安装:
pip install esmre -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/
import esm
import re

# Build the keyword index: register every keyword with enter(), then call
# fix() once all keywords have been added and before running any query.
index = esm.Index()
index.enter("he")
index.enter("she")
index.enter("his")
index.enter("hers")
index.fix()

query1 = "this here is history"
query2 = "Those are his sheep!"

# Multi-keyword matching with esmre. As the recorded output below shows,
# query() returns a list of ((start, end), keyword) entries, one per hit,
# including repeated and overlapping hits.
out1 = index.query(query1)
out2 = index.query(query2)
print('out1=', out1, '\nstr1=', query1[out1[0][0][0]:out1[0][0][1]])
# The offsets in out2 were computed on query2, so query2 must be sliced here
# (slicing query1 with them printed an unrelated substring).
print('out2=', out2, '\nstr2=', query2[out2[0][0][0]:out2[0][0][1]])

# The same lookup with a regular expression: re.search() reports only the
# first match, or None when nothing matches.
out3 = re.search(r"he|she|his|hers", query1)
print('out3=',out3)
out4 = re.search(r"xxx|yyy", query2)
print('out4=',out4)
'''
out1= [((1, 4), 'his'), ((5, 7), 'he'), ((13, 16), 'his')]
str1= his
out2= [((10, 13), 'his'), ((14, 17), 'she'), ((15, 17), 'he')]
str2= his
out3= <re.Match object; span=(1, 4), match='his'>
out4= None
'''
从Query中搜索预先定义好的关键词,返回命中的关键词及其对应的标准词和标签
import os
import sys
import esm
import json
import random
class AcMatchWord(object):
    """Keyword matcher backed by an ``esm.Index`` loaded from vocabulary files.

    Vocabulary file format, as parsed by ``build_ac``:

    * a line starting with ``[D:`` sets the label applied to the entries
      that follow it (the whole line is used as the label);
    * ``term<TAB>std_term`` registers ``term`` with its standard form;
    * a single-field line registers a term that is its own standard form;
    * blank lines, entries that appear before any label line, and lines with
      more than two tab-separated fields are ignored.
    """

    def __init__(self, vocab_path_list):
        """Create the index and load every vocabulary file into it.

        Args:
            vocab_path_list: iterable of paths to UTF-8 vocabulary files.
        """
        self.automation = esm.Index()
        self.build_ac(vocab_path_list)

    def build_ac(self, vocab_path_list):
        """Load all vocabulary files into the index, then finalize it.

        Each accepted entry is stored under ``term`` with the payload
        ``(term, std_term, label)``.

        Args:
            vocab_path_list: iterable of paths to UTF-8 vocabulary files.
        """
        for vocab_path in vocab_path_list:
            # The label does not carry over from one file to the next.
            label = ""
            with open(vocab_path, "r", encoding="utf8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith("[D:"):
                        label = line
                        continue
                    if not label:
                        # Entries that precede any label line are dropped.
                        continue
                    fields = line.split("\t")
                    if len(fields) == 2:
                        term, std_term = fields
                    elif len(fields) == 1:
                        term = std_term = fields[0]
                    else:
                        # Malformed line: skip it rather than re-registering
                        # the values left over from the previous line.
                        continue
                    self.automation.enter(term, (term, std_term, label))
        # Finalize once, after every file has been loaded.
        self.automation.fix()

    def search(self, query):
        """Return the distinct payloads of all keywords found in ``query``.

        Args:
            query: text to scan.

        Returns:
            A set of ``(term, std_term, label)`` tuples, one per distinct
            keyword hit; repeated hits of the same entry collapse into one.
        """
        return {payload for _span, payload in self.automation.query(query)}
参考:
1.敏感词匹配——python使用esmre实现ac自动机[多模匹配]
2.esmre 1.0.1
3.python ac模块_python使用esmre代替ahocorasick实现ac自动机[多模匹配]