from flashtext import KeywordProcessor

# Basic usage with the default (case-insensitive) matcher.
keyword_processor = KeywordProcessor()
# Map the phrase 'Big Apple' to the clean name 'New York'.
keyword_processor.add_keyword('Big Apple', 'New York')
# No clean name is given here; the expected output below shows the keyword
# is then reported as itself.
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
print(keywords_found)  # ['New York', 'Bay Area']

## Case-sensitive matching
from flashtext import KeywordProcessor
# With case_sensitive=True the lowercase 'big Apple' in the sentence no longer
# matches the registered keyword 'Big Apple', so only 'Bay Area' is found.
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
print(keywords_found)  # ['Bay Area']
## 同时添加多个关键词
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()

# Mapping format: {'clean_name': ['list of unclean names']}
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
keyword_processor.add_keywords_from_dict(keyword_dict)

# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["java", "python"])

keyword_processor.extract_keywords(
    'I am a product manager for a java_2e platform'
)  # output ['product management', 'java']
## 删除关键字
from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management', 'java']

# Remove a single unclean name.
keyword_processor.remove_keyword('java_2e')
# you can also remove keywords from a list/ dictionary
keyword_processor.remove_keywords_from_dict({"product management": ["PM"]})
keyword_processor.remove_keywords_from_list(["java programing"])

# 'java_2e' is no longer registered, so only the product-management match remains.
keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output ['product management']
## 函数封装示例
from flashtext import KeywordProcessor
def build_actree(wordlist):
    """Build a keyword matcher loaded with every word in ``wordlist``.

    The original notes describe this as constructing the "AC trie" used for
    keyword matching.

    Args:
        wordlist: Iterable of keyword strings to register.

    Returns:
        The ``KeywordProcessor`` instance holding all registered keywords.
    """
    actree = KeywordProcessor()
    for word in wordlist:
        # Register the word in the matcher's trie.
        actree.add_keyword(word)
    return actree
def ac_detect(actree, text, span_info=True):
    """Return the keywords that ``actree`` finds in ``text``, in match order.

    Args:
        actree: Matcher exposing ``extract_keywords(text, span_info=...)``,
            e.g. the object returned by ``build_actree``.
        text: The string to scan.
        span_info: Forwarded to ``extract_keywords``. Either way this function
            returns only the keywords, never the span offsets.

    Returns:
        A list with one keyword per match; empty matches are skipped.
    """
    matches = actree.extract_keywords(text, span_info=span_info)
    if span_info:
        # With span info each match is a (keyword, start, end) tuple
        # (per the flashtext documentation); keep only the keyword.
        return [match[0] for match in matches if len(match) > 0]
    # Without span info each match is already the keyword itself. Indexing it
    # with [0] would keep only its first character, so return it whole.
    return [match for match in matches if len(match) > 0]
# Example: register two keywords and scan a sentence for them.
wordlist = ['健康', '减肥']
text = '今天你减肥了吗,今天你健康了吗,减肥 = 健康!'
actree = build_actree(wordlist)
print(ac_detect(actree, text))
# Output recorded in the original run (timings presumably from an
# interactive timing helper -- confirm):
#   CPU times: user 41 µs, sys: 0 ns, total: 41 µs
#   Wall time: 47.2 µs
#   ['减肥', '健康', '减肥', '健康']