如何使用AC自动机算法,快速识别出某个字符串中的关键词
如一条长链接:www.dji.com/cn/huawei/1111-sale-7knbdw/1111-sale-7knbdw/1111-sale-7knbd,
需要识别出长链接中的品牌名称和所在行业。
表1 一列:长链接
A表 二列:品牌小写,品牌关键词
B表 一列:品牌大写
C表 五列:
brand_code(品牌小写),brand_name(品牌大写),
big_industry_name(行业1),small_industry_code(行业2),big_industry_code(行业3)
输出表:
长链接,品牌大写,品牌小写,行业1,行业2,行业3
data3.sql(提取出品牌所有的关键词)
select * from (select t1.brand_code ,t1.brand_keywords ,t2.brand_name from A表 t1 left join B表 t2 on t1.brand_code = t2.brand_code)a1 where brand_keywords is not null ;
'''
全库跑长链接,识别出长链接中的品牌
'''
import psycopg2
from psycopg2.extras import execute_batch
import psycopg2.extras
from config.load_config import get_connect_info
import ahocorasick
class Link_Postgres():
def __init__(self):
conf1, conf2 = get_connect_info()
self.conn1 = psycopg2.connect(
dbname=conf1["dbname"],
user=conf1["user"],
password=conf1["password"],
host=conf1["host"],
port=conf1["port"]
)
self.cur1 = self.conn1.cursor()
'''
1 运行SQL,查询brand_name,brand_code,brand_keywords 数据
2 查询表数据,拿出long_link
3 返回结果列表,key_list--brand_keywords,test_str_list--long_link,map_dict1--{brand_keywords,brand_code},map_dict2--{brand_keywords,brand_name}
'''
def match_brand1(self):
with open('D:\project\project01\sql\data3.sql', encoding='utf-8', mode='r') as f:
sql1 = f.read()
self.cur1.execute(sql1)
res = self.cur1.fetchall()
sql2 = f"select long_link from 表1 where long_link is not null and date_time < '20230531'"
self.cur1.execute(sql2)
res2 = self.cur1.fetchall()
self.map_dic1 = {}
for i in res:
for j in i[1].split(','):
self.map_dic1[j] = i[0]
# key 是关键词,values brand_code
self.map_dic2 = {}
for i in res:
for j in i[1].split(','):
self.map_dic2[j] = i[2]
# key 是关键词,values brand_name
self.key_list = list(self.map_dic1.keys())
self.test_str_list = []
for i in res2:
self.test_str_list.append(i[0])
return self.key_list, self.test_str_list, self.map_dic1, self.map_dic2
'''
1 匹配出长链接中对应的品牌
2 final_result=[('long_link','brand_code','brand_name')]
'''
def match_brand2(self):
self.final_result = []
AC_KEY = ahocorasick.Automaton()
for word in set(self.key_list):
AC_KEY.add_word(word, word)
AC_KEY.make_automaton()
for content in self.test_str_list:
name_list = set()
for item in AC_KEY.iter(content):
name_list.add(item[1])
name_list = list(name_list)
brandcode_list = [self.map_dic1.get(i, "") for i in name_list if self.map_dic1.get(i, "")]
brandname_list = [self.map_dic2.get(i, "") for i in name_list if self.map_dic2.get(i, "")]
if brandcode_list:
self.final_result.append((content, ",".join(brandcode_list), ",".join(brandname_list)))
return self.final_result
'''
1 读取数据库表数据,拿到品牌对应的行业
2 将long_link,brand_code,brand_name,big_industry_name,small_industry_code,big_industry_code 进行匹配
3 创建新表,将结果导入新表中
'''
def match_industry(self):
sql3 = f'select brand_code,brand_name,big_industry_name,small_industry_code,big_industry_code from C表'
self.cur1.execute(sql3)
res3 = self.cur1.fetchall()
self.map_dic3 = {}
for i in self.final_result:
for j in i[1].split(','):
self.map_dic3[j] = i[0]
self.result = []
for person in res3:
i,y,p,j,k = person
if i in self.map_dic3:
map_dic4 = self.map_dic3[i]
self.result.append({'long_link': map_dic4, 'brand_code': i, 'brand_name': y, 'big_industry_name': p, 'small_industry_code': j, 'big_industry_code': k})
def write_industry_table(self):
self.cur1.execute('create table if not exists youtube_longlink_brand_industry(long_link text,brand_code text,brand_name text,big_industry_name text,small_industry_code text,big_industry_code text)')
sql = "insert into youtube_longlink_brand_industry(long_link,brand_code,brand_name,big_industry_name,small_industry_code,big_industry_code) values %s"
data = [(item["long_link"],item["brand_code"],item["brand_name"],item["big_industry_name"],item["small_industry_code"],item["big_industry_code"]) for item in self.result]
psycopg2.extras.execute_values(self.cur1,sql,data)
self.conn1.commit()
print('长链接解析品牌和行业成功')
lp = Link_Postgres()
lp.match_brand1()
lp.match_brand2()
lp.match_industry()
lp.write_industry_table()