如何使用AC自动机算法,快速识别出某个字符串中的关键词

如何使用AC自动机算法,快速识别出某个字符串中的关键词

如一条长链接:www.dji.com/cn/huawei/1111-sale-7knbdw/1111-sale-7knbd​​​​​​​w/1111-sale-7knbd,

需要识别出长链接中的品牌名称和所在行业。

表1 一列:长链接

A表 二列:品牌小写,品牌关键词

B表 一列:品牌大写

C表 五列:

brand_code(品牌小写),brand_name(品牌大写),

big_industry_name(行业1),small_industry_code(行业2),big_industry_code(行业3)

输出表:

长链接,品牌大写,品牌小写,行业1,行业2,行业3

data3.sql(提取出品牌所有的关键词)

select *
from
(select t1.brand_code ,t1.brand_keywords ,t2.brand_name
from A表 t1
left join B表 t2
on t1.brand_code = t2.brand_code)a1
where brand_keywords is not null
 ;

'''
全库跑长链接,识别出长链接中的品牌
'''

import psycopg2
from psycopg2.extras import execute_batch
import psycopg2.extras
from config.load_config import get_connect_info
import ahocorasick



class Link_Postgres():
    def __init__(self):
        conf1, conf2 = get_connect_info()
        self.conn1 = psycopg2.connect(
            dbname=conf1["dbname"],
            user=conf1["user"],
            password=conf1["password"],
            host=conf1["host"],
            port=conf1["port"]
        )
        self.cur1 = self.conn1.cursor()



    '''
    1 运行SQL,查询brand_name,brand_code,brand_keywords 数据
    2 查询表数据,拿出long_link
    3 返回结果列表,key_list--brand_keywords,test_str_list--long_link,map_dict1--{brand_keywords,brand_code},map_dict2--{brand_keywords,brand_name}
    '''
    def match_brand1(self):
        with open('D:\project\project01\sql\data3.sql', encoding='utf-8', mode='r') as f:
            sql1 = f.read()
            self.cur1.execute(sql1)
            res = self.cur1.fetchall()

        sql2 = f"select long_link from 表1 where long_link is not null and date_time  < '20230531'"
        self.cur1.execute(sql2)
        res2 = self.cur1.fetchall()

        self.map_dic1 = {}
        for i in res:
            for j in i[1].split(','):
                self.map_dic1[j] = i[0]
        # key 是关键词,values brand_code

        self.map_dic2 = {}
        for i in res:
            for j in i[1].split(','):
                self.map_dic2[j] = i[2]
        # key 是关键词,values brand_name

        self.key_list = list(self.map_dic1.keys())

        self.test_str_list = []
        for i in res2:
            self.test_str_list.append(i[0])
        return self.key_list, self.test_str_list, self.map_dic1, self.map_dic2

    '''
    1 匹配出长链接中对应的品牌
    2 final_result=[('long_link','brand_code','brand_name')]
    '''
    def match_brand2(self):
        self.final_result = []
        AC_KEY = ahocorasick.Automaton()
        for word in set(self.key_list):
            AC_KEY.add_word(word, word)
        AC_KEY.make_automaton()

        for content in self.test_str_list:
            name_list = set()
            for item in AC_KEY.iter(content):
                name_list.add(item[1])
            name_list = list(name_list)

            brandcode_list = [self.map_dic1.get(i, "") for i in name_list if self.map_dic1.get(i, "")]
            brandname_list = [self.map_dic2.get(i, "") for i in name_list if self.map_dic2.get(i, "")]

            if brandcode_list:
                self.final_result.append((content, ",".join(brandcode_list), ",".join(brandname_list)))
        return self.final_result

    '''
    1 读取数据库表数据,拿到品牌对应的行业
    2 将long_link,brand_code,brand_name,big_industry_name,small_industry_code,big_industry_code 进行匹配
    3 创建新表,将结果导入新表中
    '''
    def match_industry(self):
        sql3 = f'select brand_code,brand_name,big_industry_name,small_industry_code,big_industry_code from C表'
        self.cur1.execute(sql3)
        res3 = self.cur1.fetchall()

        self.map_dic3 = {}
        for i in self.final_result:
            for j in i[1].split(','):
                self.map_dic3[j] = i[0]

        self.result = []
        for person in res3:
            i,y,p,j,k = person
            if i in self.map_dic3:
                map_dic4 = self.map_dic3[i]
                self.result.append({'long_link': map_dic4, 'brand_code': i, 'brand_name': y, 'big_industry_name': p, 'small_industry_code': j, 'big_industry_code': k})

    def write_industry_table(self):

        self.cur1.execute('create table if not exists youtube_longlink_brand_industry(long_link text,brand_code text,brand_name text,big_industry_name text,small_industry_code text,big_industry_code text)')

        sql = "insert into youtube_longlink_brand_industry(long_link,brand_code,brand_name,big_industry_name,small_industry_code,big_industry_code) values %s"
        data = [(item["long_link"],item["brand_code"],item["brand_name"],item["big_industry_name"],item["small_industry_code"],item["big_industry_code"]) for item in self.result]

        psycopg2.extras.execute_values(self.cur1,sql,data)
        self.conn1.commit()
        print('长链接解析品牌和行业成功')


lp = Link_Postgres()
lp.match_brand1()
lp.match_brand2()
lp.match_industry()
lp.write_industry_table()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值