【python 走进NLP】搜索提示功能前缀字典树

最新推荐文章于 2024-01-18 21:32:39 发布

置顶东华果汁哥

最新推荐文章于 2024-01-18 21:32:39 发布

阅读量1.1k

点赞数 3

分类专栏：数据科学--python

本文链接：https://blog.csdn.net/u013421629/article/details/89318975

版权

数据科学--python 专栏收录该内容

289 篇文章 34 订阅

订阅专栏

站内搜索是一个网站的基本功能，一个好的搜索提示能很好地提升用户体验，提高用户找到所需内容的效率。下面基于字典树实现一个简单的中文前缀搜索提示功能。

用户输入的时候自动提示。
在这里插入图片描述


# -*- encoding=utf-8 -*-


# 导入包
from pytrie import StringTrie

# 自定义字典树类
class Suggester(object):
    """Prefix-suggestion index backed by a ``pytrie.StringTrie``.

    Each word is stored lower-cased as both key and value, so a prefix
    lookup hands back the stored words themselves.
    """

    def __init__(self):
        # Start with an empty trie; words are added through update_trie().
        self.trie = StringTrie()

    def update_trie(self, word_list):
        """Insert every word of word_list into the trie, lower-cased.

        :param word_list: iterable of strings to index
        """
        for word in word_list:
            word = word.lower()
            self.trie[word] = word

    def search_prefix(self, prefix):
        """Look up the stored words whose key starts with prefix.

        :param prefix: prefix string; it is passed to the trie unchanged
        :return: the result of ``self.trie.values(prefix=prefix)``
        """
        return self.trie.values(prefix=prefix)


# 建立前缀字典树
def build_prefix_tree(wordlist, encoding=None):
    """Build a Suggester from a text file that holds one word per line.

    :param wordlist: path of the word-list file to read
    :param encoding: text encoding handed to ``open``; the default ``None``
        keeps the platform default, which is what the function used before
    :return: a Suggester populated with the non-empty lines of the file
    """
    # The context manager closes the file handle even when reading fails.
    with open(wordlist, encoding=encoding) as word_file:
        # Blank lines carry no word, so they are not indexed.
        word_list = [line for line in word_file.read().splitlines() if line]
    suggester = Suggester()
    suggester.update_trie(word_list)
    return suggester





if __name__ == '__main__':
    # Demo: index a handful of idioms, then print the suggestions for
    # a one-character and a two-character prefix.
    demo_words = ['意味深长', '意想不到', '意气用事', '意气风发', '意兴阑珊', '意气高昂']
    sug = Suggester()
    sug.update_trie(demo_words)

    for query in ('意', '意气'):
        print(sug.search_prefix(query))

运行结果：

E:\laidefa\python.exe E:/短信报警/SearchSuggestion-master/backend/实现搜索提示功能.py
['意想不到', '意兴阑珊', '意味深长', '意气风发', '意气高昂', '意气用事']
['意气风发', '意气高昂', '意气用事']

Process finished with exit code 0

更复杂的搜索提示功能：支持全拼、拼音首字母、中文等多种输入方式，类似优酷、酷狗等网站的搜索提示功能。下面给出完整实现。
在这里插入图片描述

在这里插入图片描述
安装包：

pip install pytrie
pip install pandas
pip install pypinyin

在这里插入图片描述

搜索提示功能实现：

# -*- encoding=utf-8 -*-

# 导入包
from pytrie import StringTrie
import pypinyin
import pandas as pd
import time


# 文本转拼音
def pinyin(text):
    """Convert text to pinyin with the pieces separated by single spaces.

    :param text: text to convert
    :return: the items of ``pypinyin.lazy_pinyin(text)`` joined with ' '
    """
    syllables = pypinyin.lazy_pinyin(text)
    return ' '.join(syllables)


# 获取拼音的每个首字母
def get_every_word_first(text):
    """Return the first character of every space-separated pinyin piece.

    :param text: text to convert
    :return: the initials concatenated into one string; an empty string
        when the converted text contains no non-empty piece
    """
    # Splitting on ' ' yields empty pieces for empty input and for input
    # with leading or consecutive spaces; indexing such a piece with [0]
    # would raise IndexError, so empty pieces are skipped.
    return ''.join(piece[0] for piece in pinyin(text).split(' ') if piece)



# 获取完整拼音（各字拼音直接拼接，不加分隔符）
def get_all_pinying(text):
    """Convert text to pinyin with the pieces joined without a separator.

    :param text: text to convert
    :return: the items of ``pypinyin.lazy_pinyin(text)`` concatenated
    """
    return ''.join(pypinyin.lazy_pinyin(text))



# 自定义字典树类
class Suggester(object):
    """Prefix-suggestion index over words and their two pinyin forms."""

    def __init__(self):
        self.trie = StringTrie()

    def update_trie(self, word_list):
        """Index each word together with its pinyin initials and full pinyin.

        Every key is stored with itself as the value.

        :param word_list: iterable of strings to index
        """
        for raw in word_list:
            lowered = raw.lower()
            # All three keys are computed before any insertion happens;
            # insertion order is word, initials, full pinyin.
            keys = (
                lowered,
                get_every_word_first(lowered),
                get_all_pinying(lowered),
            )
            for key in keys:
                self.trie[key] = key

    def search_prefix(self, prefix):
        """Look up the stored values whose key starts with prefix.

        :param prefix: prefix string; it is passed to the trie unchanged
        :return: the result of ``self.trie.values(prefix=prefix)``
        """
        matches = self.trie.values(prefix=prefix)
        return matches



# 构建字典树
def build_all_trie(wordlist):
    """Build the suggestion trie and the word/pinyin lookup table.

    :param wordlist: list of keyword strings
    :return: tuple ``(sug, data)``: the populated Suggester and a DataFrame
        with the columns ``word`` (keyword as given), ``pinyin1`` (pinyin
        initials) and ``pinyin2`` (full pinyin)
    """
    sug = Suggester()
    sug.update_trie(wordlist)

    data = pd.DataFrame({"word": wordlist})
    # Suggester.update_trie lower-cases each word before deriving its
    # pinyin keys, so the lookup columns are derived from the lower-cased
    # word as well; otherwise a keyword containing upper-case letters
    # could never be matched against the values the trie returns.
    lowered = data['word'].apply(lambda word: word.lower())
    data['pinyin1'] = lowered.apply(get_every_word_first)
    data['pinyin2'] = lowered.apply(get_all_pinying)

    return sug, data


# 判断字符串只包含中文
def check_contain_chinese(check_str):
    """Return True when every character lies in U+4E00..U+9FFF.

    Despite the name, this checks that the string consists ONLY of
    characters from that range, not that it merely contains one.

    :param check_str: string to inspect
    :return: True if all characters are in the range (an empty string also
        yields True, because there is no character to reject), else False
    """
    # Both bounds are inclusive: U+4E00 is the common character '一', which
    # an exclusive comparison would wrongly classify as non-Chinese.
    return all(u'\u4e00' <= ch <= u'\u9fff' for ch in check_str)




# 关键词搜索提示查询
def get_tips_word(sug,data,s):
    """Return the keywords suggested for the search string s.

    :param sug: Suggester whose trie holds the words and their pinyin keys
    :param data: DataFrame with the columns ``word``, ``pinyin1`` and
        ``pinyin2`` that maps trie values back to keywords
    :param s: search string typed by the user
    :return: list of matching keywords without duplicates, in the row order
        of data; ``None`` when s is empty or when the lookup fails
    """
    if not s:
        return None

    try:
        # The trie keys are stored lower-cased, so the query is too.
        query = s.lower()
        words = data['word'].str.lower()

        if check_contain_chinese(query):
            # Chinese-only input: match against the keywords directly.
            hits = sug.search_prefix(query)
            mask = words.isin(hits)
        else:
            # Anything else is converted to pinyin and matched against the
            # keyword itself, its initials and its full pinyin.
            hits = sug.search_prefix(get_all_pinying(query))
            mask = (
                words.isin(hits)
                | data['pinyin1'].isin(hits)
                | data['pinyin2'].isin(hits)
            )

        # One boolean mask replaces the chained DataFrame.append calls,
        # which no longer exist in pandas 2.x.
        return data.loc[mask, 'word'].drop_duplicates().tolist()

    except Exception as e:
        # Best effort, as before: report the problem and return None.
        print("{0}".format(str(e)))
        return None




if __name__ == '__main__':
    # Demo keywords: a group of idioms plus three titles starting with "巴黎".
    wordlist = [
        '意味深长', '意想不到', '意气用事', '意气风发', '意兴阑珊',
        '意气高昂', '意气相投', '巴黎恋人', '巴黎圣母院', '巴黎宝贝',
    ]

    # Build the trie and lookup table first; the timer starts afterwards,
    # so the reported time covers the query only.
    sug, data = build_all_trie(wordlist)
    time1 = time.time()

    # Search string
    s = 'b'
    result = get_tips_word(sug, data, s)
    print(result)

    time2 = time.time()
    print('总共耗时：' + str(time2 - time1) + 's')

运行结果：

E:\laidefa\python.exe E:/短信报警/SearchSuggestion-master/backend/实现搜索提示功能.py
['巴黎圣母院', '巴黎宝贝', '巴黎恋人']
总共耗时：0.002985239028930664s

Process finished with exit code 0