Python基于Elasticsearch实现搜索引擎

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/baoshuowl/article/details/80557549

        ElasticSearch是一个基于Lucene的搜索服务器。它提供了一个分布式多用户能力的全文搜索引擎,基于RESTful Web接口。Elasticsearch是用Java开发的,并作为Apache许可条款下的开放源码发布,是当前流行的企业级搜索引擎,设计用于云计算中,能够实现实时搜索,稳定、可靠、快速,且安装使用方便。

1.准备工作

  • 已经安装elasticsearch-rtf >>>安装教程
  • 已经安装elasticsearch-head >>>安装教程
  • 一个完美的爬虫项目

2.填充数据

要想制作一款搜索引擎,首先数据库里面得有大量的数据,如果数据库里面都没有数据,那这个搜索引擎还能叫搜索引擎吗?所以我们先来爬取大量的数据,这里写了一个小说网的爬虫,以搜索小说为例。

  • 编写model.py文件,编写完毕调用init函数,创建es索引的mapping

    #coding:utf-8
    from elasticsearch_dsl import DocType,Completion,Text,Boolean,Integer,Date
    from elasticsearch_dsl.connections import connections
    from elasticsearch_dsl.analysis import CustomAnalyzer
    
    # Register the default ES connection (elasticsearch_dsl keeps a global registry).
    connections.create_connection(hosts=['127.0.0.1'])
    
    # Subclass CustomAnalyzer so an analyzer object can be attached to the
    # Completion field below; returning {} emits no analysis definition into
    # the mapping, so the server-side 'ik_max_word' analyzer is used as-is.
    class MyAnalyzer(CustomAnalyzer):
        def get_analysis_definition(self):
            # Empty definition: rely on the analyzer already installed in ES.
            return {}
    
    # Analyzer instance for the suggest field; the 'lowercase' filter makes
    # completion matching case-insensitive.
    ik_analyzer = MyAnalyzer('ik_max_word',filter=['lowercase'])
    
    # 2.创建数据Model
    class NovelModel(DocType):
        # 2.1普通字段
        title = Text(analyzer='ik_max_word')
        author = Text(analyzer='ik_max_word')
        classify = Text()
        rate = Text()
        collect = Integer()
        number = Text()
        time = Text()
        click_week = Integer()
        click_month = Integer()
        click_all = Integer()
        collect_week = Integer()
        collect_month = Integer()
        collect_all = Integer()
        abstract = Text()
        picture = Text()
        download_url = Text()
        # 2.2搜索建议字段
        suggest = Completion(analyzer=ik_analyzer)
        # 2.3创建Meta
        class Meta:
            # index 索引名(数据库)
            index = 'alldata'
            # doc_type 类型(表名称)
            doc_type = 'novel'
    
    if __name__ == '__main__':
        # Run once to create the 'alldata' index mapping in Elasticsearch.
        NovelModel.init()
    
  • 写一个Pipeline来存储数据

        因为考虑到一个爬虫项目可能不止一个爬虫,每个爬虫的Item又不一样,所以在每一个Item类中来进行写入储存操作,然后每次当Item交给Pipeline来处理的时候,会根据不同的Item来进行不同的处理操作。

    |-pipelines文件
    class ToEsPipeline(object):
        """Scrapy pipeline that delegates persistence to the item itself.

        Each item class implements save_to_es(), so this single pipeline can
        handle every spider/item type in the project.
        """

        def process_item(self, item, spider):
            # BUGFIX: the original body was not indented under process_item,
            # which raised IndentationError at import time.
            item.save_to_es()
            # Return the item so later pipelines still receive it.
            return item
    
  • 编写Item

    import scrapy
    from elasticsearch_dsl.connections import connections
    from .es_model import NovelModel
    
    # Low-level ES client shared by process_suggest below.
    # NOTE(review): this connects to a remote host while model.py connects to
    # 127.0.0.1 -- confirm which cluster is intended.
    es = connections.create_connection(hosts=['http://39.107.255.196'])
    
    # 3.处理搜索意见分词
    def process_suggest(index,*args):
            '''
            :param index: index 索引(数据库)
            :param args: 需要进行分词的内容
            :return: 返回分词之后的列表,不允许有重复的数据
            '''
            #创建一个空集合
            use_words = set()
            #声明搜索建议分词列表
            suggest = []
            for text,weight in args:
                # text 需要分词的文本
                # weight 权重
                # 调用es的分词analyzer接口进行分词
                words = es.indices.analyze(
                    # es索引(数据库)
                    index = index,
                    analyzer='ik_max_word',
                    # 其他参数,顾虑器
                    params={
                        'filter':['lowercase'],
                    },
                    body={
                        'text':text
                    }
                )
                # 列表生成式 并转换set集合进行去重
                analyzer_words = set([dic['token'] for dic in words['tokens']])
                new_words = analyzer_words - use_words
                #把没有重复的数据追加到列表
                suggest.append({'input':list(new_words),'weight':weight})
                use_words = analyzer_words
    
            return suggest
    
    # 2.处理Item
    class MyItem(scrapy.Item):
        novel_classify = scrapy.Field()
        novel_title = scrapy.Field()
        novel_author = scrapy.Field()
        novel_rate = scrapy.Field()
        novel_collect = scrapy.Field()
        novel_number = scrapy.Field()
        novel_time = scrapy.Field()
        click_all = scrapy.Field()
        click_month = scrapy.Field()
        click_week = scrapy.Field()
        collect_all = scrapy.Field()
        collect_month = scrapy.Field()
        collect_week = scrapy.Field()
        novel_abstract = scrapy.Field()
        novel_picture = scrapy.Field()
        novel_download = scrapy.Field()
    
        # 2.创建保存方法
        def save_to_es(self):
            # 2.1创建Novel数据Model对象
            novel = NovelModel()
            # 2.2普通字段赋值
            novel.title = self['novel_title']
            novel.author = self['novel_author']
            novel.classify = self['novel_classify']
            novel.rate = self['novel_rate']
            novel.collect = self['novel_collect']
            novel.number = self['novel_number']
            novel.time = self['novel_time']
            novel.click_week = self['click_week']
            novel.click_month = self['click_month']
            novel.click_all = self['click_all']
            novel.collect_week = self['collect_week']
            novel.collect_month = self['collect_month']
            novel.collect_all = self['collect_all']
            novel.bstract = self['novel_abstract']
            novel.picture = self['novel_picture']
            novel.download_url = self['novel_download']
            # 2.3搜索建议
            novel.suggest = process_suggest(NovelModel._doc_type.index,(novel.title,10),(novel.author,8))
            # 2.4保存
            novel.save()
    

3.Django项目

由于在Django项目中也会用到我们在scrapy爬虫项目中的model.py文件,所以复制一份到django项目中

import math
from redis import Redis
from urllib import parse
from datetime import datetime
from django.shortcuts import render, redirect
from django.http import JsonResponse
from elasticsearch_dsl.connections import connections

from .es_models.es_types import NovelModel

# Redis holds the hot-search ranking; ES is the search backend.
rds = Redis(host='127.0.0.1',port=6379)
es = connections.create_connection(hosts=['127.0.0.1'])

def index(request):
    """Render the search home page (GET only)."""
    # Tabs for each searchable data category.
    context = {
        'navs': [
            {'type': 'novel', 'title': '小说'},
            {'type': 'movie', 'title': '电影'},
            {'type': 'job', 'title': '职位'},
            {'type': 'news', 'title': '新闻'},
        ],
        'search_type': 'novel',
    }
    if request.method == 'GET':
        return render(request, 'index.html', context)


def result(request):
    """Run a search and render the result page.

    GET params: kw (keyword), s_type (data category), pn (page, default 1).
    Side effects: bumps the keyword's score in the redis 'hotkey' zset and
    refreshes the 'history' cookie (last five distinct keywords).
    """
    if request.method != 'GET':
        # Original view handled GET only; other methods fall through.
        return None

    keyword = request.GET.get('kw')
    s_type = request.GET.get('s_type')
    # Default to page 1 when no page number was supplied.
    page_num = request.GET.get('pn', 1)
    # No keyword: send the user back to the home page.
    if not keyword:
        return redirect('index')

    # Hot-search ranking: bump this keyword, fetch the top five.
    # NOTE(review): zincrby argument order is (name, value) in redis-py 2.x
    # but (name, amount, value) in 3.x -- confirm the installed version.
    rds.zincrby('hotkey', keyword)
    # BUGFIX: zrevrange is end-inclusive, so "top 5" is 0..4 (was 0..5 = 6 rows).
    hot_top5 = rds.zrevrange('hotkey', 0, 4)

    # Maintain the search-history cookie: most recent first, at most five
    # distinct entries, each URL-quoted.
    history = request.COOKIES.get('history', None)
    if history:
        cookies = history.split(',')
        quoted = parse.quote(keyword)
        if quoted in cookies:
            cookies.remove(quoted)
        cookies.insert(0, quoted)
        if len(cookies) > 5:
            cookies.pop()
        cookie_str = ','.join(cookies)
    else:
        cookie_str = parse.quote(keyword)

    # BUGFIX: the original left 'content' unbound (NameError at render) for
    # any s_type other than 'novel'; provide an empty-result default.
    content = {
        'hits': [],
        'kw': keyword,
        'use_time': 0,
        'total': 0,
        'page_nums': 0,
        'navs': [],
        'search_type': s_type,
        'pages': [],
        'history': parse.unquote(cookie_str).split(','),
        'hot_top5': hot_top5,
    }

    if s_type == 'novel':
        # NOTE(review): 'bstract' mirrors the field name the pipeline writes;
        # the model declares 'abstract' -- keep the two sides in sync.
        fields = ['title', 'bstract']
        start_time = datetime.now()
        rs = es.search(
            index='alldata',
            doc_type='novel',
            body={
                "query": {
                    "multi_match": {
                        "query": keyword,
                        "fields": fields
                    }
                },
                # 10 results per page.
                "from": (int(page_num) - 1) * 10,
                "size": 10,
                'highlight': {
                    'pre_tags': ['<span class="keyWord">'],
                    "post_tags": ['</span>'],
                    "fields": {
                        "title": {},
                        "bstract": {}
                    }
                }
            }
        )
        use_time = (datetime.now() - start_time).total_seconds()

        hits_list = []
        for hit in rs['hits']['hits']:
            # BUGFIX: ES omits the 'highlight' key when nothing matched in a
            # field, so fall back to _source instead of raising KeyError.
            highlight = hit.get('highlight', {})
            source = hit['_source']
            h_dic = {}
            h_dic['title'] = highlight.get('title', [source['title']])[0]
            # BUGFIX: always take the first fragment (the original stored the
            # whole list for highlighted abstracts but a string otherwise).
            h_dic['abstract'] = highlight.get('bstract', [source['bstract']])[0]
            # NOTE(review): [0] assumes download_url holds a list; if it is a
            # plain string this yields its first character -- verify.
            h_dic['detail_url'] = source['download_url'][0]
            hits_list.append(h_dic)

        navs = [
            {'type': 'novel', 'title': '博客'},
            {'type': 'job', 'title': '职位'},
            {'type': 'movie', 'title': '电影'},
            {'type': 'news', 'title': '新闻'},
        ]

        # Total matching documents; pages needed, rounding up.
        total = rs['hits']['total']
        page_nums = math.ceil(total / 10)

        # Sliding window of up to 10 page links centred on the current page.
        page_num = int(page_num)
        if page_nums <= 10:
            # BUGFIX: small result sets used to show links to empty pages.
            pages = range(1, page_nums + 1)
        elif page_num - 4 <= 0:
            pages = range(1, 11)
        elif page_num + 5 >= page_nums:
            pages = range(page_nums - 9, page_nums + 1)
        else:
            pages = range(page_num - 4, page_num + 6)

        content = {
            'hits': hits_list,
            'kw': keyword,
            'use_time': use_time,
            'total': total,
            'page_nums': page_nums,
            'navs': navs,
            'search_type': s_type,
            'pages': pages,
            'history': parse.unquote(cookie_str).split(','),
            'hot_top5': hot_top5,
        }

    response = render(request, 'result.html', content)
    response.set_cookie('history', cookie_str)
    return response


def suggest(request):
    """AJAX endpoint: return completion suggestions for the search box."""
    if request.method != 'GET':
        # GET only, matching the other views.
        return None
    term = request.GET.get('s', None)
    s_type = request.GET.get('s_type')
    payload = {}
    if not term:
        # Nothing typed yet: signal "no suggestions".
        payload['status'] = -1
    else:
        matches = get_suggest(term, s_type)
        # status 0 = suggestions available, -1 = none found.
        payload['status'] = 0 if matches else -1
        payload['datas'] = matches
        payload['s_type'] = s_type
    return JsonResponse(payload)


# 在es中搜索数据
def get_suggest(keyword, s_type):
    '''
    :param keyword: 搜索关键词
    :param s_type: 搜索类型
    :return: 搜索结果
    '''
    # 创建一个search对象用于搜索
    if s_type == 'novel':
        search = NovelModel.search()
    elif s_type == 'job':
        pass
    # suggest()获取搜索建议的接口
    # 1.自定义搜索结果对应的key
    # 2.搜索关键词
    result = search.suggest(
        'r_suggest',
        keyword,
        completion={
            'field': 'suggest',
            'fuzzy': {
                'fuzziness': 2
            },
            'size': 5
        }
    )
    # s返回一个字典
    s = result.execute_suggest()
    fileds = {'novel': 'title'}
    # 定义一个结果列表
    datas = []
    for dic in s['r_suggest'][0]['options']:
        sug = dic._source[fileds[s_type]]
        datas.append(sug)

    # 返回搜索建议
    return datas
展开阅读全文

没有更多推荐了,返回首页