python 调用Elasticsearch来完成全文检索实战

                 

一.数据源:es的数据源可以是数据库,文档, json文件,以及爬虫爬取的

二.前提准备:
             1.安装es,详细请参考   es安装教程

             2.安装kibana,详细请参考     kibana安装教程

             3.安装ik分词库,详细请参考https://blog.csdn.net/weixin_44062339/article/details/85006948

                                1.首先在es目录的plugins 文件夹下创建ik文件夹


                                 2.拷贝ik的压缩包到文件夹中
                                 3.解压ik压缩包到当前文件夹(切记不要解压到其他,只选中解压到当前文件夹,不然es会报错)

 

# coding:utf8
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import json
import time


class ElasticObj:
    """Thin helper around an Elasticsearch client bound to one index/type pair.

    Most methods operate on the index/type given to the constructor; the
    demo method ``Find_Data_BY_key`` still targets a hard-coded index
    (see its docstring).
    """

    def __init__(self, index_name, index_type, ip):
        """
        :param index_name: name of the index to operate on
        :param index_type: document type within the index
        :param ip: host address of the Elasticsearch node
        """
        self.index_name = index_name
        self.index_type = index_type
        # Connect without credentials.
        self.es = Elasticsearch([ip])
        # With username/password authentication:
        # self.es = Elasticsearch([ip], http_auth=('elastic', 'password'), port=9200)

    def create_index(self):
        """Create the index with an ik-analyzed mapping if it does not already exist.

        Mapped fields:
          - ``id``:        long, not indexed
          - ``substance``: text, analyzed with ik_max_word
          - ``title``:     text, analyzed with ik_max_word
        """
        _index_mappings = {
            "mappings": {
                self.index_type: {
                    "properties": {
                        "id": {
                            "type": "long",
                            "index": "false"
                        },
                        "substance": {
                            "type": "text",
                            "index": True,
                            'analyzer': 'ik_max_word',
                            'search_analyzer': 'ik_max_word'
                        },
                        "title": {
                            'type': 'text',
                            "index": True,
                            'analyzer': 'ik_max_word',
                            'search_analyzer': 'ik_max_word'
                        }
                    }
                }
            }
        }
        if not self.es.indices.exists(index=self.index_name):
            res = self.es.indices.create(index=self.index_name, body=_index_mappings, ignore=400)
            print(res)

    def insert_data(self, path1):
        """Bulk-load documents from a JSON file into the index.

        The file must contain ``{"RECORDS": [...]}`` where each record has
        ``_id.$oid``, ``strContent`` and ``strTitle`` keys.

        :param path1: path to the JSON data file
        """
        # BUG FIX: open read-only ('r'); the original used 'r+' although the
        # file is never written.
        with open(path1, 'r', encoding="utf8") as fr:
            load_text = json.load(fr)
            context = load_text["RECORDS"]

        ACTIONS = []
        i = 1
        bulk_num = 2000  # flush to Elasticsearch every 2000 documents
        for line in context:
            action = {
                "_index": self.index_name,
                "_type": self.index_type,
                "_id": i,  # explicit _id; omit to let ES auto-generate one
                "_source": {
                    "id": line['_id']['$oid'],
                    "substance": line['strContent'],
                    "title": line['strTitle']}
            }
            i += 1
            ACTIONS.append(action)
            # Flush a full batch.
            if len(ACTIONS) == bulk_num:
                # BUG FIX: integer batch number (the original printed a float).
                print('插入', i // bulk_num, '批数据')
                print(len(ACTIONS))
                success, _ = bulk(self.es, ACTIONS, index=self.index_name, raise_on_error=True)
                ACTIONS.clear()
                print(success)

        # Flush the final partial batch, if any.
        if ACTIONS:
            success, _ = bulk(self.es, ACTIONS, index=self.index_name, raise_on_error=True)
            ACTIONS.clear()
            print('Performed %d actions' % success)

    def Delete_Index(self, index_name):
        """Drop an entire index (400/404 are ignored if it does not exist).

        :param index_name: name of the index to delete
        """
        result = self.es.indices.delete(index=index_name, ignore=[400, 404])
        print(result)

    def Delete_Index_Data(self, id):
        """Delete a single document by its Elasticsearch ``_id``.

        :param id: the document's ``_id``
        """
        res = self.es.delete(index=self.index_name, doc_type=self.index_type, id=id)
        print(res)

    def Get_Data_Id(self, id):
        """Fetch one document by ``_id`` and print its source fields.

        :param id: the document's ``_id``
        """
        res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id)
        print(res['_source'])
        print('------------------------------------------------------------------')
        print(res['_source']['id'], res['_source']['substance'], res['_source']['title'])

    def Update_Index_Data(self, id, data):
        """Update a single document in this object's index.

        :param id: the document's ``_id``
        :param data: update body, e.g. ``{"doc": {...}}``
        """
        # BUG FIX: the original hard-coded index='news', doc_type='politics',
        # contradicting its own docstring and the rest of the class; use the
        # configured index/type instead.
        result = self.es.update(index=self.index_name, doc_type=self.index_type, body=data, id=id)
        print(result)

    def seacrh_Data(self, index_name, type_name):
        """Run a match-all search against the given index/type and print the raw response.

        (Method name kept as-is — including the "seacrh" typo — so existing
        callers keep working.)
        """
        result = self.es.search(index=index_name, doc_type=type_name)
        print(result)

    def Get_Data_By_Body(self, search_info):
        """Full-text search the ``substance`` field and pretty-print the response.

        :param search_info: query string (analyzed with ik on the server side)
        """
        doc = {
            "query": {
                "match": {
                    "substance": search_info
                }
            }
        }
        result = self.es.search(index=self.index_name, doc_type=self.index_type, body=doc)
        print(json.dumps(result, indent=2, ensure_ascii=False))

    def Find_Data_BY_key(self):
        """Recreate the hard-coded demo index 'news' with an ik mapping on ``title``.

        NOTE(review): this method deliberately targets the fixed index
        'news' / type 'politics' rather than self.index_name — confirm
        before reusing. It DROPS any existing 'news' index first.
        """
        mapping = {
            'properties': {
                'title': {
                    'type': 'text',
                    'analyzer': 'ik_max_word',
                    'search_analyzer': 'ik_max_word'
                }
            }
        }
        self.es.indices.delete(index='news', ignore=[400, 404])
        self.es.indices.create(index='news', ignore=400)
        result = self.es.indices.put_mapping(index='news', doc_type='politics', body=mapping)
        print(result)


if __name__ == '__main__':
    # Point at a local single-node Elasticsearch instance.
    obj = ElasticObj("test1", "en", ip="127.0.0.1")

    # One-time setup / maintenance calls — uncomment as needed:
    # obj.create_index()
    # path = r"C:\Users\Administrator\PycharmProjects\python_learn\elasterSerach\dedu_suncn_news.json"
    # obj.insert_data(path)
    # obj.Delete_Index_Data(7)
    # obj.Get_Data_Id(1)

    # Time a full-text query against the 'substance' field.
    t0 = time.time()
    find_news = "香港"
    obj.Get_Data_By_Body(find_news)
    print(time.time() - t0)

 

已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页