Elasticsearch 常用操作记录

最新推荐文章于 2024-07-26 19:30:00 发布

Hugo5332

最新推荐文章于 2024-07-26 19:30:00 发布

阅读量322

点赞数

本文链接：https://blog.csdn.net/xxxslinyue/article/details/106558197

版权

ES使用记录

数据导入

# -*- coding:utf-8 -*-

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import argparse


class ElasticObj:
    """Create an Elasticsearch index and bulk-load passages into it."""

    def __init__(self, index_name, index_type, ip="127.0.0.1"):
        """Store the index settings and create the client.

        Args:
            index_name: Name of the index that is created and written to.
            index_type: Value written into the ``_type`` field of every
                bulk action.
            ip: Host handed to the ``Elasticsearch`` client.
        """
        self.index_name = index_name
        self.index_type = index_type
        self.es = Elasticsearch([ip])

    @staticmethod
    def _load_documents(csvfile):
        """Parse a tab-separated file into ``docid``/``passage`` records.

        The first column becomes ``docid`` and the fourth column, stripped
        and with every space removed, becomes ``passage``. Lines with fewer
        than four columns (for example a trailing blank line) are skipped
        instead of raising ``IndexError``.

        Args:
            csvfile: Path of the UTF-8 encoded, tab-separated input file.

        Returns:
            A ``(docs, total_lines)`` tuple: the parsed records and the
            number of lines read from the file.
        """
        docs = []
        total_lines = 0
        # The context manager guarantees the handle is closed even on error.
        with open(csvfile, 'r', encoding='utf8') as handle:
            for total_lines, row in enumerate(handle, start=1):
                fields = row.split('\t')
                if len(fields) < 4:
                    continue
                docs.append({
                    'docid': fields[0],
                    'passage': fields[3].strip().replace(' ', ''),
                })
        return docs, total_lines

    def bulk_Index_Data(self, csvfile):
        """Bulk-index every well-formed line of ``csvfile``.

        Prints the number of lines read, the number of actions built and
        the number of actions the bulk helper reports as performed.

        Args:
            csvfile: Path of the UTF-8 encoded, tab-separated input file.
        """
        docs, total_lines = self._load_documents(csvfile)
        print(total_lines)

        skipped = total_lines - len(docs)
        if skipped:
            print('skipped malformed lines:', skipped)

        actions = [
            {
                "_index": self.index_name,
                "_type": self.index_type,
                "_source": {
                    "docid": doc['docid'],
                    "passage": doc['passage'],
                },
            }
            for doc in docs
        ]
        print('index_num:', len(actions))
        success, _ = bulk(self.es, actions, index=self.index_name, raise_on_error=True, request_timeout=1000)
        print('Performed %d actions' % success)

    def create_index(self, index_name, index_type):
        """Create the index with its field mappings unless it already exists.

        NOTE: ``index_name`` and ``index_type`` are accepted only to keep the
        existing call signature; the index that is checked and created is
        always ``self.index_name``.

        Args:
            index_name: Unused, see the note above.
            index_type: Unused, see the note above.
        """
        # Field mappings: "passage" is analyzed with "ik_max_word"
        # (presumably supplied by the IK analysis plugin -- confirm it is
        # installed on the cluster); "docid" is a plain text field.
        _index_mappings = {
            "mappings": {
                "properties": {
                    "passage": {
                        "type": "text",
                        "analyzer": "ik_max_word",
                        "search_analyzer": "ik_max_word"
                    },
                    "docid": {
                        "type": "text"
                    }
                }
            }
        }
        print("begin construct index")
        # Rely on truthiness rather than ``is not True`` so that a truthy
        # response object (not only the literal ``True``) counts as "exists".
        if not self.es.indices.exists(index=self.index_name):
            res = self.es.indices.create(index=self.index_name, body=_index_mappings)
            print(res)
        else:
            print("exists")


if __name__ == "__main__":
    # Set up the ES wrapper, make sure the index exists, then bulk-import
    # the documents into it.
    index_name, doc_type = "passage", "_doc"
    importer = ElasticObj(index_name, doc_type)
    print("init done!")
    importer.create_index(index_name, doc_type)
    print("create done!")
    importer.bulk_Index_Data("./datasets/law/new_doc.txt")

数据查询

# -*- coding:utf-8 -*-

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
class ElasticObj:
    """Elasticsearch wrapper used to retrieve indexed passages."""

    def __init__(self, index_name, index_type, ip="127.0.0.1"):
        """Store the index settings and create the client.

        Args:
            index_name: Name of the index that is searched.
            index_type: Value passed as ``doc_type`` to the search call.
            ip: Host handed to the ``Elasticsearch`` client.
        """
        self.index_name = index_name
        self.index_type = index_type
        self.es = Elasticsearch([ip])

    def Get_Data_By_Body(self, question, k):
        """Retrieve documents whose ``passage`` field matches ``question``.

        Args:
            question: Query text for the ``match`` query on ``passage``.
            k: Value sent as the ``size`` of the search request.

        Returns:
            A list of ``(passage[:20], docid)`` tuples, one per hit, in the
            order the hits are returned. If the search or the parsing of
            its response fails, the failure is printed and an empty list is
            returned so that callers always receive a list.
        """
        doc = {
            "size": k,
            "query": {
                "match": {
                    "passage": question
                }
            }
        }
        try:
            _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=doc)
            return [
                (hit['_source']['passage'][:20], hit['_source']['docid'])
                for hit in _searched['hits']['hits']
            ]
        except Exception as err:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate; the error is
            # printed so the cause of the failure is visible.
            print('search not exist')
            print(question)
            print('search error:', repr(err))
            return []
def search(query, topk):
    """Run ``query`` against the "passage" index and return the result.

    A fresh ``ElasticObj`` is built for each call; ``topk`` is forwarded as
    the hit limit and the return value of ``Get_Data_By_Body`` is passed
    through unchanged.
    """
    return ElasticObj("passage", "_doc").Get_Data_By_Body(query, topk)
if __name__ == "__main__":
    # Demo run: request three hits for a placeholder (empty) query string.
    demo_query = ''
    top_hits = search(demo_query, 3)
    print(top_hits)