Elasticsearch operation by python

最新推荐文章于 2024-08-29 11:00:00 发布

allen sue

最新推荐文章于 2024-08-29 11:00:00 发布

阅读量514

点赞数

分类专栏： # nosql 文章标签： nosql elastic

本文链接：https://blog.csdn.net/fish2009122/article/details/89384831

版权

nosql 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

读写删与批量post

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from utils.hash_data import get_hash_id
from config import config
from utils.read_url import requst_url_json


class OperaEs(object):
    """Small convenience wrapper around an Elasticsearch client.

    Every call uses the index name as the document type (``doc_type``) too.
    String document ids are passed through ``get_hash_id`` before being sent
    to Elasticsearch; non-string ids are used unchanged.
    """

    def __init__(self):
        """Build the base URL from ``config`` and create the client."""
        self.url = 'http://{}:{}'.format(config.ES_IP, config.ES_PORT)
        self.es = Elasticsearch([self.url], timeout=300)

    @staticmethod
    def _normalize_id(es_id):
        """Return the id actually stored in Elasticsearch for ``es_id``.

        String ids are hashed with ``get_hash_id``; anything else is returned
        as-is. Keeping this in one place guarantees that writes and reads
        agree on the id.
        """
        if isinstance(es_id, str):
            return get_hash_id(data=es_id)
        return es_id

    def post_data(self, es_index, es_id, data):
        """Create a single document.

        :param es_index: index name (also used as the doc type).
        :param es_id: document id; hashed first when it is a string.
        :param data: document body.
        :return: ``('success', 201)`` when Elasticsearch reports the document
            as created, otherwise ``('put fail', 400)``.
        """
        es_id = self._normalize_id(es_id)
        # HTTP 400 responses are returned instead of raised (ignore=400).
        res = self.es.create(index=es_index, doc_type=es_index, id=es_id, body=data, ignore=400)
        if res.get('result') == 'created':
            return 'success', 201
        return 'put fail', 400

    def post_batch(self, arr, es_index):
        """Bulk-index the items of ``arr`` that are not already stored.

        :param arr: iterable of dict-like documents; each one's ``'id'`` entry
            is used as the document id.
        :param es_index: index name (also used as the doc type).
        """
        def gendata():
            for data in arr:
                raw_id = data.get('id')
                # Skip documents that already exist; read_by_es_id applies the
                # same id normalization that is used for "_id" below.
                if self.read_by_es_id(es_index=es_index, es_id=raw_id):
                    continue
                yield {
                    "_index": es_index,
                    "_type": es_index,
                    # An explicit id (rather than an auto-generated one) is
                    # what makes the duplicate check above possible. It is
                    # normalized exactly like in post_data / read_by_es_id so
                    # the existence check looks at the document written here.
                    "_id": self._normalize_id(raw_id),
                    # "_source" is the document body for index actions; the
                    # former "doc" key is only special for update actions and
                    # made every document be stored as {"doc": {...}}.
                    "_source": data,
                }
        bulk(self.es, gendata())

    def read_by_es_id(self, es_index, es_id):
        """Fetch one document by id.

        The document URL is built by hand and fetched with
        ``requst_url_json`` (presumably an HTTP GET returning parsed JSON --
        confirm against ``utils.read_url``).

        :param es_index: index name (also used as the doc type).
        :param es_id: document id; hashed first when it is a string.
        :return: the ``_source`` entry of the response, or ``None`` when the
            helper returns a falsy value.
        """
        es_id = self._normalize_id(es_id)
        read_url = '{}/{}/{}/{}'.format(self.url, es_index, es_index, es_id)
        req = requst_url_json(read_url)
        return req.get('_source') if req else None

    def read_by_body(self, es_index, body):
        """Run a search with the given query ``body``.

        :return: the ``hits.hits`` entry of the response, or ``None`` when
            the response has no (or an empty) ``hits`` entry.
        """
        res = self.es.search(index=es_index, doc_type=es_index, body=body)
        hits = res.get('hits')
        return hits.get('hits') if hits else None

    def delete_index(self, index):
        """Delete ``index``; HTTP 400 and 404 responses are ignored."""
        self.es.indices.delete(index=index, ignore=[400, 404])

复杂query

通过 kibana 的 dev tools 的 console 来 translate
直接query：返回结果集

POST /_xpack/sql?format=txt
{
    "query": """
    SELECT count(1),date
    FROM "spider-fang"
    where province = '上海市'
    and city = '上海市'
    and area = '普陀区'
    group by date    
    """
}

translate：返回 body

POST /_xpack/sql/translate
{
    "query": """
    SELECT count(1),date
    FROM "spider-fang"
    where province = '上海市'
    and city = '上海市'
    and area = '普陀区'
    group by date
    
    """
}

FAQ

Limit of total fields [1000] in index

 curl -XPUT "172.16.15.105:9200/jd-region-code/_settings" -H "Content-Type:application/json" -d "{"""index.mapping.total_fields.limit""": 50000}"

以上命令中的 JSON 键若只用一个双引号（而不是三个双引号），会报如下错误：

Could not resolve host

中文字段无法filter（filter为null的问题）
现象：es中，中文字段，match时，非精确匹配，出现很多数据
使用filter term过滤时，返回结果为null

解决方法：
将match 替换为 match_phrase
Fielddata is disabled on text fields by default. Set fielddata=true
现象

Fielddata is disabled on text fields by default. Set fielddata=true
"type": "illegal_argument_exception","reason": "Fielddata is disabled on text fields by default. Set fielddata=true on [region] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory."

解决方法：

set index/_mapping/type_name region to "fielddata": true

put xytest/_mapping/student  (http://localhost:9200/index_data/_mapping/index_data)

{
  "properties": {
    "region":{
      "type": "text",
      "fielddata": true
    }
  }
}

# local
{
  "properties": {
    "code":{
      "type": "text",
      "fielddata": true
    }
  }
}

在相关 field 中用 keyword 即可

field="country.keyword"  

# Nested terms aggregation: province -> city -> area.
body = {
    "size": 0,
    "aggs": {
        "province_count": {
            "terms": {
                "field": "province.keyword",
                # NOTE: inspect doc_count_error_upper_bound and
                # sum_other_doc_count in the response when relying on
                # these bucket counts.
                "size": 1000000,
            },
            "aggs": {
                "industry_count": {
                    "terms": {"field": "city.keyword"},
                    "aggs": {
                        "industry_count": {
                            "terms": {"field": "area.keyword"},
                        },
                    },
                },
            },
        },
    },
}

res = es.search(index='spider-fang', doc_type='spider-fang', body=body)
# One bucket per province; the city/area buckets are nested inside each one.
res_data = res['aggregations']['province_count']['buckets']

Result window is too large
现象：ES 提示返回结果集窗口太大了，目前 index.max_result_window 的最大值为 10000，而请求的 from + size > 10000

解决方法：

curl -XPUT "192.168.246.28:9200/jd-judicial/_settings" -H "Content-Type:application/json" -d "{ """index""" : { """max_result_window""" : 100000000}}"