使用requests发请求操作Elasticsearch【二】

zyooooxie
已于 2024-01-23 22:25:02 修改
阅读量657
点赞数 6
分类专栏：数据库学习　文章标签：elasticsearch python requests
于 2024-01-05 22:21:54 首次发布
本文为博主原创，未经许可严禁转载。
本文链接：https://blog.csdn.net/zyooooxie/article/details/118367832
版权
数据库学习专栏收录该内容
8 篇文章 0 订阅
订阅专栏
本文为博主原创，未经授权，严禁转载及使用。
本文链接：https://blog.csdn.net/zyooooxie/article/details/118367832
前面刚刚分享使用requests发请求操作Elasticsearch【一】，继续分享下。
【实际这篇博客推迟发布N个月】
个人博客：https://blog.csdn.net/zyooooxie
【以下所有内容仅为个人项目经历，如有不同，纯属正常】
Document APIs

https://www.elastic.co/guide/en/elasticsearch/reference/7.17/docs.html
"""
@blog: https://blog.csdn.net/zyooooxie
@qq: 153132336
@email: zyooooxie@gmail.com
"""

import json
import random
import time
import string
import copy
import requests

from requests_toolbelt.utils import dump
from requests import api
from XXX.common_es import gl_es_auth, gl_es_host_new
from XXX.common_functions import one_choice_true_false

from user_log import Log

# A document contains not only its data but also metadata (information about the document).
# The three required metadata elements are:
# _index    where the document is stored
# _type     the class of object the document represents
# _id       the unique identifier of the document

# Index name and mapping type used to build every request URL in this script.
gl_index = 'ABC-data'
gl_type = '_doc'

# Random integer drawn once at import time; used as a field value in the test documents.
gl_int = random.randint(1, 999)
# Timestamp-based string drawn once at import time; used as an id suffix in test_create_doc.
gl_id = 'test' + str(time.time())

def es_send_request(request_method: str, request_url: str, data_dict: dict,
                    auth: tuple = gl_es_auth, **kwargs) -> requests.Response:
    """
    Send a single HTTP request with a JSON body and log the complete exchange.

    :param request_method: name of the HTTP verb; must be an attribute of ``requests.api``
    :param request_url: full URL to call
    :param data_dict: payload passed to ``requests.request`` as ``json``
    :param auth: credentials forwarded to ``requests.request``
    :param kwargs: extra keyword arguments forwarded to ``requests.request``
    :return: the response object, closed before it is returned
    """
    method_is_known = hasattr(api, request_method)
    assert method_is_known is True

    response = requests.request(
        method=request_method,
        url=request_url,
        json=data_dict,
        auth=auth,
        **kwargs,
    )

    # dump_all renders the request and the response as raw bytes.
    exchange = dump.dump_all(response)
    Log.info(exchange.decode('utf-8'))

    response.close()
    Log.info('********')

    return response


def test_index_doc(document_id):
    """
    Index one document under a caller-supplied id, then one with an id generated by Elasticsearch.

    :param document_id: id used for the first (PUT) request
    :return: None
    """
    Log.info(document_id)

    # Caller-supplied _id:
    # PUT /{index}/{type}/{id}
    put_url = '/'.join((gl_es_host_new, gl_index, gl_type, document_id))
    put_body = {'test': gl_int * 1, 'text': {'content': document_id + '是id'}}

    put_response = es_send_request('put', put_url, put_body, gl_es_auth)
    put_result = put_response.json()
    assert put_result.get('_id') == document_id

    test_get_doc(document_id)

    # Every document carries a version number; _version grows on each modification (deletes included).
    Log.info(put_result.get('_version'))

    Log.info('********')

    # Let the index API generate the _id:
    # POST /{index}/{type}/
    post_url = '/'.join((gl_es_host_new, gl_index, gl_type))
    post_body = {'test': gl_int * 10000, 'text': {'content': 'id是es自动生成的'}}

    post_response = es_send_request('post', post_url, post_body)
    post_result = post_response.json()
    assert post_result.get('result') == 'created'

    # This _id was generated by Elasticsearch.
    generated_id = post_result.get('_id')
    Log.info(generated_id)

    test_get_doc(generated_id)


def test_create_doc(document_id):
    """
    Issue create-only index requests, first via ``op_type=create`` and then via the ``_create`` endpoint.

    :param document_id: id of the document to create
    :return: None
    """
    # To be sure a brand-new document is created rather than an existing one overwritten,
    # Elasticsearch must be told to accept the request only when no document with the same
    # _index, _type and _id exists yet.
    doc_root = [gl_es_host_new, gl_index, gl_type]
    payload = {'test': str(gl_int)}

    # 1. The op_type query parameter
    # PUT  /{index}/{type}/{id}?op_type=create
    op_type_url = '/'.join(doc_root + [document_id + '?op_type=create'])
    op_type_response = es_send_request('put', op_type_url, payload)
    Log.info(op_type_response.status_code)

    Log.info('********')

    # NOTE(review): one_choice_true_false is referenced, not called; if it is a function the
    # first branch is always taken - confirm against XXX.common_functions.
    if one_choice_true_false:
        target_id = document_id
    else:
        target_id = document_id + gl_id
    Log.info(target_id)

    # 2. The /_create suffix at the end of the URL
    # PUT /{index}/{type}/{id}/_create
    # POST /{index}/{type}/{id}/_create
    create_url = '/'.join(doc_root + [target_id, '_create'])
    for verb in ('put', 'post'):
        create_response = es_send_request(verb, create_url, payload)
        Log.info(create_response.status_code)

    # When a document with the same _index, _type and _id already exists, Elasticsearch answers 409 Conflict.
    # When the new document is created successfully, Elasticsearch answers 201 Created.


def test_get_doc(document_id: str):
    """
    Fetch one document by id and check that the HTTP status matches the ``found`` flag.

    :param document_id: id of the document to fetch
    :return: the response of the GET request
    """
    # A plain HTTP GET on the document address - index, type and id.
    # GET /{index}/{type}/{id}
    doc_url = '/'.join((gl_es_host_new, gl_index, gl_type, document_id))
    response = es_send_request('get', doc_url, {})

    Log.info('********')

    # Appending ?pretty turns on Elasticsearch's pretty-print, which makes the JSON body
    # easier to read (debugging aid). The URL is only built here; the request is not sent.
    # GET /{index}/{type}/{id}?pretty
    pretty_url = '/'.join((gl_es_host_new, gl_index, gl_type, document_id + '?pretty'))

    body = response.json()
    Log.info(body)

    # A successful lookup carries {"found": true}. For a missing document a JSON body is
    # still returned, but found is false and the status is 404 Not Found instead of 200 OK.
    if body.get('found') is False:
        expected_status = 404
    else:
        expected_status = 200
    assert response.status_code == expected_status

    return response


def test_get_doc2(document_id: str):
    """
    Fetch one document several times, each time with a different ``_source`` filtering option.

    :param document_id: id of the document to fetch
    :return: None
    """
    # By default a GET returns the whole document as stored in _source. A single field can be
    # requested through the _source parameter, several fields through a comma-separated list.
    doc_root = [gl_es_host_new, gl_index, gl_type]
    field_lists = ('seq', 'seq,text')

    # GET /{index}/{type}/{id}?_source=title,text
    # followed by the same requests using _source_includes
    for parameter in ('_source', '_source_includes'):
        for fields in field_lists:
            query = '?{}={}'.format(parameter, fields)
            es_send_request('get', '/'.join(doc_root + [document_id + query]), {})

    Log.info('********')

    # _source_excludes
    for fields in field_lists:
        query = '?_source_excludes={}'.format(fields)
        es_send_request('get', '/'.join(doc_root + [document_id + query]), {})

    Log.info('********')

    # The _source endpoint returns only the _source field, without any metadata.
    # GET /{index}/{type}/{id}/_source
    source_url = '/'.join(doc_root + [document_id, '_source'])
    es_send_request('get', source_url, {})


def test_doc_exists(document_id: str):
    """
    Check whether a document exists by sending a HEAD request and logging the status code.

    :param document_id: id of the document to probe
    :return: None
    """
    # HEAD instead of GET when only existence matters: there is no response body,
    # only the HTTP headers.
    target = '/'.join((gl_es_host_new, gl_index, gl_type, document_id))

    head_response = es_send_request('head', target, {})
    status = head_response.status_code
    Log.info(status)

    # Document exists          -> 200 OK
    # Document does not exist  -> 404 Not Found


def test_update_doc(document_id: str):
    """
    Replace a whole document and verify both the status code and the version bump.

    :param document_id: id of the document to replace (it is created when missing)
    :return: None
    """
    res = test_get_doc(document_id)
    _version = res.json().get('_version')
    Log.info(_version)

    # Replace the whole document
    # PUT /{index}/{type}/{id}

    url = '/'.join([gl_es_host_new, gl_index, gl_type, document_id])
    data_dict = {'abc': gl_int}
    res = es_send_request('put', url, data_dict, auth=gl_es_auth)
    Log.info(res.status_code)

    # id already exists  -> document is replaced, status_code is 200
    # id does not exist  -> document is created,  status_code is 201
    doc_existed = _version is not None

    # The expected status is computed first: `assert a == 200 if cond else 201` would parse as
    # `assert ((a == 200) if cond else 201)` and could never fail in the "else" case.
    expected_status = 200 if doc_existed else 201
    assert res.status_code == expected_status
    assert res.json().get('_version') == (_version if doc_existed else 0) + 1

    # Internally Elasticsearch marks the old document as deleted and adds a completely new one.
    # The old version is no longer accessible but does not vanish immediately; it is cleaned up
    # in the background as more data gets indexed.
    test_get_doc(document_id)


def test_partial_updates(document_id):
    """
    Partially update a document through both ``_update`` URL layouts and verify the status codes.

    :param document_id: id of the document to update
    :return: None
    """
    res = test_get_doc(document_id)
    _version = res.json().get('_version')
    Log.info(_version)

    # id exists          -> status_code is 200
    # id does not exist  -> nothing is updated, nothing is created, status_code is 404
    # Computed once up front: `assert a == 200 if cond else 404` would parse as
    # `assert ((a == 200) if cond else 404)` and could never fail in the "else" case.
    expected_status = 200 if _version is not None else 404

    # Partial update: the content of "doc" is merged into the existing document -
    # existing fields are overwritten, new fields are added.

    # POST /{index}/{type}/{id}/_update
    url = '/'.join([gl_es_host_new, gl_index, gl_type, document_id, '_update'])

    # Values for fields that already exist must keep the same type as before.
    data = {"doc": {random.choice(string.ascii_letters): 1, "test": str(gl_int)}}
    res = es_send_request('post', url, data)
    Log.info(res.status_code)

    assert res.status_code == expected_status

    Log.info('********')

    # POST /{index}/_update/{id}        (no {type} segment)
    url = '/'.join([gl_es_host_new, gl_index, '_update', document_id])

    # Values for fields that already exist must keep the same type as before.
    data = {"doc": {random.choice(string.ascii_letters): 1, "test": str(gl_int)}}

    res = es_send_request('post', url, data)
    Log.info(res.status_code)

    assert res.status_code == expected_status

    test_get_doc(document_id)


def test_bulk(document_id):
    """
    Run the bulk scenario twice: once against ``/_bulk`` with full metadata, once against
    ``/{index}/{type}/_bulk`` with only an ``_id`` in the metadata.

    The bulk API performs several create / index / update / delete actions in one request.
    Its body is newline-delimited::

        { action: { metadata }}\\n
        { request body        }\\n
        ...

    Every line, the last one included, must end with a newline, and no line may contain an
    unescaped newline. The action/metadata line says which operation is applied to which
    document (``_index``, ``_type``, ``_id``). The request body line holds the document
    ``_source`` for ``index`` and ``create``, the update API body (doc, upsert, script, ...)
    for ``update``, and is absent for ``delete``.

    :param document_id: base id used by the bulk actions
    :return: None
    """
    # https://www.elastic.co/guide/en/elasticsearch/reference/7.17/docs-bulk.html#docs-bulk-api-desc
    # https://www.elastic.co/guide/cn/elasticsearch/guide/current/bulk.html

    test_get_doc(document_id)

    # POST /_bulk  - index, type and id are all given in the metadata
    full_metadata = {'_index': gl_index, '_type': gl_type, '_id': document_id}
    root_bulk_url = '/'.join([gl_es_host_new, '_bulk'])
    bulk_action_body(metadata=full_metadata, url=root_bulk_url)

    test_get_doc(document_id)

    for _ in range(3):
        Log.info('********')

    # Index and type do not have to be repeated per action: the bulk URL may carry a default
    # /_index or /_index/_type.
    suffixed_id = document_id + '_id'
    test_get_doc(suffixed_id)

    # POST /{index}/{type}/_bulk
    id_only_metadata = {'_id': suffixed_id}
    scoped_bulk_url = '/'.join([gl_es_host_new, gl_index, gl_type, '_bulk'])
    bulk_action_body(metadata=id_only_metadata, url=scoped_bulk_url)

    test_get_doc(suffixed_id)


def bulk_action_body(metadata: dict, url: str):
    """
    Build create / index / update / delete bulk actions, send each one on its own, then send
    all of them again in one combined bulk request.

    The caller's ``metadata`` dict is left untouched; all changes are made on a private copy.

    :param metadata: action metadata; must contain an ``_id`` key (string value)
    :param url: bulk endpoint to post to
    :return: None
    :raises KeyError: if ``metadata`` has no ``_id`` key
    """
    # The whole bulk request is loaded into memory by the node that receives it, so the larger
    # the request, the less memory is left for other requests; there is an optimal size.

    # Private copy: the steps below pop and re-add '_id', which must not leak to the caller.
    meta = dict(metadata)
    Log.info(meta)

    random_int = random.randint(-999, -1)
    id_old = meta.get('_id')
    Log.info(f'{random_int}, {id_old}')

    Log.info('********')

    # Each element is a triple: (action line, request body or None, id to read back or None).
    create = {'create': meta.copy()}
    c_body = {'test': random_int * 1, 'text': {'content': f'id是{id_old}'}}
    create_ele = create, c_body, id_old

    Log.info(meta)
    Log.info(create_ele)

    Log.info('********')

    # index without an id (one will be generated automatically)
    meta.pop('_id')

    index_1 = {'index': meta.copy()}
    i_body_1 = {'test': random_int * 100, 'text': {'content': 'id是es生成的'}}
    index_ele_1 = index_1, i_body_1, None

    Log.info(meta)
    Log.info(index_ele_1)

    # index with an explicit id: the old id plus one random letter
    id_new = id_old + random.choice(string.ascii_letters)
    meta.update(_id=id_new)

    index_2 = {'index': meta.copy()}
    i_body_2 = {'test': random_int * 10000, 'text': {'content': f'id是{id_new}，多了个字母'}}
    index_ele_2 = index_2, i_body_2, id_new

    Log.info(meta)
    Log.info(index_ele_2)

    Log.info('********')

    update = {'update': meta.copy()}
    u_body = {'doc': {'test': random_int * 0.01, 'new': 'update的新值'}}
    update_ele = update, u_body, id_new

    Log.info(meta)
    Log.info(update_ele)

    Log.info('********')

    delete = {'delete': meta.copy()}
    d_body = None  # a delete action has no request body line
    delete_ele = delete, d_body, id_new

    Log.info(meta)
    Log.info(delete_ele)

    Log.info('********')

    # delete is deliberately placed before update (the natural order would end with delete).
    a_b_id = create_ele, index_ele_1, index_ele_2, delete_ele, update_ele

    sent_chunks = []

    for action_, body_, id_ in a_b_id:
        # One bulk request per action.
        if body_:
            request_data = '{}\n{}\n'.format(json.dumps(action_), json.dumps(body_))
        else:
            request_data = '{}\n'.format(json.dumps(action_))

        Log.info(request_data)

        bulk_send_request(req_data=request_data, req_url=url)

        if id_:
            test_get_doc(id_)

        sent_chunks.append(request_data)

    # One bulk request carrying every action.
    all_data = ''.join(sent_chunks)
    Log.info(all_data)

    # Sub-requests are executed independently, so a failing one does not affect the others.
    # If any sub-request fails, the top-level error flag is set to true and the details are
    # reported under the corresponding item.
    bulk_send_request(req_data=all_data, req_url=url)

    test_get_doc(id_old)
    test_get_doc(id_new)


def bulk_send_request(req_data: str, req_url: str):
    """
    POST a newline-delimited bulk payload and log the full exchange.

    :param req_data: NDJSON text to send as the request body
    :param req_url: bulk endpoint URL
    :return: None
    """
    # When sending NDJSON data to the _bulk endpoint, use a Content-Type header of
    # application/json or application/x-ndjson.
    headers = {'Content-Type': 'application/json;charset=utf-8'}

    response = requests.post(req_url, data=req_data, auth=gl_es_auth, headers=headers)

    exchange = dump.dump_all(response)
    Log.info(exchange.decode('utf-8'))

    time.sleep(1)

    Log.error('🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀')


def test_delete_doc(document_id: str):
    """
    Delete one document by id and log the status code.

    :param document_id: id of the document to delete
    :return: None
    """
    # DELETE /{index}/{type}/{id}
    target = '/'.join((gl_es_host_new, gl_index, gl_type, document_id))

    delete_response = es_send_request('delete', target, {})

    # Document found     -> 200 OK
    # Document not found -> 404 Not Found
    status = delete_response.status_code
    Log.info(status)


def create_source(docs_list: list):
    """
    Return a deep copy of ``docs_list`` in which every entry gets a randomly chosen ``_source`` filter.

    :param docs_list: list of mget ``docs`` entries (dicts); not modified
    :return: the deep-copied list with ``_source`` set on each entry
    """
    Log.info(docs_list)

    # One or more specific fields can be retrieved by naming them in the _source parameter.

    # By default, the _source field is returned for every document (if stored). Use the _source and _source_include or source_exclude attributes to filter what fields are returned for a particular document.
    # You can include the _source, _source_includes, and _source_excludes query parameters in the request URI to specify the defaults to use when there are no per-document instructions.
    filtered_docs = copy.deepcopy(docs_list)

    for entry in filtered_docs:
        # Two random bits select one of four _source shapes.
        variant = random.getrandbits(2)

        if variant == 0:
            # list of field names
            source_filter = random.sample(['abc0', 'msgType', 'text'], k=2)
        elif variant == 1:
            # single field name
            source_filter = 'seq'
        elif variant == 2:
            # include-only object
            source_filter = {'include': ['abc2', 'text', 'msgTime']}
        else:
            # include + exclude object
            source_filter = {'include': ['abc3', 'text', 'msgId'], 'exclude': ['text.content']}

        entry['_source'] = source_filter

    Log.info(filtered_docs)

    return filtered_docs


def test_multi_get(document_id_list: list):
    """
    Retrieve several documents at once through the three ``_mget`` URL layouts.

    :param document_id_list: ids of the documents to fetch
    :return: None
    """
    # The mget API takes a docs array; each element holds the metadata of one document to
    # retrieve: _index, _type and _id.

    # GET /_mget
    root_url = '/'.join([gl_es_host_new, '_mget'])
    root_docs = [{'_index': gl_index, '_type': gl_type, '_id': doc_id} for doc_id in document_id_list]
    root_plain = {'docs': root_docs}
    root_filtered = {'docs': create_source(root_docs)}

    Log.info('********')

    # When all documents live in the same _index (or even the same _type), a default
    # /_index or /_index/_type can be given in the URL instead.

    # GET /{index}/_mget
    index_url = '/'.join([gl_es_host_new, gl_index, '_mget'])
    index_docs = [{'_id': doc_id, '_type': gl_type} for doc_id in document_id_list]
    index_plain = {'docs': index_docs}
    index_filtered = {'docs': create_source(index_docs)}

    # GET /{index}/{type}/_mget
    type_url = '/'.join([gl_es_host_new, gl_index, gl_type, '_mget'])
    type_docs = [{'_id': doc_id} for doc_id in document_id_list]
    type_plain = {'docs': type_docs}
    type_filtered = {'docs': create_source(type_docs)}

    Log.info('********')

    # With identical _index and _type for every document, a bare ids array may replace the docs array.
    type_ids_only = {'ids': document_id_list}

    Log.info('********')

    planned_requests = (
        (root_url, root_plain),
        (root_url, root_filtered),
        (index_url, index_plain),
        (index_url, index_filtered),
        (type_url, type_plain),
        (type_url, type_filtered),
        (type_url, type_ids_only),
    )

    for target_url, payload in planned_requests:
        # The response array holds one entry per requested document, in request order; each
        # entry equals the body a single get request would return.
        es_send_request('get', target_url, payload)

        time.sleep(1)


def test_count():
    """
    Call the ``_count`` endpoint of the index/type with an empty JSON body.

    :return: None
    """
    # POST /{index}/{type}/_count
    count_url = '/'.join((gl_es_host_new, gl_index, gl_type, '_count'))
    empty_query = {}

    es_send_request('post', count_url, empty_query)


def test_mapping():
    """
    Request the mapping of the index, first without and then with the type in the URL.

    :return: None
    """
    # /_mapping shows the mappings Elasticsearch holds for one or more types in one or more indices.
    mapping_root = [gl_es_host_new, gl_index, '_mapping']

    # GET /{index}/_mapping
    es_send_request('get', '/'.join(mapping_root), {})

    # GET /{index}/_mapping/{type}
    typed_url = '/'.join(mapping_root + [gl_type])
    query_params = {'include_type_name': 'true'}
    es_send_request('get', typed_url, {}, params=query_params)


if __name__ == '__main__':
    # Mapping       describes how data is stored in each field
    # Analysis      how full text is processed so that it becomes searchable
    # Query DSL     the powerful, flexible domain-specific query language of Elasticsearch

    # Log the two values that were randomized at import time.
    Log.info(gl_int)
    Log.info(gl_id)