实战：Elastic Search+python中的json至少要掌握的几个操作

最新推荐文章于 2024-09-12 07:50:44 发布

skywuuuu

最新推荐文章于 2024-09-12 07:50:44 发布

阅读量593

点赞数 1

分类专栏：实习生涯文章标签： python elasticsearch json

本文链接：https://blog.csdn.net/skywuuu/article/details/116052927

版权

实习生涯专栏收录该内容

15 篇文章 0 订阅

订阅专栏

第一次使用遇到的一些问题

关于kibana

端口不是连5601,5601是kibana的，9200才是elasticsearch本身的端口
如何查找某个属性

POST company_attributes/_search
{}

如何删除某个属性（这里删除的是整个索引；小心驶得万年船，千万别误删了）

DELETE test

关于json

记得先导入包

import json

json.load()
从文件对象中读取json并转为python中的字典，配合open可以读入json文件（如果要转换的是json格式的字符串，请用json.loads()）
json.dump()
将python中的字典以json格式写入文件对象，配合open可以写出json文件（如果要得到json格式的字符串，请用json.dumps()）

关于elastic search

参考官方文档，搜索Elasticsearch

连接

es = Elasticsearch(['192.168.1.25:9200']
                   # sniff before doing anything
                   # sniff_on_start=True,
                   # refresh nodes after a node fails to respond
                   # sniff_on_connection_fail=True,
                   # and also every 60 seconds
                   # sniffer_timeout=60
                   )  # connect to the Elasticsearch cluster; the list may hold several host addresses. The commented-out options above were copied from the official docs and can be enabled if needed
print(es.ping()) # prints the result of es.ping(); per the author it is True when the connection succeeded

创建索引

es.indices.create(index=index, ignore=400) # create the index; per the author, 400 is the error for an index that already exists, and ignore=400 suppresses it

增加（插入）

es.create(index=index, doc_type='doc', id=ID, body=subdata, ignore=409) # insert a document; per the author, 409 is the error for an id that already exists, and ignore=409 suppresses it

删除

result = es.delete(index=index, doc_type='doc', id=123) # delete the document with the given id; print `result` to inspect the response
# print(result)

改变（更新）

# NOTE(review): the Update API usually expects the partial document wrapped as
# {'doc': {...}} -- confirm that `data` has that shape before relying on this call.
es.update(index=index, doc_type='doc', body=data, id=1) # update the document with the given id using the contents of `data`

查询

es.search() # see the client documentation for the query parameters; the author has not used this call

实战代码

将所需的json文件导入elastic search的数据库中

from elasticsearch import Elasticsearch
import os
import json
import random
import re


def get_ids(data_collection):
    """Extract the 'entityId' of every record and strip it from the record.

    Args:
        data_collection: iterable of dicts, each holding an 'entityId' key.

    Returns:
        A list with the 'entityId' values, in the same order as the records.

    Raises:
        KeyError: if a record has no 'entityId' key.

    Note:
        The records are modified in place: after the call none of them
        contains 'entityId' any more. The value is meant to be used as the
        Elasticsearch document id rather than stored in the document body.
    """
    # dict.pop returns the value and removes the key in a single step.
    return [record.pop('entityId') for record in data_collection]


def get_file_name(base_path):
    """Collect the paths of all JSON files found under *base_path*.

    The directory tree is walked recursively. Directory names and file names
    are visited in sorted order, so the returned list is deterministic; this
    matters because the caller picks files out of the result by position.

    Args:
        base_path: root directory to search.

    Returns:
        A list of paths (directory joined with file name) for every file
        whose name ends with 'json'. Empty if nothing matches or the
        directory does not exist.
    """
    file_collection = []
    for current_dir, sub_dirs, files in os.walk(base_path):
        # os.walk yields names in arbitrary (filesystem) order. Sorting
        # sub_dirs in place also fixes the order in which they are descended.
        sub_dirs.sort()
        for file_name in sorted(files):
            if file_name.endswith('json'):  # keep only files with a json suffix
                file_collection.append(os.path.join(current_dir, file_name))
    return file_collection


# Matches the run of CJK characters that sits between a full-width ")" and "市";
# compiled once because it is applied to every record.
_CITY_PATTERN = re.compile(r'(?<=[）])[\u4e00-\u9fa5]+(?=[市])')


def _extract_city(address):
    """Return the first city name found in *address*, or '' when none matches."""
    match = _CITY_PATTERN.search(address)
    return match.group(0) if match else ''


def _company_record(company):
    """Build the company document to be indexed from one raw company entry."""
    location = _extract_city(company['address'])
    return {'entityId': company['_id'],
            'companyName': company['companyName'],
            'companyType': company['companyType'],
            'createDate': company['createDate'],
            'representative': company['representative'],
            'mainBusiness': company['mainBusiness'],
            'industry': company['industry'],
            'address': company['address'],
            'marketValue': company['marketValue'],
            'alias': company['alias'],
            'value': 0,
            'location': location
            }


def _person_record(person):
    """Build the person document to be indexed from one raw person entry."""
    location = _extract_city(person['address'])
    return {'entityId': person['entityId'],
            'name': person['name'],
            'wiki_url': person['url'],
            'FB_url': '',
            'gender': person['gender'],
            'alias': person['alias'],
            'birth': person['birth'],
            'death': person['death'],
            'address': person['address'],
            'education_background': '',
            'work_background': '',
            'profession': person['profession'],
            'party': person['party'],
            'location': location,
            'introduction': '',
            'event': ''
            }


# Position of a file in the input list -> function that converts its entries.
# Only positions 0 (companies) and 3 (persons) are converted.
_RECORD_BUILDERS = {0: _company_record, 3: _person_record}


def read_json_file(file_collection):
    """Load every JSON file and convert the supported ones into documents.

    Args:
        file_collection: list of paths to UTF-8 encoded JSON files. The file
            at position 0 is treated as a list of companies and the file at
            position 3 as a list of persons.

    Returns:
        A list with one entry per input file, in input order. Entries for
        positions 0 and 3 are lists of document dicts; every other entry is
        an empty list.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a file is not valid JSON.
        KeyError: if a company/person entry lacks one of the expected keys.

    Note:
        Every file is opened and parsed, including those whose content is
        not converted, so an unreadable or malformed file is reported no
        matter which position it occupies.
    """
    data_collection = []
    for position, path in enumerate(file_collection):
        with open(path, 'r', encoding='utf-8') as f:
            raw_entries = json.load(f)
        build = _RECORD_BUILDERS.get(position)
        if build is None:
            records = []
        else:
            records = [build(entry) for entry in raw_entries]
        data_collection.append(records)
    return data_collection


# Small convenience wrapper; a file-writing variant based on json.dump could be added later.
def dict2json(Dict):
    """Serialize *Dict* to a JSON-formatted string using json.dumps defaults."""
    serialized = json.dumps(Dict)
    return serialized


if __name__ == '__main__':
    # Names of the two target indices, in the same order as the datasets
    # selected further down (companies first, persons second).
    indices = ['company_attributes', 'person_attributes']

    # Connect to the Elasticsearch node (note: port 9200). The list may hold
    # several host addresses; sniffing options such as sniff_on_start,
    # sniff_on_connection_fail and sniffer_timeout can be passed here as well.
    es = Elasticsearch(['192.168.1.25:9200'])
    print(es.ping())  # per the author, True when the connection succeeded

    base_path = r'分析平台系统研发\entity'
    # Expected files: 0: aAttributes.json, 1: bAttributes.json,
    # 2: cAttributes.json, 3: dAttributes.json, 4: eAttributes.json
    file_collection = get_file_name(base_path)
    parsed = read_json_file(file_collection)
    # Only positions 0 and 3 carry converted data; the other entries are
    # empty lists and are dropped here.
    data_collection = [parsed[0], parsed[3]]

    # Create the indices first; ignore=400 suppresses the error the author
    # associates with an index that already exists.
    for index in indices:
        es.indices.create(index=index, ignore=400)

    for index, data in zip(indices, data_collection):
        # get_ids removes 'entityId' from every record before the records
        # are paired with their ids and sent to Elasticsearch.
        for ID, subdata in zip(get_ids(data), data):
            print(index, ' ', ID, ' ', subdata)
            # ignore=409 suppresses the error the author associates with an
            # id that already exists.
            es.create(index=index, doc_type='doc', id=ID, body=subdata, ignore=409)