Update data from files into Elasticsearch (delete the old documents first, then insert the new ones).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import logging
import os
import re
import time

from logging import handlers
from elasticsearch import Elasticsearch, helpers

# --- Interactive configuration -------------------------------------------
# All settings are read from stdin so the script can be pointed at any
# export directory / ES cluster without editing the source.
syn_file_path = input("Please enter the address of the synchronization file, example('/data/txtjson/fmsq-update): ")
syn_method_name = input("Please enter a name for the synchronization method, example('FMSQ')for logging: ")
syn_es_ip = input("Please enter the IP of ES to be synchronized, example('221.194.47.208')")
syn_es_index_name = input("Please enter the name of the ES library to be synchronized, example('index_china'): ")
syn_es_count_once = input("Please enter the number of times to synchronize, example('1000'): ")

# Bulk batch size: fall back to 500 when the input is not a positive integer.
if not syn_es_count_once.isdigit():
    syn_es_count_once = 500
else:
    syn_es_count_once = int(syn_es_count_once)

es = Elasticsearch(syn_es_ip, port=9200)
del_count = 0  # global counter of delete_by_query calls (see delete_by_id)

# Per-run log directory. exist_ok=True removes the race between the
# exists() check and makedirs() that the original two-step version had.
log_file = f'./log/2022_04_08/{syn_method_name}'
check_file = os.path.exists(log_file)
if check_file:
    print("log file is exit, begin syn txt to es============================>")
os.makedirs(log_file, exist_ok=True)


class Logger(object):
    """Thin wrapper that equips a named logger with a console handler and a
    daily-rotating file handler.

    The wrapped logger is exposed as ``self.logger``.  Because
    ``logging.getLogger(filename)`` returns the same cached logger for the
    same name, handler attachment is guarded so that constructing this class
    twice with one filename does not duplicate every log line.
    """

    # Mapping from user-friendly level names to logging module constants.
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, filename, level='info', when='D', backCount=3,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        """Create/fetch the logger named *filename* and attach handlers.

        :param filename: log file path (also used as the logger name)
        :param level: one of the keys in ``level_relations``
        :param when: rotation interval unit for TimedRotatingFileHandler
        :param backCount: number of rotated files to keep
        :param fmt: log record format string
        """
        self.logger = logging.getLogger(filename)
        self.logger.setLevel(self.level_relations.get(level))
        # Only attach handlers once per logger name; a second Logger(...) for
        # the same file would otherwise emit every record twice.
        if not self.logger.handlers:
            format_str = logging.Formatter(fmt)
            sh = logging.StreamHandler()
            sh.setFormatter(format_str)
            th = handlers.TimedRotatingFileHandler(filename=filename, when=when, backupCount=backCount,
                                                   encoding='utf-8')
            th.setFormatter(format_str)
            self.logger.addHandler(sh)
            self.logger.addHandler(th)


log_all_path = f'{log_file}/syn_mongo_to_es_all.log'
log_error_path = f'{log_file}/syn_mongo_to_es_error.log'
log = Logger(log_all_path, level='debug')
logError = Logger(log_error_path, level='error')


# Convert a camelCase key into snake_case.
def camel_to_snake(string):
    """Return *string* converted from camelCase to snake_case.

    Runs of digits also receive a leading underscore (e.g. ``abc123`` ->
    ``abc_123``); the result is fully lower-cased.
    """
    # First split "wordBoundary" and "digits" cases, then any remaining
    # lower/upper adjacency, and lower-case the whole thing.
    for boundary in ('(.)([A-Z][a-z]+)', '(.)([0-9]+)'):
        string = re.sub(boundary, r'\1_\2', string)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', string).lower()


# Re-pack a record dict with normalised keys.
def init_lst(lst):
    """Return a copy of *lst* where ``_id`` is stringified and every other
    key is converted to snake_case."""
    return {
        (key if key == '_id' else camel_to_snake(key)):
        (str(value) if key == '_id' else value)
        for key, value in lst.items()
    }


def delete_by_lst(lst, file_num):
    """Delete every ``open_no`` in *lst* from ES, one query per value.

    :param lst: iterable of open_no values collected from the current file
    :param file_num: ordinal of the file being processed (for log messages)
    """
    for open_no_value in lst:
        delete_by_id(open_no_value, file_num)


def syn_txt_to_es_v2():
    """Sync every JSON file under ``syn_file_path`` into Elasticsearch.

    For each regular file in the directory: load a JSON list of records,
    normalise keys via ``init_lst``, delete existing ES documents matching
    each record's ``open_no``, then bulk-insert the new documents in batches
    of ``syn_es_count_once``.  Progress goes to ``log``; failures to
    ``logError``.  Reads the module-level configuration globals; returns
    nothing.
    """
    log.logger.info(f"syn_file_path: {syn_file_path}")
    log.logger.info(f"syn_method_name: {syn_method_name}")
    log.logger.info(f"syn_es_ip: {syn_es_ip}")
    log.logger.info(f"syn_es_index_name: {syn_es_index_name}")

    # Counters: files seen / records assembled across all files.
    count = 0
    data_count = 0
    delete_open_on_list = []
    data_insert_list = []

    # check user input
    if syn_method_name == '' or syn_es_ip == '' or syn_es_index_name == '':
        logError.logger.error("The parameter cannot be empty")
        return
    if syn_file_path == '':
        logError.logger.error("path error...")
        return

    # Iterate over every entry of the sync directory (non-recursive).
    files = os.listdir(syn_file_path)
    for file_name in files:
        # Per-file state: raw records and the pending bulk-insert batch.
        data_list = []
        sync_data_lst = []
        count += 1

        file_path = os.path.join(syn_file_path, file_name)
        log.logger.info("file_path: " + file_path)
        # Check whether it is a file (sub-directories are skipped with an error log).
        if not os.path.isdir(file_path):
            log.logger.info(f"read {count} file:{file_name} begin  ============================>")
            # Read file data and save to data_list, For assembling data.
            # utf_8_sig tolerates a BOM at the start of exported files.
            with open(file_path, "r", encoding="utf_8_sig") as f:
                load_dict = json.load(f)
                load_type = type(load_dict)
                if load_type.__name__ == 'list':
                    for load in load_dict:
                        data_list.append(load)
                else:
                    logError.logger.error("Error file data, please check file file data...")

            log.logger.info(f"The {count} files start assembling data============================>")
            for i in data_list:
                data_count += 1
                record = {}
                i = init_lst(i)
                # Extract the fields we want from the normalised record.
                open_no = i.get("open_no")
                delete_open_on_list.append(open_no)
                # Semicolon-separated multi-value fields become lists.
                applicant = i.get("applicant")
                if applicant is not None:
                    record['applicant'] = applicant.split(";")
                inventor = i.get("inventor")
                if inventor is not None:
                    record['inventor'] = inventor.split(";")
                ipc = i.get("ipc")
                if ipc is not None:
                    record['ipc'] = ipc.split(";")
                record['_id'] = i.get("_id")
                record['app_no'] = i.get('app_no')
                record['app_date'] = i.get('app_date')
                record['app_country'] = i.get('app_country')
                record['open_no'] = i.get('open_no')
                record['open_date'] = i.get('open_date')
                record['first_applicant'] = i.get('first_applicant')
                record['applicant_address'] = i.get('applicant_address')
                record['first_inventor'] = i.get('first_inventor')
                record['title'] = i.get('title')
                record['abstract'] = i.get('abstract')
                record['abstract_translation'] = i.get('abstract_translation')
                record['first_claim'] = i.get('first_claim')
                record['main_ipc'] = i.get('main_ipc')
                record['main_ipc_depart'] = i.get('main_ipc_depart')
                record['main_ipc_category'] = i.get('main_ipc_category')
                record['main_ipc_class'] = i.get('main_ipc_class')
                record['main_ipc_large_group'] = i.get('main_ipc_large_group')
                record['main_ipc_small_group'] = i.get('main_ipc_small_group')
                record['agent'] = i.get('agent')
                record['agent_organ'] = i.get('agent_organ')
                record['unagent'] = i.get('unagent')
                record['unagent_organ'] = i.get('unagent_organ')
                record['priority'] = i.get('priority')
                record['title_fr'] = i.get('title_fr')
                record['title_de'] = i.get('title_de')
                record['abstract_er'] = i.get('abstract_er')
                record['abstract_de'] = i.get('abstract_de')
                record['country_code'] = i.get('country_code')
                record['title_cn'] = i.get('title_cn')
                record['main_law_status'] = i.get('main_law_status')
                record['sub_law_status'] = i.get('sub_law_status')
                record['postal_code'] = i.get('postal_code')
                record['i_application'] = i.get('i_application')
                record['i_publication'] = i.get('i_publication')
                record['date_entry'] = i.get('date_entry')
                record['db_name'] = syn_method_name
                data_insert_list.append(record)
                if data_count % 1000 == 0:
                    log.logger.info(
                        f"The {count} file start assembling data open_on: {open_no}  count: {data_count} finish============================>")
            log.logger.info(f"The {count} file already end assembling data ============================>")
            # Delete-then-insert: purge old documents for these open_no values
            # before bulk-indexing the freshly assembled records.
            if len(delete_open_on_list):
                log.logger.info(f"The {count} file delete old begin============================>")
                delete_by_lst(delete_open_on_list, count)
                delete_open_on_list = []
                log.logger.info(f"The {count} file delete old finish============================>")

                for record in data_insert_list:
                    # Drop bulky fields that are not indexed (pop with default
                    # so missing keys are fine).
                    record.pop('url', '')
                    record.pop('description', '')
                    record.pop('claim', '')
                    id_value = record.pop('_id')
                    sync_data_lst.append({
                        "_index": syn_es_index_name,  # mongodb database name == ES index name
                        "_id": id_value,
                        "_source": record,
                    })
                    if len(sync_data_lst) == syn_es_count_once:
                        helpers.bulk(es, sync_data_lst)
                        sync_data_lst[:] = []
                        log.logger.info(f"{syn_method_name} sync {count} records to elasticsearch")
            # Flush the final partial batch for this file.
            if sync_data_lst:
                helpers.bulk(es, sync_data_lst)
                log.logger.info(f"{syn_method_name} sync {count} records to elasticsearch")
            log.logger.info(f"Success {count} sync {syn_method_name} finish============================>")
            data_insert_list = []

        else:
            logError.logger.error("Error reading file, please check file path format...")
    log.logger.info(f"All File Success {count} sync {syn_method_name} finish============================>")


def delete_by_id(open_no, file_num):
    """Delete all ES documents whose ``open_no`` matches, with one retry.

    Attempts ``delete_by_query`` up to twice (3 s pause after a failure);
    logs an error and gives up if both attempts fail.  Increments the
    module-level ``del_count`` counter once per call.

    :param open_no: publication number to purge from the index
    :param file_num: ordinal of the file being processed (for log messages)
    """
    global del_count
    del_count += 1
    body = {"query": {"match": {"open_no": open_no}}}
    temp_count = 0
    while temp_count < 2:
        temp_count += 1
        try:
            es.delete_by_query(index=syn_es_index_name, body=body)
            if del_count % 1000 == 0:
                log.logger.info(f"The {file_num} file es delete, open_no: {open_no} delete {del_count} end")
            return
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still abort the run instead of being swallowed by the retry loop.
        except Exception:
            log.logger.info(f"The {file_num} file es delete, open_no: {open_no} delete {del_count} is error")
            time.sleep(3)
    logError.logger.error(f"The {file_num} file es delete, open_no: {open_no} delete {del_count} is error...")


if __name__ == '__main__':
    syn_txt_to_es_v2()


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值