同步txt中的数据到mongo中

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
import datetime
from logging import handlers

from elasticsearch import Elasticsearch, helpers
from pymongo import MongoClient

# Interactive configuration: every setting is read from stdin as a string.
# Empty answers are replaced by fallbacks / defaults further down the script.
# --- MongoDB source ---
syn_mongo_url = input("Please enter Mongo url to sync, example('192.168.0.234:27017'): ")
syn_mongo_username = input("Please enter Mongo username to sync, example('admin'): ")
syn_mongo_password = input("Please enter Mongo password to sync, example('123456'): ")
syn_mongo_DB = input("Please enter DB name to synchronize from Mongo, example('FMSQ'): ")
syn_mongo_COLLECTION = input("Please enter the DOCUMENT name to synchronize to Mongo, example('patents'): ")

# --- Elasticsearch target ---
syn_es_ip = input("Please enter the IP of ES to be synchronized, example('221.194.47.208')")
# Total amount to sync (1 = sync everything, 2 = sync at most 10,000 records per day)
syn_count_total = input("Please enter the maximum number of syncs per day('1 all, 2 1w'): ")
# Number of documents sent to Elasticsearch per bulk request
syn_count_one_time = input("Please enter the number of syn at a time, example('5000'): ")
syn_es_name = input("Please enter the name of the ES library to be synchronized, example('index_chia'): ")


# Testing fallbacks: an empty answer at the prompt selects the built-in value.
# The Mongo username/password intentionally get no fallback here; they stay
# empty when nothing was entered.
syn_mongo_url = syn_mongo_url or '221.194.47.208:26006'
syn_mongo_DB = syn_mongo_DB or 'FMSQ'
syn_mongo_COLLECTION = syn_mongo_COLLECTION or 'patents'
syn_es_ip = syn_es_ip or '221.194.47.208'
syn_es_name = syn_es_name or 'index_china'

# Default values: turn the two numeric answers (still strings at this point)
# into integers.
#
# str.isdecimal() is used rather than str.isdigit(): isdigit() also returns
# True for characters such as '²' that int() cannot parse, which would crash
# the script with ValueError instead of falling back to the default.
#
# syn_count_total: 1 means "sync everything"; any other number selects the
# capped mode of 10000 records; a non-numeric answer falls back to 1.
if syn_count_total.isdecimal() and int(syn_count_total) != 1:
    syn_count_total = 10000
else:
    syn_count_total = 1

# syn_count_one_time: bulk batch size, 5000 when the answer is not a number.
if syn_count_one_time.isdecimal():
    syn_count_one_time = int(syn_count_one_time)
else:
    syn_count_one_time = 5000

# Per-database log directory (despite the name, `log_file` is a directory path).
log_file = f'./log/syn_mongo_to_es_by_open_date/{syn_mongo_DB}'
check_file = os.path.exists(log_file)
if not check_file:
    # First run for this database: create the whole directory chain.
    os.makedirs(log_file)
else:
    print("log file is exit, begin syn mongo to es============================>")


class Logger(object):
    """Thin wrapper that configures a named ``logging`` logger.

    The underlying logger (``self.logger``) writes every record both to the
    console (``StreamHandler``) and to ``filename`` through a
    ``TimedRotatingFileHandler``.

    Args:
        filename: Path of the log file; also used as the logger name.
        level: One of the keys of ``level_relations``. An unknown key makes
            ``setLevel`` receive ``None``, which ``logging`` rejects.
        when: Rotation interval unit passed to ``TimedRotatingFileHandler``.
        backCount: Number of rotated files to keep (``backupCount``).
        fmt: ``logging.Formatter`` format string used by both handlers.
    """

    # Maps the short level names accepted by __init__ to logging constants.
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, filename, level='info', when='D', backCount=3,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        self.logger = logging.getLogger(filename)
        self.logger.setLevel(self.level_relations.get(level))

        # logging.getLogger() returns the same object for the same name, so
        # building a second Logger for one file would attach a second pair of
        # handlers and every record would be emitted twice. Attach handlers
        # only when the logger has none yet.
        if self.logger.handlers:
            return

        format_str = logging.Formatter(fmt)
        sh = logging.StreamHandler()
        sh.setFormatter(format_str)
        th = handlers.TimedRotatingFileHandler(filename=filename, when=when, backupCount=backCount, encoding='utf-8')
        th.setFormatter(format_str)

        self.logger.addHandler(sh)
        self.logger.addHandler(th)


# Log files live inside the per-database directory built above.
log_all_path = f'{log_file}/syn_mongo_to_es_all.log'
log_error_path = f'{log_file}/syn_mongo_to_es_error.log'
# Main logger used by the rest of the script (debug level and above).
log = Logger(log_all_path, level='debug')
# Builds the error-level logger and immediately emits a single "error" record
# through it; the Logger instance itself is not kept.
Logger(log_error_path, level='error').logger.error("error")


# 本地mongo
# mongo_url = '192.168.0.234:27017'
# USERNAME = 'admin'
# PASSWORD = '123456'
# DB_name = 'FMSQ'


# Convert a camelCase / PascalCase name to snake_case.
def camel_to_snake(string):
    """Return ``string`` converted to lower-case snake_case.

    Three substitutions are applied in order, each inserting ``_`` between
    the two captured groups, and the result is lower-cased:

    1. before a capital letter that is followed by lower-case letters,
    2. before a run of digits,
    3. between a lower-case letter or digit and a capital letter.
    """
    boundary_patterns = (
        '(.)([A-Z][a-z]+)',
        '(.)([0-9]+)',
        '([a-z0-9])([A-Z])',
    )
    converted = string
    for boundary in boundary_patterns:
        converted = re.sub(boundary, r'\1_\2', converted)
    return converted.lower()


# Re-wrap one Mongo document (a mapping, despite the parameter name).
def init_lst(lst):
    """Return a copy of ``lst`` with snake_case keys.

    The ``_id`` entry keeps its key and has its value converted with
    ``str()``; every other key is passed through ``camel_to_snake`` and its
    value is copied unchanged.
    """
    return {
        (key if key == '_id' else camel_to_snake(key)):
            (str(value) if key == '_id' else value)
        for key, value in lst.items()
    }


# Connect to MongoDB. Credentials are passed only when BOTH the username and
# the password were entered; otherwise the connection is made without them.
_mongo_auth = {}
if syn_mongo_username != '' and syn_mongo_password != '':
    _mongo_auth = {'username': syn_mongo_username, 'password': syn_mongo_password}
client = MongoClient(syn_mongo_url, **_mongo_auth)


def syn_mongo_to_es():
    """Sync the configured Mongo collection to Elasticsearch, one ``OpenDate`` per pass.

    Progress is checkpointed in ``./syn_open_date/main.txt``, which holds one
    ``<db name> <open date as YYYY.MM.DD>`` line per database. Each pass of
    the outer loop:

    * copies the checkpoint file to ``main_copy.txt`` line by line;
    * for the first line belonging to ``syn_mongo_DB``, queries Mongo for the
      documents whose ``OpenDate`` equals the checkpoint date, hands them to
      ``init_es_data`` and advances the checkpoint by one day;
    * appends ``<db> <date> <count>`` to ``<db>_data.txt`` for non-empty days;
    * replaces ``main.txt`` with the copy.

    When no line exists for the database, one is created starting at
    ``1985.09.10``. The loop stops once the checkpoint date reaches today.

    Raises:
        ValueError: if the checkpoint date of this database is not in
            ``YYYY.MM.DD`` format.
    """
    log.logger.info(f"syn_mongo_url: {syn_mongo_url}")
    log.logger.info(f"syn_mongo_username: {syn_mongo_username}")
    # Never write the password itself to the log files; only show whether
    # one was supplied.
    log.logger.info(f"syn_mongo_password: {'******' if syn_mongo_password else ''}")
    log.logger.info(f"syn_mongo_DB: {syn_mongo_DB}")
    log.logger.info(f"syn_mongo_COLLECTION: {syn_mongo_COLLECTION}")
    log.logger.info(f"syn_es_ip: {syn_es_ip}")
    log.logger.info(f"syn_count_total: {syn_count_total}")
    log.logger.info(f"syn_count_one_time: {syn_count_one_time}")
    if syn_es_name == '':
        log.logger.error("syn_es_name not allowed to be empty")
        return

    # Per-pass state: 0 = no checkpoint line found for this DB (yet),
    # 1 = one day was synced during this pass, 2 = checkpoint reached today.
    state = 0
    # Date the checkpoint starts from when the DB has no line yet
    initial_open_date = '1985.09.10'

    checkpoint_dir = './syn_open_date'
    main_path = f'{checkpoint_dir}/main.txt'
    copy_path = f'{checkpoint_dir}/main_copy.txt'
    data_path = f'{checkpoint_dir}/{syn_mongo_DB}_data.txt'

    # A fresh checkout has neither the directory nor the checkpoint file;
    # start from an empty checkpoint instead of failing with FileNotFoundError.
    os.makedirs(checkpoint_dir, exist_ok=True)
    if not os.path.exists(main_path):
        with open(main_path, mode='a', encoding='utf-8'):
            pass

    # Read from the collection the user selected (previously hard-coded to
    # 'patents', which silently ignored syn_mongo_COLLECTION).
    db = client[syn_mongo_DB]
    collection = db[syn_mongo_COLLECTION]

    es = Elasticsearch(syn_es_ip, port=9200)
    try:
        while True:
            with open(main_path, mode="r", encoding='utf-8') as f1, \
                    open(copy_path, mode='w', encoding="utf-8") as f2, \
                    open(data_path, mode="a+", encoding="utf-8") as f3:
                for line in f1:
                    line = line.strip()
                    lst = line.split()
                    if len(lst) < 2:
                        # Blank lines are dropped; other malformed lines are
                        # carried over untouched rather than raising IndexError.
                        if line:
                            f2.write(line)
                            f2.write('\n')
                        continue
                    file_mongo_db = lst[0]
                    file_open_date = lst[1]

                    if state == 0 and file_mongo_db == syn_mongo_DB:
                        state = 1

                        # Compare real dates, and use "<" instead of "!=", so a
                        # checkpoint that is already past today stops the sync
                        # instead of advancing forever.
                        today = datetime.datetime.now().date()
                        checkpoint_day = datetime.datetime.strptime(file_open_date, '%Y.%m.%d').date()
                        if checkpoint_day < today:
                            # Query Mongo for this day's documents and push them to ES
                            query = {"OpenDate": file_open_date}

                            docs = []
                            query_list = list(collection.find(query))
                            query_len = len(query_list)
                            if query_len != 0:
                                f3.write(f"{file_mongo_db} {file_open_date} {query_len}")
                                f3.write('\n')
                            count = 0

                            # Build the ES documents and bulk-insert them.
                            # NOTE(review): the checkpoint advances even when
                            # init_es_data stops early at the syn_count_total
                            # cap - confirm that skipping the rest of the day
                            # is intended.
                            init_es_data(count, docs, es, query_list, file_mongo_db, file_open_date)

                            # Advance the checkpoint by one day. Only the date
                            # field is replaced, so a DB name that happens to
                            # contain the date text is left alone.
                            file_open_date_new = (checkpoint_day + datetime.timedelta(days=1)).strftime("%Y.%m.%d")
                            lst[1] = file_open_date_new
                            line = ' '.join(lst)
                            log.logger.info(f"syn mongo data to es {file_mongo_db}-{file_open_date} finish")
                            f2.write(line)
                            f2.write('\n')
                        else:
                            state = 2
                            f2.write(line)
                            f2.write('\n')
                            break
                    else:
                        f2.write(line)
                        f2.write('\n')

                if state == 0:
                    # No checkpoint for this DB yet: create one; the next pass picks it up.
                    f2.write(f'{syn_mongo_DB} {initial_open_date}')
                    f2.write('\n')
                    log.logger.info(f"init {syn_mongo_DB}-{initial_open_date} open_date begin")
                if state == 2:
                    # Up to date: leave main.txt as it is (the copy is incomplete here).
                    log.logger.info(f"All success syn {syn_mongo_DB} open_date finish")
                    break
                if state == 1:
                    state = 0

            # Swap in the new checkpoint in one step so main.txt never goes
            # missing between a remove and a rename.
            os.replace(copy_path, main_path)
    finally:
        # Release the Mongo connection even when a query or bulk insert fails.
        client.close()
    log.logger.info("All data syn es success============================>")


# Build the Elasticsearch bulk actions for the documents of one open date.

# Fields whose ';'-separated string value is stored as a list (only when present).
_ES_SPLIT_FIELDS = ('applicant', 'inventor', 'ipc')

# Fields copied as-is (missing fields are stored as None).
# NOTE(review): 'abstract_er' sits next to 'title_fr'/'title_de'/'abstract_de'
# and may be a typo for 'abstract_fr' - confirm against the Mongo schema.
_ES_COPIED_FIELDS = (
    'app_no', 'app_date', 'app_country', 'open_no', 'open_date',
    'first_applicant', 'applicant_address', 'first_inventor',
    'title', 'abstract', 'abstract_translation', 'first_claim',
    'main_ipc', 'main_ipc_depart', 'main_ipc_category', 'main_ipc_class',
    'main_ipc_large_group', 'main_ipc_small_group',
    'agent', 'agent_organ', 'unagent', 'unagent_organ', 'priority',
    'title_fr', 'title_de', 'abstract_er', 'abstract_de',
    'country_code', 'title_cn', 'main_law_status', 'sub_law_status',
    'postal_code', 'i_application', 'i_publication', 'date_entry',
)


def init_es_data(count, docs, es, query_list, file_mongo_db, file_open_date):
    """Convert Mongo documents to ES bulk actions and send them in batches.

    Args:
        count: Number of records already processed; counting continues from here.
        docs: List that accumulates pending bulk actions; emptied after each send.
        es: Elasticsearch client handed to ``helpers.bulk``.
        query_list: Iterable of Mongo documents (mappings containing ``_id``).
        file_mongo_db: Database name, used in log messages only.
        file_open_date: Open date being synced, used in log messages only.

    A batch is sent whenever ``docs`` reaches ``syn_count_one_time`` entries,
    and once more at the end for the remainder. When ``syn_count_total`` is
    not 1, processing stops as soon as ``count`` reaches it.
    """

    def send_pending():
        # Bulk-insert everything queued so far, then reset the queue.
        helpers.bulk(es, docs, request_timeout=1800)
        docs.clear()
        log.logger.info(f"{file_mongo_db}-{file_open_date} sync {count} records to elasticsearch")

    for count, raw_doc in enumerate(query_list, count + 1):
        # Normalise keys to snake_case and stringify the Mongo _id.
        doc = init_lst(raw_doc)
        open_no = doc.get("open_no")

        record = {}
        for field in _ES_SPLIT_FIELDS:
            joined = doc.get(field)
            if joined is not None:
                record[field] = joined.split(";")
        for field in _ES_COPIED_FIELDS:
            record[field] = doc.get(field)
        record['db_name'] = syn_mongo_DB

        if count % 1000 == 0:
            log.logger.info(
                f"init record {file_mongo_db}-{file_open_date}, open_on: {open_no}  count: {count} finish============================>")

        docs.append({
            "_index": syn_es_name,  # target ES index
            "_id": doc.pop('_id'),
            "_source": record,
        })

        # Capped mode: stop once the cap is reached; the tail is sent below.
        if syn_count_total != 1 and count >= syn_count_total:
            break
        if len(docs) >= syn_count_one_time:
            send_pending()

    # Send whatever is left after the loop.
    if docs:
        send_pending()


# Run the sync only when the file is executed as a script.
if __name__ == '__main__':
    syn_mongo_to_es()


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值