#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
import datetime
from logging import handlers
from elasticsearch import Elasticsearch, helpers
from pymongo import MongoClient
# Interactive configuration: every setting is read from stdin when the module loads.
syn_mongo_url = input("Please enter Mongo url to sync, example('192.168.0.234:27017'): ")
syn_mongo_username = input("Please enter Mongo username to sync, example('admin'): ")
syn_mongo_password = input("Please enter Mongo password to sync, example('123456'): ")
syn_mongo_DB = input("Please enter DB name to synchronize from Mongo, example('FMSQ'): ")
syn_mongo_COLLECTION = input("Please enter the DOCUMENT name to synchronize to Mongo, example('patents'): ")
syn_es_ip = input("Please enter the IP of ES to be synchronized, example('221.194.47.208')")
# Total amount to sync (1 = sync everything, 2 = sync 10,000 records per day).
syn_count_total = input("Please enter the maximum number of syncs per day('1 all, 2 1w'): ")
syn_count_one_time = input("Please enter the number of syn at a time, example('5000'): ")
syn_es_name = input("Please enter the name of the ES library to be synchronized, example('index_chia'): ")

# Testing fallbacks: an empty answer selects the built-in value.
# Username and password deliberately have no fallback, so an empty answer
# stays empty.
syn_mongo_url = syn_mongo_url or '221.194.47.208:26006'
syn_mongo_DB = syn_mongo_DB or 'FMSQ'
syn_mongo_COLLECTION = syn_mongo_COLLECTION or 'patents'
syn_es_ip = syn_es_ip or '221.194.47.208'
syn_es_name = syn_es_name or 'index_china'

# Defaults for the numeric settings.
# A non-numeric answer means "sync everything" (1); any number other than 1
# is normalised to the 10,000-record cap.
syn_count_total = int(syn_count_total) if syn_count_total.isdigit() else 1
if syn_count_total != 1:
    syn_count_total = 10000
# Batch size handed to the bulk writer; 5000 when the answer is not numeric.
syn_count_one_time = int(syn_count_one_time) if syn_count_one_time.isdigit() else 5000
# Per-database log directory; it is created when it does not exist yet.
log_file = f'./log/syn_mongo_to_es_by_open_date/{syn_mongo_DB}'
check_file = os.path.exists(log_file)
if not check_file:
    os.makedirs(log_file)
else:
    print("log file is exit, begin syn mongo to es============================>")
class Logger(object):
    """Wire a named ``logging`` logger to the console and a rotating file.

    The underlying logger is obtained with ``logging.getLogger(filename)``,
    so every ``Logger`` built with the same ``filename`` shares one logger
    object, exposed as ``self.logger``.
    """

    # Short level names accepted by ``__init__`` mapped to logging constants.
    level_relations = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'crit': logging.CRITICAL
    }

    def __init__(self, filename, level='info', when='D', backCount=3,
                 fmt='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'):
        """Configure the logger named after ``filename``.

        Args:
            filename: Path of the log file; also used as the logger name.
            level: One of the keys of ``level_relations``. The looked-up
                value is passed straight to ``setLevel``.
            when: Rotation unit forwarded to ``TimedRotatingFileHandler``.
            backCount: Forwarded as ``backupCount`` to the file handler.
            fmt: Format string applied to both handlers.
        """
        self.logger = logging.getLogger(filename)
        self.logger.setLevel(self.level_relations.get(level))
        # getLogger() hands back the same object for the same name. Without
        # this guard a second Logger(filename) would attach another pair of
        # handlers and every record would be emitted twice.
        if self.logger.handlers:
            return
        format_str = logging.Formatter(fmt)
        sh = logging.StreamHandler()
        sh.setFormatter(format_str)
        th = handlers.TimedRotatingFileHandler(filename=filename, when=when, backupCount=backCount, encoding='utf-8')
        th.setFormatter(format_str)
        self.logger.addHandler(sh)
        self.logger.addHandler(th)
# Two log files live in the log directory: one receives everything from
# DEBUG upwards, the other only ERROR and above.
log_all_path = log_file + '/syn_mongo_to_es_all.log'
log_error_path = log_file + '/syn_mongo_to_es_error.log'
log = Logger(log_all_path, level='debug')
# The error logger emits a single "error" record as soon as it is created.
_error_log = Logger(log_error_path, level='error')
_error_log.logger.error("error")
# Local Mongo
# mongo_url = '192.168.0.234:27017'
# USERNAME = 'admin'
# PASSWORD = '123456'
# DB_name = 'FMSQ'
# Convert a CamelCase name to snake_case.
def camel_to_snake(string):
    """Return ``string`` rewritten in lower-case snake_case.

    Three regex passes insert underscores, in this order:
      1. before a capital letter that is followed by lowercase letters,
         when any character precedes it;
      2. before a run of digits, when any character precedes it;
      3. between a lowercase letter or digit and a capital letter.
    The result is then lower-cased.
    """
    rewrites = (
        ('(.)([A-Z][a-z]+)', r'\1_\2'),
        ('(.)([0-9]+)', r'\1_\2'),
        ('([a-z0-9])([A-Z])', r'\1_\2'),
    )
    converted = string
    for pattern, replacement in rewrites:
        converted = re.sub(pattern, replacement, converted)
    return converted.lower()
# Re-pack a document with normalised keys.
def init_lst(lst):
    """Return a new dict built from the mapping ``lst``.

    The ``'_id'`` entry keeps its key and has its value converted with
    ``str``. Every other key is passed through ``camel_to_snake`` and its
    value is copied unchanged.
    """
    return {
        (key if key == '_id' else camel_to_snake(key)):
            (str(value) if key == '_id' else value)
        for key, value in lst.items()
    }
# Pass credentials only when both a username and a password were entered;
# otherwise connect with the URL alone.
if syn_mongo_username and syn_mongo_password:
    client = MongoClient(syn_mongo_url, username=syn_mongo_username, password=syn_mongo_password)
else:
    client = MongoClient(syn_mongo_url)
def syn_mongo_to_es():
    """Sync Mongo documents into Elasticsearch one ``OpenDate`` day at a time.

    Progress is tracked in ``./syn_open_date/main.txt``, which holds one
    ``<db> <date>`` line per database (date format ``YYYY.MM.DD``). Each pass
    of the outer loop copies that file to ``main_copy.txt``; while copying it
    syncs the day recorded for ``syn_mongo_DB`` and advances that date by one
    day, then replaces ``main.txt`` with the copy. When the database has no
    line yet, one is appended starting at ``1985.09.10``. The loop ends once
    the recorded date is no longer before today's date.

    For every synced day that returned documents, a
    ``<db> <date> <count>`` line is appended to
    ``./syn_open_date/<db>_data.txt``.

    Reads the module-level ``syn_*`` settings, ``client`` and ``log``.
    Closes ``client`` before returning or propagating an error.

    Raises:
        ValueError: if the date recorded for ``syn_mongo_DB`` does not match
            ``YYYY.MM.DD``.
    """
    log.logger.info(f"syn_mongo_url: {syn_mongo_url}")
    log.logger.info(f"syn_mongo_username: {syn_mongo_username}")
    # Security: record only whether a password was supplied, never its value.
    masked_password = '******' if syn_mongo_password else ''
    log.logger.info(f"syn_mongo_password: {masked_password}")
    log.logger.info(f"syn_mongo_DB: {syn_mongo_DB}")
    log.logger.info(f"syn_mongo_COLLECTION: {syn_mongo_COLLECTION}")
    log.logger.info(f"syn_es_ip: {syn_es_ip}")
    log.logger.info(f"syn_count_total: {syn_count_total}")
    log.logger.info(f"syn_count_one_time: {syn_count_one_time}")
    if syn_es_name == '':
        log.logger.error("syn_es_name not allowed to be empty")
        return

    date_format = '%Y.%m.%d'
    state_dir = './syn_open_date'
    main_path = './syn_open_date/main.txt'
    copy_path = './syn_open_date/main_copy.txt'
    data_path = f'./syn_open_date/{syn_mongo_DB}_data.txt'

    # First run: make sure the progress directory and an (empty) progress
    # file exist, so the loop below can open them instead of crashing.
    os.makedirs(state_dir, exist_ok=True)
    if not os.path.exists(main_path):
        with open(main_path, mode='w', encoding='utf-8'):
            pass

    # Pass status: 0 = database line not seen yet in this pass,
    # 1 = one day was processed in this pass, 2 = caught up with today.
    state = 0
    # Start date used when the database has no progress line yet.
    initial_open_date = '1985.09.10'
    # Read from the database and collection chosen at the prompts.
    db = client[syn_mongo_DB]
    collection = db[syn_mongo_COLLECTION]
    es = Elasticsearch(syn_es_ip, port=9200)
    try:
        while True:
            with open(main_path, mode="r", encoding='utf-8') as f1, \
                    open(copy_path, mode='w', encoding="utf-8") as f2, \
                    open(data_path, mode="a+", encoding="utf-8") as f3:
                for line in f1:
                    line = line.strip()
                    lst = line.split(' ')
                    if len(lst) < 2:
                        # Blank or malformed line: carry it over untouched
                        # rather than failing on the missing date column.
                        f2.write(line)
                        f2.write('\n')
                        continue
                    file_mongo_db = lst[0]
                    file_open_date = lst[1]
                    if state == 0 and file_mongo_db == syn_mongo_DB:
                        state = 1
                        today = datetime.datetime.now().date()
                        open_day = datetime.datetime.strptime(file_open_date, date_format).date()
                        # Compare as dates, not strings, so a recorded date
                        # that is already past today stops the loop instead
                        # of advancing forever.
                        if open_day < today:
                            # Query Mongo for this day and push it to ES.
                            query = {"OpenDate": f"{file_open_date}"}
                            docs = []
                            query_list = list(collection.find(query))
                            query_len = len(query_list)
                            if query_len != 0:
                                f3.write(f"{file_mongo_db} {file_open_date} {query_len}")
                                f3.write('\n')
                                count = 0
                                # Build the ES actions and bulk-write them.
                                init_es_data(count, docs, es, query_list, file_mongo_db, file_open_date)
                            # Advance the recorded date by one day.
                            file_open_date_new = (open_day + datetime.timedelta(days=1)).strftime(date_format)
                            line = line.replace(file_open_date, file_open_date_new)
                            log.logger.info(f"syn mongo data to es {file_mongo_db}-{file_open_date} finish")
                            f2.write(line)
                            f2.write('\n')
                        else:
                            # Caught up: main.txt stays as it is, so the
                            # rest of the copy does not need to be written.
                            state = 2
                            f2.write(line)
                            f2.write('\n')
                            break
                    else:
                        f2.write(line)
                        f2.write('\n')
                if state == 0:
                    # No progress line for this database yet: add one.
                    f2.write(f'{syn_mongo_DB} {initial_open_date}')
                    f2.write('\n')
                    log.logger.info(f"init {syn_mongo_DB}-{initial_open_date} open_date begin")
            if state == 2:
                log.logger.info(f"All success syn {syn_mongo_DB} open_date finish")
                break
            if state == 1:
                state = 0
            # Swap the updated copy in; both files are closed at this point.
            os.remove(main_path)
            os.rename(copy_path, main_path)
    finally:
        # Release the Mongo connection even when a query or bulk write fails.
        client.close()
    log.logger.info("All data syn es success============================>")
# Build the Elasticsearch bulk actions and write them out in batches.
def init_es_data(count, docs, es, query_list, file_mongo_db, file_open_date):
    """Turn Mongo documents into ES bulk actions and send them with ``helpers.bulk``.

    Args:
        count: Starting value of the running document counter.
        docs: List used as the pending-action buffer; it is emptied after
            every bulk write.
        es: Client object handed to ``helpers.bulk``.
        query_list: Iterable of Mongo documents (mappings containing ``_id``).
        file_mongo_db: Database name, used in log messages only.
        file_open_date: Date string, used in log messages only.

    Each document is normalised with ``init_lst``. The buffer is flushed
    whenever it reaches ``syn_count_one_time`` entries and once more after
    the loop. When ``syn_count_total`` is not 1, processing stops once
    ``count`` reaches it. Returns ``None``.
    """
    # Fields stored as ';'-separated strings; split into lists when present.
    split_fields = ('applicant', 'inventor', 'ipc')
    # Fields copied as-is (missing ones become None), in output order.
    # NOTE(review): 'abstract_er' sits next to title_fr/title_de/abstract_de
    # and may have been meant as 'abstract_fr' — confirm against the index.
    plain_fields = (
        'app_no', 'app_date', 'app_country', 'open_no', 'open_date',
        'first_applicant', 'applicant_address', 'first_inventor',
        'title', 'abstract', 'abstract_translation', 'first_claim',
        'main_ipc', 'main_ipc_depart', 'main_ipc_category', 'main_ipc_class',
        'main_ipc_large_group', 'main_ipc_small_group',
        'agent', 'agent_organ', 'unagent', 'unagent_organ', 'priority',
        'title_fr', 'title_de', 'abstract_er', 'abstract_de',
        'country_code', 'title_cn', 'main_law_status', 'sub_law_status',
        'postal_code', 'i_application', 'i_publication', 'date_entry',
    )

    def flush(synced):
        # Bulk-write the pending actions, empty the buffer, report progress.
        helpers.bulk(es, docs, request_timeout=1800)
        docs.clear()
        log.logger.info(f"{file_mongo_db}-{file_open_date} sync {synced} records to elasticsearch")

    for raw_doc in query_list:
        count += 1
        doc = init_lst(raw_doc)
        open_no = doc.get("open_no")

        record = {}
        for field in split_fields:
            joined = doc.get(field)
            if joined is not None:
                record[field] = joined.split(";")
        for field in plain_fields:
            record[field] = doc.get(field)
        record['db_name'] = syn_mongo_DB

        if count % 1000 == 0:
            log.logger.info(
                f"init record {file_mongo_db}-{file_open_date}, open_on: {open_no} count: {count} finish============================>")

        docs.append({
            "_index": syn_es_name,  # the configured ES index name
            "_id": doc.pop('_id'),
            "_source": record,
        })

        # A value other than 1 acts as a cap on the number of documents.
        if syn_count_total != 1 and count >= syn_count_total:
            break
        if len(docs) >= syn_count_one_time:
            flush(count)

    # Write whatever is still buffered.
    if docs:
        flush(count)
# Script entry point: start the sync only when the file is run directly.
if __name__ == "__main__":
    syn_mongo_to_es()
# Sync the data in the txt file into Mongo
# Latest recommended article published on 2022-10-24 10:10:07