1. Requirement: import 523 GB of JSON files from the server into MongoDB. Since mongoimport only manages about 25 MB/s, a multi-process script is used instead: it reads the data and writes it into MongoDB directly.
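For reference, the single-process baseline being replaced is an ordinary mongoimport run, roughly like the following (host, database, and collection names are taken from the script below; the file path is illustrative):

mongoimport --host 172.26.178.248 --port 27017 --db btc_tx_new_test --collection new_transaction --file /backup2/btc_tx/btc_outputlist_tx/some_file.json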
2. Source code:
# -*- coding: utf-8 -*-
"""
@Time    : 2020/7/15 14:49
@Author  : LinXiao
@Function: read JSON files and store them in MongoDB
"""
# ------------------------------
# read the json files
import json
import multiprocessing
import os
import platform
import timeit

import pymongo
from loguru import logger
from pymongo.errors import BulkWriteError
from redis import Redis

# Redis db 5 holds the shared work queue of file paths
redis = Redis(db=5)
def dir_path():
    if platform.system() in ('Linux', 'Darwin'):
        DIRPATH = r"/backup2/btc_tx/btc_outputlist_tx"
    else:
        DIRPATH = r"D:\Project\etc\Chain-Storage\src\core\test\test2"
    return DIRPATH
# collect the paths of all files under the directory
def file_path(dirname):
    redis.delete("json_path")  # drop any queue left over from a previous run
    for root, _, files in os.walk(dirname):
        for f in files:
            file_path = os.path.join(root, f)
            # push each json path into the Redis work queue
            redis.lpush("json_path", file_path)
            logger.info(f"{file_path} to redis!")
    length = redis.llen("json_path")
    logger.info(f"-------- {length} files in total --------")
# read every line of one file (one JSON document per line)
def read_json(path):
    process = os.getpid()
    logger.info(f"process_{process} start handle | {path}")
    start = timeit.default_timer()
    mongo_save_list = []
    with open(path, 'r') as f:
        for line in f:
            if line.strip():
                mongo_save_list.append(json.loads(line))
    elapsed = timeit.default_timer() - start
    logger.info(f"process_{process} read {len(mongo_save_list)} docs | used {elapsed:.2f} s")
    return mongo_save_list
def init_motor(datas):
    # db_url = '127.0.0.1'
    db_url = '172.26.178.248'
    db_port = 27017
    db_name = "btc_tx_new_test"
    db_collection = "new_transaction"
    # synchronous pymongo client; created after the fork, so it is safe per process
    client = pymongo.MongoClient(db_url, db_port)
    db = client[db_name][db_collection]
    try:
        # unordered bulk insert: one bad document does not abort the whole batch
        db.insert_many(datas, ordered=False, bypass_document_validation=True)
    except BulkWriteError as e:
        logger.error(e.details)
    finally:
        client.close()
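# BulkWriteError.details carries 'nInserted' plus a 'writeErrors' list with
# the index and cause of each failed document (e.g. a duplicate _id); with
# ordered=False the rest of the batch is still inserted.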
# pop paths from the queue and store each file in mongo
def save_to_mongo():
    while True:
        json_path = redis.rpop("json_path")
        if json_path is None:
            # queue is empty, this worker is done
            break
        data = read_json(json_path.decode())
        if data:
            init_motor(data)
if __name__ == '__main__':
    dirpath = dir_path()
    # enqueue all the file paths
    file_path(dirpath)
    # save with multiple worker processes
    process_count = 100
    with multiprocessing.Pool(process_count) as pool:
        for _ in range(process_count):
            pool.apply_async(save_to_mongo)
        pool.close()
        pool.join()
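One caveat: read_json materialises an entire file in memory before init_motor inserts it, which can exhaust RAM when a single file is large. Below is a minimal streaming sketch that inserts in fixed-size batches instead, assuming the files are JSON Lines; the helper name insert_in_batches and the batch size are illustrative, not part of the original script:

import json
from itertools import islice

from loguru import logger
from pymongo.errors import BulkWriteError

def insert_in_batches(path, collection, batch_size=10000):
    # stream the file and insert fixed-size batches so memory stays
    # bounded no matter how large a single file is
    with open(path, 'r') as f:
        while True:
            lines = list(islice(f, batch_size))
            if not lines:
                break
            batch = [json.loads(line) for line in lines if line.strip()]
            if not batch:
                continue
            try:
                collection.insert_many(batch, ordered=False, bypass_document_validation=True)
            except BulkWriteError as e:
                logger.error(e.details)

save_to_mongo could then pass the popped path and a per-process collection handle straight to insert_in_batches instead of calling read_json and init_motor.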