# 1. 批量读取数据 (batch-read the data)
# 获取文件夹下所有文件的路径
def file_path(dirname):
    """Walk *dirname* recursively and push every file path onto the Redis
    list "json_path", which the reader workers later drain with rpop.

    Logs each pushed path and, once, the total number of queued files.
    NOTE(review): relies on module-level globals `redis`, `os`, `logger`
    — confirm they are defined earlier in this file.
    """
    # Clear any stale queue before refilling. The original used
    # `redis.expire("json_path", 0)`, which also removes the key but
    # obscures the intent; `delete` says what it means.
    redis.delete("json_path")
    for root, _, files in os.walk(dirname):
        for name in files:
            # Local renamed from `file_path`, which shadowed this function.
            path = os.path.join(root, name)
            # 将每一个json path存入redis
            redis.lpush("json_path", path)
            logger.info(f"{path} to redis!")
    length = redis.llen("json_path")
    # Original logged this line three times; once is enough.
    logger.info(f"总共有 --------{length}个文件 -------")
# 读取一个文件的所有数据
def read_json():
    """Generator: pop file paths from the Redis list "json_path" and yield
    the parsed JSON-lines content in batches of up to 1000 dicts.

    For each popped file, every line is `json.loads`-ed; successes and
    failures are counted and appended to the running log files
    上传成功.txt / 上传失败.txt. A final partial batch is yielded at EOF,
    then the next queued path is processed until the queue is empty.

    Fixes vs. original:
      * a failed `json.loads` now `continue`s — the original fell through
        and appended the previous (or unbound) `data` again;
      * `return StopIteration` is gone — it ended the generator after the
        first file, stranding the rest of the queue;
      * an empty final batch is no longer yielded (insert_many on an
        empty list would fail downstream);
      * the two log files are opened once per input file and closed
        reliably via `with` (the original leaked both handles).

    NOTE(review): relies on module-level globals `redis`, `logger`,
    plus `os`, `json`, `timeit`.
    """
    process = os.getpid()
    batch_size = 1000
    while True:
        json_path = redis.rpop("json_path")
        if json_path is None:
            break  # queue drained — all files processed
        path = bytes.decode(json_path)
        logger.info(f"process_{process} start read | {path} ")
        start = timeit.default_timer()
        ok_count = 0
        fail_count = 0
        batch = []
        with open(path, 'r', encoding='utf-8') as f, \
                open(r"上传失败.txt", 'a+', encoding='utf-8') as f1, \
                open(r"上传成功.txt", 'a+', encoding='utf-8') as f2:
            for line_data in f:
                try:
                    # 把读取的一行的json数据加载出来
                    data = json.loads(line_data)
                except Exception as e:
                    print('error1:', e)
                    fail_count += 1
                    print("上传失败{}条".format(fail_count))
                    f1.write("上传失败{}条".format(fail_count) + '\n')
                    continue  # skip the bad line — do NOT reuse stale `data`
                ok_count += 1
                print("上传成功{}条".format(ok_count))
                f2.write("上传成功{}条".format(ok_count) + '\n')
                batch.append(data)
                if len(batch) == batch_size:
                    yield batch
                    batch = []
            if batch:
                # Flush the final partial batch, then fall through to the
                # outer loop to pick up the next queued file.
                yield batch
        elapsed = (timeit.default_timer() - start)
        logger.info(f"process_{process} readall success! | used {elapsed} s")
# 2. 存入mongo (store into Mongo)
"""存入本地mongo"""
def init_motor(datas, db_url='127.0.0.1', db_port=27017,
               db_name="test", db_collection="test"):
    """Bulk-insert *datas* (a list of dicts) into MongoDB.

    Connection details default to the previously hard-coded local
    instance but are now overridable keyword arguments, so the caller
    interface is backward compatible.

    Partial bulk-write failures (`BulkWriteError`) are logged and
    swallowed, matching the original best-effort behavior.
    NOTE(review): despite the name, this uses the synchronous `pymongo`
    driver, not motor; `pymongo`, `BulkWriteError` and `logger` must be
    in scope at module level.
    """
    # 建立连接 —— one client per call is wasteful but preserved from the
    # original; the client is now closed so connections no longer leak.
    client = pymongo.MongoClient(db_url, db_port)
    try:
        # 连接某个库名字
        collection = client[db_name][db_collection]
        # 批量插入: unordered so one bad document doesn't abort the batch
        try:
            collection.insert_many(datas, ordered=False,
                                   bypass_document_validation=True)
            print('正在插入数据。。。')
        except BulkWriteError as e:
            print('84error', e)
            logger.error(e.details)
    finally:
        client.close()
# 将数据存入mongo
def save_to_mongo():
    """Drain the JSON batch generator and bulk-insert each batch into Mongo."""
    for batch in read_json():
        init_motor(batch)
def dir_path():
    """Return the directory (relative path) holding the JSON input files."""
    return r"json_files"
if __name__ == '__main__':
    start = timeit.default_timer()
    dirpath = dir_path()
    # 生成文件路径 — enqueue every file path into Redis
    file_path(dirpath)
    # 多进程保存 — each worker drains the shared Redis queue
    process_count = 50
    with multiprocessing.Pool(process_count) as pool:
        # Keep the AsyncResult handles and .get() them after join():
        # the original discarded them, so any exception raised inside a
        # worker was silently swallowed.
        results = [pool.apply_async(save_to_mongo)
                   for _ in range(process_count)]
        pool.close()
        pool.join()
        for r in results:
            r.get()
    elapsed = (timeit.default_timer() - start)
    logger.info(f"所有数据存储完毕:| used {elapsed} s")