# Asynchronous multi-process execution approach:
import numpy as np
import scipy.io as sio
import cv2
import os
import lmdb
import math
import pickle
from tqdm import tqdm
from multiprocessing import Process, Manager, Lock, Pool
# Load the SynthText ground-truth annotations (MATLAB .mat file).
print('start loading gt file')
gts = sio.loadmat('SynthText/gt.mat')
# 'imnames': per-image relative paths; 'txt': per-image text annotations
# (presumably one entry per image, aligned by index — verify against gt.mat).
image_paths = gts['imnames'][0]
txt_anns = gts['txt'][0]
dataset_length = len(image_paths)
print('loading done!')
# Open (create if absent) the output LMDB; map_size = 2**40 bytes (1 TiB cap).
# NOTE(review): this environment is opened before the Pool forks — lmdb docs
# advise against reusing an Environment across fork; confirm this is safe here.
env = lmdb.open('SynthText.lmdb', map_size=1099511627776)
# Every image is resized to this fixed resolution before JPEG-encoding.
TARGET_HEIGHT = 416
TARGET_WIDTH = 608
def write_cache(env, cache):
    """Flush every key/value pair in *cache* to *env* in one write transaction."""
    with env.begin(write=True) as txn:
        for key, value in cache.items():
            txn.put(key, value)
def worker(index, num_workers):
    """Process the *index*-th shard of the dataset and write it into LMDB.

    The dataset is split into ``num_workers`` contiguous shards; this worker
    handles images ``[start, end)``. For each image it stores two entries in
    the LMDB environment:
      - ``image-<path>``: the JPEG-encoded, resized image bytes
      - ``label-<path>``: a pickled list of the words annotated on the image

    The cache is flushed to LMDB every 100 processed images and once more at
    the end for the remainder.
    """
    shard_size = int(math.ceil(dataset_length / num_workers))
    start = index * shard_size
    end = min((index + 1) * shard_size, dataset_length)
    step = 0
    cache = {}
    for i in tqdm(range(start, end)):
        image_path = str(image_paths[i][0])
        image = cv2.imread('SynthText/' + image_path, 1)
        if image is None:
            # Missing/corrupt file: cv2.imread returns None and resize would
            # crash the whole worker — skip this sample instead.
            continue
        image = cv2.resize(image, (TARGET_WIDTH, TARGET_HEIGHT),
                           interpolation=cv2.INTER_LINEAR)
        # Flatten the per-image annotation into a list of words: each entry
        # may contain several lines, each line several space-separated words.
        word_ann = []
        for text_block in txt_anns[i]:
            for line in text_block.split('\n'):
                word_ann.extend(line.strip().split(' '))
        ok, encoded = cv2.imencode('.jpg', image)
        if not ok:
            # Encoding failed for this image — skip rather than store garbage.
            continue
        step += 1
        cache[('image-' + image_path).encode()] = encoded.tobytes()
        cache[('label-' + image_path).encode()] = pickle.dumps(word_ann)
        if step % 100 == 0:
            write_cache(env, cache)
            cache = {}
    # Flush whatever is left in the cache (the last partial batch).
    write_cache(env, cache)
# Fan the dataset shards out over a process pool, one shard per worker.
num_workers = 80
pool = Pool(num_workers)
# Keep the AsyncResult handles: apply_async alone silently swallows any
# exception raised inside a worker; calling .get() re-raises it here.
results = [pool.apply_async(worker, args=(i, num_workers))
           for i in range(num_workers)]
pool.close()
pool.join()
for result in results:
    result.get()
# To mutate the same global variable from multiple processes, use Manager().