Project scenario:
An LMDB database can be opened by several processes at once, and it accesses the file through memory mapping, so addressing inside the file costs almost nothing. This greatly reduces the overhead of copying/transferring the dataset and gives very high read/write speeds.
Recently I tried writing my dataset into LMDB to speed up model training.
The dataset is written into LMDB with the following code, dispatched through a multiprocessing pool:
import multiprocessing
from multiprocessing import Pool

import cv2
import lmdb
from tqdm import tqdm

def write_lmdb(train_image_list):
    # map_size must be large enough for the whole dataset (~13.6 GB here)
    env = lmdb.open('../dataset/train_', map_size=13610000000)
    cache = {}
    for idx, (image, label, bbox, landmark) in tqdm(enumerate(train_image_list), total=3190000):
        imgKey = f'{idx}'.encode()    # key of the raw image bytes
        paraKey = f'{idx}_'.encode()  # key of the label/bbox/landmark string
        img = cv2.imread(image)
        label_str = str(label)
        bbox_str = ' '.join([str(x) for x in bbox])
        landmark_str = ' '.join([str(x) for x in landmark])
        para = ','.join([label_str, bbox_str, landmark_str])
        cache[imgKey] = img.tobytes()
        cache[paraKey] = para.encode()
        # commit in batches of 10000 to keep write transactions small
        if (idx + 1) % 10000 == 0:
            with env.begin(write=True) as txn:
                for k, v in cache.items():
                    txn.put(k, v)
            cache = {}
    # flush the last partial batch, otherwise up to 9999 samples are lost
    if cache:
        with env.begin(write=True) as txn:
            for k, v in cache.items():
                txn.put(k, v)
    env.close()

if __name__ == '__main__':
    # train_image_list: list of (image_path, label, bbox, landmark) tuples,
    # built elsewhere from the files under '../dataset'
    print('Start writing LMDB')
    cores = multiprocessing.cpu_count()
    pool = Pool(cores)
    pool.apply_async(write_lmdb, [train_image_list])
    pool.close()
    pool.join()
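Once the write finishes, it is worth sanity-checking the database before training on it. The sketch below (the path and the 12x12x3 image shape are taken from the writer above; adjust them to your setup) reads one record back and checks the entry count:

import lmdb
import numpy as np

env = lmdb.open('../dataset/train_', readonly=True, lock=False)
with env.begin() as txn:
    # every sample produced two entries: '<idx>' (image) and '<idx>_' (params)
    print('entries:', txn.stat()['entries'])   # expect 2 * number of samples
    img = np.frombuffer(txn.get(b'0'), np.uint8).reshape(12, 12, 3)
    para = txn.get(b'0_').decode()
    print(img.shape, para)
env.close()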
Problem description:
After the write finished, I rewrote the Dataset class.
It looks fine, but when the dataset is loaded through this Dataset, model accuracy drops by more than ten percentage points and the model simply will not train!
import lmdb
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

class MyDataset(Dataset):
    def __init__(self, lmdbPath):
        super(MyDataset, self).__init__()
        env = lmdb.open(lmdbPath, readonly=True)
        self.imgs = []
        self.labels = []
        self.bboxs = []
        self.landmarks = []
        with env.begin() as txn:
            num = txn.stat()['entries']
            for key, value in tqdm(txn.cursor(), total=num):
                if '_' in key.decode():
                    # parameter entry: "label,bbox,landmark"
                    label, bbox, landmark = value.decode().split(',')
                    label = int(label)
                    bbox = [float(x) for x in bbox.split()]
                    landmark = [float(x) for x in landmark.split()]
                    label = np.array([label], np.int64)
                    bbox = np.array(bbox, np.float32)
                    landmark = np.array(landmark, np.float32)
                    self.labels.append(label)
                    self.bboxs.append(bbox)
                    self.landmarks.append(landmark)
                else:
                    # image entry: raw 12x12 BGR bytes -> CHW float tensor
                    img = np.frombuffer(value, np.uint8).reshape(12, 12, 3).transpose((2, 0, 1))
                    # normalize to roughly [-1, 1]
                    img = (img.astype('float32') - 127.5) / 128
                    img = torch.from_numpy(img)
                    self.imgs.append(img)

    def __getitem__(self, index):
        img = self.imgs[index]
        label = self.labels[index]
        bbox = self.bboxs[index]
        landmark = self.landmarks[index]
        return img, label, bbox, landmark

    def __len__(self):
        return len(self.imgs)
Cause analysis:
At first I suspected that the DataLoader's shuffle was not being applied properly, but then I found that even with shuffle=False the original dataset still fit to some extent.
After a full day of debugging, it turned out to be an ordering problem in the data written to LMDB:
- the dataset index runs 0, 1, 2, 3, …
- but the cursor returns the images in the order 0, 1, 10, 100, 1000, 1001, 1002, …, because LMDB sorts keys lexicographically as byte strings
- and the labels, whose keys carry a trailing '_', come back in yet another order: 0, 1000, 1001, 1002, …
If entries are simply appended to lists in cursor order (as above), the images no longer line up one-to-one with their labels.
For example:
index: [0, 1, 2, 3, 4, …]
img: [img0, img1, img10, img100, img1000, img1001, …]
label: [label0, label1000, label1001, label1002, …]
index: 0 -> img0, label0
index: 1 -> img1, label1000
index: 2 -> img10, label1001
So at index = 2 you get img10 paired with label1001, i.e. the labels of image 1001 are attached to image 10.
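The mismatch is easy to reproduce without LMDB at all: a cursor iterates keys in lexicographic byte order, which is exactly what Python's sorted() produces for byte strings. The snippet below regenerates both orderings from the key scheme used by the writer:

img_keys = sorted(f'{i}'.encode() for i in range(1100))
para_keys = sorted(f'{i}_'.encode() for i in range(1100))
print(img_keys[:6])   # [b'0', b'1', b'10', b'100', b'1000', b'1001']
print(para_keys[:6])  # [b'0_', b'1000_', b'1001_', b'1002_', b'1003_', b'1004_']

The two sequences diverge after the first element because the trailing '_' (0x5F) sorts after every digit, so among the '1…_' keys the ones with longer numeric prefixes come first.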
Solution:
Replace the lists with dictionaries keyed by the numeric sample index, so every image is matched with its own parameters:
class MyDataset(Dataset):
    def __init__(self, lmdbPath):
        super(MyDataset, self).__init__()
        env = lmdb.open(lmdbPath, readonly=True)
        self.imgs = {}
        self.labels = {}
        self.bboxs = {}
        self.landmarks = {}
        with env.begin() as txn:
            num = txn.stat()['entries']
            for key, value in tqdm(txn.cursor(), total=num):
                if '_' in key.decode():
                    label, bbox, landmark = value.decode().split(',')
                    # strip the trailing '_' to recover the numeric index
                    key = int(key.decode()[0:-1])
                    label = int(label)
                    bbox = [float(x) for x in bbox.split()]
                    landmark = [float(x) for x in landmark.split()]
                    label = np.array([label], np.int64)
                    bbox = np.array(bbox, np.float32)
                    landmark = np.array(landmark, np.float32)
                    self.labels[key] = label
                    self.bboxs[key] = bbox
                    self.landmarks[key] = landmark
                else:
                    key = int(key.decode())
                    img = np.frombuffer(value, np.uint8).reshape(12, 12, 3).transpose((2, 0, 1))
                    # normalize to roughly [-1, 1]
                    img = (img.astype('float32') - 127.5) / 128
                    img = torch.from_numpy(img)
                    self.imgs[key] = img
        assert len(self.imgs) == len(self.labels) == len(self.bboxs) == len(self.landmarks)

    def __getitem__(self, index):
        # direct indexing (rather than .get) raises loudly on a missing key
        img = self.imgs[index]
        label = self.labels[index]
        bbox = self.bboxs[index]
        landmark = self.landmarks[index]
        return img, label, bbox, landmark

    def __len__(self):
        return len(self.imgs)
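A quick usage sketch (batch size and path are placeholders) shows the dataset working with a DataLoader; the pairing now survives shuffling because the index selects the image and its parameters through the same dictionary key:

from torch.utils.data import DataLoader

dataset = MyDataset('../dataset/train_')
loader = DataLoader(dataset, batch_size=256, shuffle=True)
imgs, labels, bboxs, landmarks = next(iter(loader))
print(imgs.shape, labels.shape)  # torch.Size([256, 3, 12, 12]) torch.Size([256, 1])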
That fixed it completely: the model now converges normally. I hope this helps.
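As a final note, another common way to sidestep this whole class of bug (an alternative, not what the code above does) is to zero-pad the numeric part of the key at write time, so that lexicographic order and numeric order coincide and plain lists stay aligned:

# when writing (8 digits is enough for 3.19M samples):
imgKey = f'{idx:08d}'.encode()    # b'00000000', b'00000001', ...
paraKey = f'{idx:08d}_'.encode()
# the cursor now returns keys in numeric order, so appending in
# cursor order keeps images and parameters aligned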