import os
import lmdb # install lmdb by "pip install lmdb"
import cv2
import numpy as np
from keys import *
from glob import glob
import functools
def checkImageIsValid(imageBin):
    """Return True if imageBin (raw image-file bytes) decodes to a non-empty image.

    ARGS:
        imageBin : bytes read from an image file, or None

    RETURNS:
        bool : False for None / empty / undecodable input, True otherwise
    """
    if imageBin is None:
        return False
    # Map the raw bytes onto a 1-D uint8 array.
    # np.fromstring is deprecated; np.frombuffer is the supported equivalent.
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    if imageBuf.size == 0:
        return False
    # Decode the byte buffer into a 2-D grayscale image.
    # cv2.imdecode returns None on failure (corrupt/unsupported data),
    # so guard before touching .shape.
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0
def writeCache(env, cache):
    """Write every key/value pair in cache to the LMDB environment.

    ARGS:
        env   : an open lmdb environment
        cache : dict mapping str keys to bytes values

    Keys are UTF-8 encoded before storage ('label-000000001' ->
    b'label-000000001'); values are expected to already be bytes.
    Final stored layout:
        b'image-*' -> raw image-file bytes
        b'label-*' -> encoded label text
    """
    with env.begin(write=True) as txn:
        # Iterate items() directly instead of re-indexing cache[k] per key.
        for k, v in cache.items():
            txn.put(k.encode(), v)
def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True):
    """
    Create LMDB dataset for CRNN training.
    ARGS:
        outputPath : LMDB output path
        imagePathList : list of image path
        labelList : list of corresponding groundtruth texts (str or bytes)
        lexiconList : (optional) list of lexicon lists
        checkValid : if true, check the validity of every image
    """
    assert len(imagePathList) == len(labelList)
    nSamples = len(imagePathList)
    # If the output directory has no data.mdb/lock.mdb yet, lmdb.open creates
    # empty ones. map_size is the maximum total size of the database (1 TB).
    env = lmdb.open(outputPath, map_size=1099511627776)
    cache = {}
    cnt = 1
    for i in range(nSamples):
        imagePath = imagePathList[i]
        label = labelList[i]
        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        # Read the image file as raw bytes; these go into LMDB verbatim.
        with open(imagePath, 'rb') as f:
            imageBin = f.read()
        if checkValid and not checkImageIsValid(imageBin):
            print('%s is not a valid image' % imagePath)
            continue
        # Keys are 1-based (image-000000001 when cnt == 1) to match the
        # reader-side code that fetches samples back out of LMDB.
        imageKey = 'image-%09d' % cnt
        labelKey = 'label-%09d' % cnt
        cache[imageKey] = imageBin
        # LMDB values must be bytes; accept both pre-encoded and plain-str labels.
        cache[labelKey] = label if isinstance(label, bytes) else label.encode()
        if lexiconList:
            lexiconKey = 'lexicon-%09d' % cnt
            # Encode the joined lexicon too — txn.put rejects str values.
            cache[lexiconKey] = ' '.join(lexiconList[i]).encode()
        if cnt % 1000 == 0:
            # Flush every 1000 samples to keep the in-memory cache bounded.
            writeCache(env, cache)
            cache = {}
            print('Written %d / %d' % (cnt, nSamples))
        cnt += 1
    nSamples = cnt - 1
    cache['num-samples'] = str(nSamples).encode()
    writeCache(env, cache)
    print('Created dataset with %d samples' % nSamples)
'''
def custom_sort(x, y):#排序(按图像宽度降序)
imgI = Image.open(os.path.join(imgPath, x.split(' ')[0])).convert('L')
#print(imgI.size)
imgJ = Image.open(os.path.join(imgPath, y.split(' ')[0])).convert('L')
if imgI.size[0] < imgJ.size[0]:
return 1
if imgI.size[0] > imgJ.size[0]:
return -1
return 0
'''
if __name__ == '__main__':
    # Output directory for the generated LMDB dataset (experimental path).
    OutPath = '/home/renpeng/TextRecognition/chineseocr0919/crnn.pytorch/LMDBPATH2/'
    # Image directory and label file. Each label-file line is expected to be:
    #   "<image name> <label text>"
    imgPathL = '/mnt/renpeng/textImage2/'
    labPathL = '/mnt/renpeng/textLabel22.txt'
    imagePath = []
    labL = []
    with open(labPathL, 'r') as f:
        # Iterate the file directly instead of a manual readline() loop.
        for ss in f:
            # Skip malformed lines BEFORE indexing into the split result;
            # the original indexed ss.split(' ')[1] first, which raised
            # IndexError on a line without a space.
            if len(ss.split()) != 2:
                continue
            parts = ss.split(' ')
            label = parts[1].replace('\n', '')
            # Keep only characters present in the alphabet (from keys.py);
            # this also drops the two special characters.
            label = ''.join([x for x in label if x in alphabetChinese])
            # Build the absolute image path by concatenation.
            imageName = imgPathL + parts[0]
            imagePath.append(imageName)
            # Store labels as encoded bytes — the LMDB value type
            # (str.encode() here has the same effect as reading in 'rb' mode).
            labL.append(label.encode())
    print(len(imagePath))
    print(len(labL))
    createDataset(OutPath, imagePath, labL)