# 数据集预处理
# (1) 将每一张图片 resize 到固定大小 50*150(宽*高)。
# (2) rebalance 处理:从少数类样本中随机选取 n 张,做数据增强后重新加入 dataset。
# (3) 划分训练集和测试集,划分比例为 0.2。
# (4) 对训练集进行数据增强,扩大训练数据量(操作包括:翻转、滤波等)。
#-*- encoding: utf-8 -*-
import os, sys, cv2
import numpy as np
import random
# Smallest source image (in pixels) that readImage will accept; anything
# shorter or narrower is skipped.
MIN_HEIGHT = 120
MIN_WIDTH = 40

# Running totals maintained by readImage across calls:
#   image_cnt    - number of images written so far; also used to name outputs
#   positive_cnt - images stored with label 1 (personalMale)
#   negative_cnt - images stored with label 0 (personalFemale)
image_cnt = 0
positive_cnt = negative_cnt = 0

# [output file name, label] pairs, appended by readImage.
targetLabel = []
def _loadLabelDict(labelPath):
    """Parse a label file into an ``{identity: label}`` dict.

    Every line is split on whitespace.  The first token is the identity; the
    remaining tokens are scanned left to right and the first one equal to
    ``personalMale`` (label 1) or ``personalFemale`` (label 0) decides the
    label.  Lines containing neither token contribute nothing.
    """
    genderToLabel = {'personalMale': 1, 'personalFemale': 0}
    labelDict = {}
    with open(labelPath, 'r') as reader:
        for line in reader:
            tokens = line.split()
            for token in tokens[1:]:
                if token in genderToLabel:
                    labelDict[tokens[0]] = genderToLabel[token]
                    break
    return labelDict


def readImage(filePath, targetDir):
    """Resize the labelled images of one directory and record their labels.

    Reads ``Label.txt`` from ``filePath``, then visits every ``.bmp``,
    ``.jpg``, ``.png`` or ``.jpeg`` file in that directory.  An image is kept
    when it can be decoded, is at least ``MIN_HEIGHT`` x ``MIN_WIDTH`` pixels
    and its identity (the file name up to the first ``_``, or the whole name
    when there is no ``_``) has a label.  Kept images are resized to 50x150
    (width x height) and written as ``<image_cnt>.jpeg``.

    Side effects: appends ``[fileName, label]`` to the module-level
    ``targetLabel`` list and updates ``image_cnt``, ``positive_cnt`` and
    ``negative_cnt``.

    Args:
        filePath: directory holding the source images and ``Label.txt``.
            A trailing path separator is optional.
        targetDir: output location.  The output file name is appended to it
            by plain string concatenation (as ``saveLabel`` does for
            ``label.txt``), so it must end with a path separator.

    Returns:
        Always ``None``.  When ``filePath`` is not a directory a message is
        printed and nothing else happens.
    """
    global image_cnt, positive_cnt, negative_cnt
    if not os.path.isdir(filePath):
        print('{} is not a dir'.format(filePath))
        return None

    labelDict = _loadLabelDict(os.path.join(filePath, 'Label.txt'))
    imageSuffixes = ('.bmp', '.jpg', '.png', '.jpeg')

    for entry in os.listdir(filePath):
        # The length guard keeps the original rule that a bare ".bmp"-style
        # name (nothing in front of the extension) is not treated as an image.
        if len(entry) <= 4 or not entry.endswith(imageSuffixes):
            continue

        img = cv2.imread(os.path.join(filePath, entry))
        # cv2.imread signals an unreadable file by returning None, not raising.
        if img is None or img.size == 0:
            continue

        height, width = img.shape[:2]
        if height < MIN_HEIGHT or width < MIN_WIDTH:
            continue

        idd = entry.split('_', 1)[0]
        if idd not in labelDict:
            print('file {} do not have label'.format(entry))
            continue

        label = labelDict[idd]
        fileName = str(image_cnt) + '.jpeg'
        targetLabel.append([fileName, label])
        if label == 0:
            negative_cnt += 1
        else:
            positive_cnt += 1

        # dsize is (width, height): the stored image is 50 wide and 150 high.
        img = cv2.resize(img, (50, 150), interpolation=cv2.INTER_CUBIC)
        cv2.imwrite(targetDir + fileName, img)
        image_cnt += 1
####### pyramid operator
def MinAndEnlarge(img, Minus_pixel=3):
    """Trim a border off the image and scale the remainder back to 50x150.

    ``Minus_pixel`` columns are removed from the left and right and
    ``3 * Minus_pixel`` rows from the top and bottom, then the crop is
    resized to 50 wide by 150 high with bicubic interpolation.

    The crop bounds are hard-coded against 150 rows and 50 columns and the
    slice indexes three axes, so this presumably expects a 50x150 colour
    image such as the ones readImage writes.
    """
    rowMargin = 3 * Minus_pixel
    cropped = img[rowMargin:150 - rowMargin, Minus_pixel:50 - Minus_pixel, :]
    return cv2.resize(cropped, (50, 150), interpolation=cv2.INTER_CUBIC)
####### flip (mirror) operator
def Flip(img, operator=1):
    """Mirror an image with ``cv2.flip``.

    ``operator == 1`` uses flip code 1 (a horizontal mirror according to the
    OpenCV documentation); any other value uses flip code 0 (a vertical
    mirror).
    """
    flipCode = 1 if operator == 1 else 0
    return cv2.flip(img, flipCode)
####### median blurring the image
def Blur(img, kernel_size=5):
    """Return ``img`` smoothed by ``cv2.medianBlur`` with aperture ``kernel_size``.

    NOTE(review): OpenCV documents that the aperture must be odd and greater
    than 1; that is not validated here.
    """
    return cv2.medianBlur(img, kernel_size)
def saveLabel( targetDir ): #存储标签
global targetLabel
with open(targetDir + 'label.txt', 'w') as writer:
for i in range(len(targetLabel)):
writer.write( str( targetLabel[i][0] )