数据描述
总共15000张苹果图片,全部由专家人为打分(分数范围0-100),任务为给苹果外观进行打分。4500张作为验证集,4500张作为测试集,其余6000张作为训练集。对训练集使用数据增强,将每张图片进行一次增强,得到12000张训练数据。
数据增强
使用Keras自带模块进行数据增强。
from keras.preprocessing import image  # Keras image-processing utilities
from keras.preprocessing.image import ImageDataGenerator  # batched augmentation generator
'''
Rescale every pixel value by 1/255 (normalize intensities into [0, 1] —
this scales pixel values, not the image dimensions) so the network trains
on small inputs, and augment each image with random rotation, shifts,
zoom and flips.
'''
# NOTE(review): the original passed zoom_range twice, which is a
# SyntaxError (duplicate keyword argument); the duplicate is removed.
train_datagen = ImageDataGenerator(rescale=1./255,
                                   rotation_range=70,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   zoom_range=[0.8,1.2],
                                   horizontal_flip=True,
                                   vertical_flip=True)
数据矩阵创建
# Load the expert-score spreadsheet for each data split.
_score_paths = (r"D:\train.xlsx", r"D:\validation.xlsx", r"D:\test.xlsx")
train_excel, val_excel, test_excel = (pd.read_excel(p) for p in _score_paths)
分数表格如下图所示
# Read the images: target input size and the per-split sample directories.
img_width, img_height, channels = 224, 224, 3
train_sample_dir = 'D:\\train\\'
train_nb_samples = len(os.listdir(train_sample_dir))
val_sample_dir = 'D:\\validation\\'
val_nb_samples = len(os.listdir(val_sample_dir))
# Training set: build a (Filename, score) table, one row per image,
# where the score is the mean expert Rating rounded to 2 decimals.
# Fixes: the original refiltered the whole spreadsheet once per filename
# (O(n*m)) and computed an unused Counter mode; a single groupby-mean
# yields the same rows in the same (sorted-by-filename) order.
train_filenames = train_excel.groupby('Filename').size().index.tolist()
train_labels = [
    {'Filename': train_sample_dir + fname, 'score': round(mean_rating, 2)}
    for fname, mean_rating in train_excel.groupby('Filename')['Rating'].mean().items()
]
train_labels_df = pd.DataFrame(train_labels)
# Full paths of every training image actually present on disk.
train_files = [train_sample_dir + name for name in os.listdir(train_sample_dir)]
# Validation set: same construction as the training table — mean expert
# Rating per filename, rounded to 2 decimals.
# Fixes: removed the per-filename refiltering (O(n*m)) and the unused
# Counter mode; one groupby-mean produces identical rows in the same order.
val_filenames = val_excel.groupby('Filename').size().index.tolist()
val_labels = [
    {'Filename': val_sample_dir + fname, 'score': round(mean_rating, 2)}
    for fname, mean_rating in val_excel.groupby('Filename')['Rating'].mean().items()
]
val_labels_df = pd.DataFrame(val_labels)
# Full paths of every validation image actually present on disk.
val_files = [val_sample_dir + name for name in os.listdir(val_sample_dir)]
数据生成器
将NumPy数组持续地分批次地喂给神经网络。由于内存有限,所以将数据分开导入神经网络,而不是一次性全部读取。
# Training-set batch generator: feeds NumPy batches to the network
# indefinitely instead of loading all images into memory at once.
def train_image_generator(files, batch_size):
    """Yield (images, labels) batches forever for Keras fit_generator.

    Images are read from `files`, scaled into [0, 1]; each label is the
    score looked up in the module-level `train_labels_df` by file path.
    Only len(files) // batch_size full batches are produced per pass —
    any remainder images are dropped.

    Fixes: `np.float` was removed from NumPy (use builtin float, i.e.
    float64 — same dtype as before), and the label lookup now uses the
    `files` parameter rather than the global `train_files`, so the
    generator works for any file list (identical behavior for the
    existing call where files is train_files).
    """
    n_batches = len(files) // batch_size
    while True:
        for b in range(n_batches):
            batch_images = np.empty(shape=(batch_size, img_width, img_height, channels), dtype=float)
            batch_labels = np.empty(shape=(batch_size, 1), dtype=float)
            for k, j in enumerate(range(b * batch_size, (b + 1) * batch_size)):
                batch_images[k] = imread(files[j]) / 255.
                batch_labels[k] = train_labels_df[train_labels_df.Filename == files[j]].score.values.astype('float')
            yield batch_images, batch_labels
# Validation-set batch generator: same contract as the training generator.
def val_image_generator(files, batch_size):
    """Yield (images, labels) batches forever for Keras evaluation.

    Images are read from `files`, scaled into [0, 1]; each label is the
    score looked up in the module-level `val_labels_df` by file path.
    Only len(files) // batch_size full batches are produced per pass —
    any remainder images are dropped.

    Fixes: `np.float` was removed from NumPy (use builtin float, i.e.
    float64 — same dtype as before), and the label lookup now uses the
    `files` parameter rather than the global `val_files`, so the
    generator works for any file list (identical behavior for the
    existing call where files is val_files).
    """
    n_batches = len(files) // batch_size
    while True:
        for b in range(n_batches):
            batch_images = np.empty(shape=(batch_size, img_width, img_height, channels), dtype=float)
            batch_labels = np.empty(shape=(batch_size, 1), dtype=float)
            for k, j in enumerate(range(b * batch_size, (b + 1) * batch_size)):
                batch_images[k] = imread(files[j]) / 255.
                batch_labels[k] = val_labels_df[val_labels_df.Filename == files[j]].score.values.astype('float')
            yield batch_images, batch_labels