1. The CIFAR-10 dataset
The CIFAR-10 dataset on the Kaggle platform is organized as follows:
The test folder contains 300,000 images and the train folder contains 50,000 images.
trainLabels.csv maps each training image id to its label (one row per image).
sampleSubmission.csv uses the same id,label layout for the test images, with a placeholder label in every row.
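To make the layout concrete, here are illustrative first rows of both files, reconstructed from the parsing output further below (the exact header text is an assumption):

trainLabels.csv:
id,label
1,frog
2,truck
3,truck
4,deer
5,automobile

sampleSubmission.csv:
id,label
1,cat
2,cat
3,cat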
2. Hands-on: reading the CIFAR-10 dataset with a Keras generator
class_names = [
'airplane',
'automobile',
'bird',
'cat',
'deer',
'dog',
'frog',
'horse',
'ship',
'truck',
]
train_lables_file = './cifar10/trainLabels.csv'
test_csv_file = './cifar10/sampleSubmission.csv'
train_folder = './cifar10/train/'
test_folder = './cifar10/test/'
def parse_csv_file(filepath, folder):
    """Parse a csv file into a list of (full image path, label) tuples."""
    results = []
    with open(filepath, 'r') as f:
        lines = f.readlines()[1:]
    for line in lines:
        image_id, label_str = line.strip('\n').split(',')
        image_full_path = os.path.join(folder, image_id + '.png')
        results.append((image_full_path, label_str))
    return results
train_labels_info = parse_csv_file(train_lables_file, train_folder)
test_csv_info = parse_csv_file(test_csv_file, test_folder)
import pprint
pprint.pprint(train_labels_info[0:5])
pprint.pprint(test_csv_info[0:5])
print(len(train_labels_info), len(test_csv_info))
[('./cifar10/train/1.png', 'frog'),
 ('./cifar10/train/2.png', 'truck'),
 ('./cifar10/train/3.png', 'truck'),
 ('./cifar10/train/4.png', 'deer'),
 ('./cifar10/train/5.png', 'automobile')]
[('./cifar10/test/1.png', 'cat'),
 ('./cifar10/test/2.png', 'cat'),
 ('./cifar10/test/3.png', 'cat'),
 ('./cifar10/test/4.png', 'cat'),
 ('./cifar10/test/5.png', 'cat')]
50000 300000
"""
在这里,一般会先把训练集切分为两部分:训练集和验证集。调整模型,当模型调整好后,
再用全部的训练集训练模型,
再在test上预测
"""
# train_df = pd.DataFrame(train_labels_info)
# Split into a training set and a validation set
train_df = pd.DataFrame(train_labels_info[0:45000])
valid_df = pd.DataFrame(train_labels_info[45000:])
test_df = pd.DataFrame(test_csv_info)
# Assign column names to the DataFrames
train_df.columns = ['filepath', 'class']
valid_df.columns = ['filepath', 'class']
test_df.columns = ['filepath', 'class']
print(train_df.head())
print(valid_df.head())
print(test_df.head())
                 filepath       class
0   ./cifar10/train/1.png        frog
1   ./cifar10/train/2.png       truck
2   ./cifar10/train/3.png       truck
3   ./cifar10/train/4.png        deer
4   ./cifar10/train/5.png  automobile
                    filepath       class
0  ./cifar10/train/45001.png       horse
1  ./cifar10/train/45002.png  automobile
2  ./cifar10/train/45003.png        deer
3  ./cifar10/train/45004.png  automobile
4  ./cifar10/train/45005.png    airplane
               filepath class
0  ./cifar10/test/1.png   cat
1  ./cifar10/test/2.png   cat
2  ./cifar10/test/3.png   cat
3  ./cifar10/test/4.png   cat
4  ./cifar10/test/5.png   cat
Note that the code below reads from a DataFrame (flow_from_dataframe), unlike the 10_monkeys example, which read from a directory (flow_from_directory); a comparison sketch follows the training generator below:
height = 32
width = 32
channels = 3
batch_size = 32
num_classes = 10
train_datagen = keras.preprocessing.image.ImageDataGenerator(
rescale = 1./255,
rotation_range = 40,
width_shift_range = 0.2,
height_shift_range = 0.2,
shear_range = 0.2,
zoom_range = 0.2,
horizontal_flip = True,
fill_mode = 'nearest',
)
train_generator = train_datagen.flow_from_dataframe(
train_df,
directory = './', # root directory: set to the current path
x_col = 'filepath',
y_col = 'class',
classes = class_names,
target_size = (height, width),
batch_size = batch_size,
seed = 7,
shuffle = True,
class_mode = 'sparse',
)
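For comparison, a minimal sketch of the directory-based call used for the 10_monkeys dataset; the './10_monkeys/training' path and folder layout are hypothetical placeholders, and flow_from_directory derives each label from the image's subfolder name:

# Comparison sketch only (not part of this script): flow_from_directory expects
# one subfolder per class and infers labels from the folder names.
train_generator_dir = train_datagen.flow_from_directory(
    './10_monkeys/training',   # hypothetical path with one subfolder per class
    target_size = (height, width),
    batch_size = batch_size,
    seed = 7,
    shuffle = True,
    class_mode = 'sparse')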
valid_datagen = keras.preprocessing.image.ImageDataGenerator(
rescale = 1./255)
valid_generator = valid_datagen.flow_from_dataframe(
valid_df,
directory = './', # the current directory
x_col = 'filepath',
y_col = 'class',
classes = class_names, # map the y_col labels to integer IDs according to class_names
target_size = (height, width),
batch_size = batch_size,
seed = 7,
shuffle = False,
class_mode = "sparse")
train_num = train_generator.samples
valid_num = valid_generator.samples
print(train_num, valid_num)
Found 45000 validated image filenames belonging to 10 classes.
Found 5000 validated image filenames belonging to 10 classes.
45000 5000
for i in range(2):
    x, y = train_generator.next()
    print(x.shape, y.shape)
    print(y)
(32, 32, 32, 3) (32,)
[2. 1. 4. 4. 4. 4. 6. 5. 2. 8. 4. 6. 6. 3. 7. 1. 7. 2. 8. 8. 3. 0. 5. 3. 9. 1. 4. 5. 6. 7. 9. 2.]
(32, 32, 32, 3) (32,)
[0. 7. 2. 7. 5. 5. 7. 0. 5. 4. 9. 7. 6. 3. 0. 4. 4. 4. 6. 3. 5. 4. 6. 6. 4. 1. 8. 2. 4. 4. 3. 0.]
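The integer labels come from the classes argument: because class_names was passed explicitly, index 0 should correspond to 'airplane', 1 to 'automobile', and so on. A quick way to check the mapping is the iterator's class_indices attribute (the dictionary in the comment is what I would expect, not captured output):

# Inspect the label-to-index mapping used by the generator; with classes=class_names
# it should follow the order of class_names.
print(train_generator.class_indices)
# expected: {'airplane': 0, 'automobile': 1, 'bird': 2, ...}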
3. Implementation: model training and prediction
Network architecture:
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= conv2d (Conv2D) (None, 32, 32, 128) 3584 _________________________________________________________________ batch_normalization (BatchNo (None, 32, 32, 128) 512 _________________________________________________________________ conv2d_1 (Conv2D) (None, 32, 32, 128) 147584 _________________________________________________________________ batch_normalization_1 (Batch (None, 32, 32, 128) 512 _________________________________________________________________ max_pooling2d (MaxPooling2D) (None, 16, 16, 128) 0 _________________________________________________________________ conv2d_2 (Conv2D) (None, 16, 16, 256) 295168 _________________________________________________________________ batch_normalization_2 (Batch (None, 16, 16, 256) 1024 _________________________________________________________________ conv2d_3 (Conv2D) (None, 16, 16, 256) 590080 _________________________________________________________________ batch_normalization_3 (Batch (None, 16, 16, 256) 1024 _________________________________________________________________ max_pooling2d_1 (MaxPooling2 (None, 8, 8, 256) 0 _________________________________________________________________ conv2d_4 (Conv2D) (None, 8, 8, 512) 1180160 _________________________________________________________________ batch_normalization_4 (Batch (None, 8, 8, 512) 2048 _________________________________________________________________ conv2d_5 (Conv2D) (None, 8, 8, 512) 2359808 _________________________________________________________________ batch_normalization_5 (Batch (None, 8, 8, 512) 2048 _________________________________________________________________ max_pooling2d_2 (MaxPooling2 (None, 4, 4, 512) 0 _________________________________________________________________ flatten (Flatten) (None, 8192) 0 _________________________________________________________________ dense (Dense) (None, 512) 4194816 _________________________________________________________________ dense_1 (Dense) (None, 10) 5130 ================================================================= Total params: 8,783,498 Trainable params: 8,779,914 Non-trainable params: 3,584 _________________________________________________________________
epochs = 5
history = model.fit_generator(train_generator,
steps_per_epoch = train_num // batch_size,
epochs = epochs,
validation_data = valid_generator,
validation_steps = valid_num // batch_size)
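Note that fit_generator is deprecated in more recent TensorFlow 2.x releases, where model.fit accepts generators directly. A roughly equivalent sketch (not part of the original run):

# Sketch for newer TF 2.x: model.fit consumes the Keras generators directly.
history = model.fit(train_generator,
                    steps_per_epoch = train_num // batch_size,
                    epochs = epochs,
                    validation_data = valid_generator,
                    validation_steps = valid_num // batch_size)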
# Use the trained model to make predictions on the test set
test_datagen = keras.preprocessing.image.ImageDataGenerator(
rescale = 1./255)
test_generator = test_datagen.flow_from_dataframe(
test_df,
directory = './',
x_col = 'filepath',
y_col = 'class',
classes = class_names,
target_size = (height, width),
batch_size = batch_size,
seed = 7,
shuffle = False,
class_mode = "sparse")
test_num = test_generator.samples
print(test_num)
Found 300000 validated image filenames belonging to 10 classes.
300000
for i in range(2):
    x, y = test_generator.next()
    print(x.shape, y.shape) #(32, 32, 32, 3) (32,)
    print(y)
(32, 32, 32, 3) (32,)
[3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
(32, 32, 32, 3) (32,)
[3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
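Because the labels in sampleSubmission.csv are only placeholders, every label coming out of the test generator is 3 ('cat') and carries no information. flow_from_dataframe also accepts class_mode = None, in which case the generator yields only image batches, which is all that prediction needs; a sketch under that assumption:

# Alternative sketch: ignore the placeholder labels entirely.
# With class_mode = None the generator yields image batches without labels.
test_generator_images_only = test_datagen.flow_from_dataframe(
    test_df,
    directory = './',
    x_col = 'filepath',
    class_mode = None,
    target_size = (height, width),
    batch_size = batch_size,
    shuffle = False)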
# Prediction: test_predict.shape is (300000, 10)
test_predict = model.predict_generator(test_generator,
workers = 5, # number of parallel workers
use_multiprocessing = False) # whether to parallelize with processes instead of threads
# Use the index of the highest probability as the predicted class
test_predict_class_indices = np.argmax(test_predict, axis = 1) # 300,000 integers, each the index of the max value
test_predict_class = [class_names[index]
for index in test_predict_class_indices]
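As a tiny worked example of the argmax step: for a single (made-up) row of class probabilities, the index of the largest value selects the class name.

# Illustration only, with a fabricated probability row (not model output):
row = np.array([0.01, 0.02, 0.05, 0.70, 0.02, 0.05, 0.05, 0.05, 0.03, 0.02])
print(np.argmax(row), class_names[np.argmax(row)])  # 3 cat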
# Write the predictions to the submission file
def generate_submissions(filename, predict_class):
    with open(filename, 'w') as f:
        f.write('id,label\n')
        for i in range(len(predict_class)):
            f.write('%d,%s\n' % (i+1, predict_class[i]))
output_file = "./cifar10/submission.csv"
generate_submissions(output_file, test_predict_class)
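An equivalent way to produce the same id,label file with pandas, shown only as an alternative sketch:

# Alternative sketch: build the submission with pandas instead of manual writes.
submission_df = pd.DataFrame({
    'id': range(1, len(test_predict_class) + 1),
    'label': test_predict_class,
})
submission_df.to_csv('./cifar10/submission.csv', index=False)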
Appendix: the full code
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sklearn
import sys
import tensorflow as tf
import time
from tensorflow import keras
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
tf.config.experimental.set_memory_growth(physical_devices[0], True)
class_names = [
'airplane',
'automobile',
'bird',
'cat',
'deer',
'dog',
'frog',
'horse',
'ship',
'truck',
]
train_lables_file = './cifar10/trainLabels.csv'
test_csv_file = './cifar10/sampleSubmission.csv'
train_folder = './cifar10/train/'
test_folder = './cifar10/test/'
def parse_csv_file(filepath, folder):
    """Parse a csv file into a list of (full image path, label) tuples."""
    results = []
    with open(filepath, 'r') as f:
        lines = f.readlines()[1:]
    for line in lines:
        image_id, label_str = line.strip('\n').split(',')
        image_full_path = os.path.join(folder, image_id + '.png')
        results.append((image_full_path, label_str))
    return results
train_labels_info = parse_csv_file(train_lables_file, train_folder)
test_csv_info = parse_csv_file(test_csv_file, test_folder)
import pprint
pprint.pprint(train_labels_info[0:5])
pprint.pprint(test_csv_info[0:5])
print(len(train_labels_info), len(test_csv_info))
"""
在这里,一般会先把训练集切分为两部分:训练集和验证集。调整模型,当模型调整好后,
再用全部的训练集训练模型,
再在test上预测
"""
# train_df = pd.DataFrame(train_labels_info)
# Split into a training set and a validation set
train_df = pd.DataFrame(train_labels_info[0:45000])
valid_df = pd.DataFrame(train_labels_info[45000:])
test_df = pd.DataFrame(test_csv_info)
print(train_df.head())
print(valid_df.head())
print(test_df.head())
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)
# Assign column names to the DataFrames
train_df.columns = ['filepath', 'class']
valid_df.columns = ['filepath', 'class']
test_df.columns = ['filepath', 'class']
print(train_df.head())
print(valid_df.head())
print(test_df.head())
height = 32
width = 32
channels = 3
batch_size = 32
num_classes = 10
train_datagen = keras.preprocessing.image.ImageDataGenerator(
rescale = 1./255,
rotation_range = 40,
width_shift_range = 0.2,
height_shift_range = 0.2,
shear_range = 0.2,
zoom_range = 0.2,
horizontal_flip = True,
fill_mode = 'nearest',
)
train_generator = train_datagen.flow_from_dataframe(
train_df,
directory = './', # root directory: set to the current path
x_col = 'filepath',
y_col = 'class',
classes = class_names,
target_size = (height, width),
batch_size = batch_size,
seed = 7,
shuffle = True,
class_mode = 'sparse',
)
valid_datagen = keras.preprocessing.image.ImageDataGenerator(
rescale = 1./255)
valid_generator = valid_datagen.flow_from_dataframe(
valid_df,
directory = './', # the current directory
x_col = 'filepath',
y_col = 'class',
classes = class_names, # map the y_col labels to integer IDs according to class_names
target_size = (height, width),
batch_size = batch_size,
seed = 7,
shuffle = False,
class_mode = "sparse")
train_num = train_generator.samples
valid_num = valid_generator.samples
print(train_num, valid_num)
# Has data augmentation been applied yet? (It is applied on the fly each time a batch is drawn from the generator.)
for i in range(2):
    x, y = train_generator.next()
    print(x.shape, y.shape)
    print(y)
model = keras.models.Sequential([
keras.layers.Conv2D(filters=128, kernel_size=3, padding='same',
activation='relu',
input_shape=[width, height, channels]),
keras.layers.BatchNormalization(),
keras.layers.Conv2D(filters=128, kernel_size=3, padding='same',
activation='relu'),
keras.layers.BatchNormalization(),
keras.layers.MaxPool2D(pool_size=2),
keras.layers.Conv2D(filters=256, kernel_size=3, padding='same',
activation='relu'),
keras.layers.BatchNormalization(),
keras.layers.Conv2D(filters=256, kernel_size=3, padding='same',
activation='relu'),
keras.layers.BatchNormalization(),
keras.layers.MaxPool2D(pool_size=2),
keras.layers.Conv2D(filters=512, kernel_size=3, padding='same',
activation='relu'),
keras.layers.BatchNormalization(),
keras.layers.Conv2D(filters=512, kernel_size=3, padding='same',
activation='relu'),
keras.layers.BatchNormalization(),
keras.layers.MaxPool2D(pool_size=2),
keras.layers.Flatten(),
keras.layers.Dense(512, activation='relu'),
keras.layers.Dense(num_classes, activation='softmax'),
])
model.compile(loss="sparse_categorical_crossentropy",
optimizer="adam", metrics=['accuracy'])
model.summary()
epochs = 5
history = model.fit_generator(train_generator,
steps_per_epoch = train_num // batch_size,
epochs = epochs,
validation_data = valid_generator,
validation_steps = valid_num // batch_size)
def plot_learning_curves(history, label, epochs, min_value, max_value):
    data = {}
    data[label] = history.history[label]
    data['val_' + label] = history.history['val_' + label]
    pd.DataFrame(data).plot(figsize=(8, 5))
    plt.grid(True)
    plt.axis([0, epochs, min_value, max_value])
    plt.show()
plot_learning_curves(history, 'accuracy', epochs, 0, 1)
plot_learning_curves(history, 'loss', epochs, 0, 2)
# Use the trained model to make predictions on the test set
test_datagen = keras.preprocessing.image.ImageDataGenerator(
rescale = 1./255)
test_generator = test_datagen.flow_from_dataframe(
test_df,
directory = './',
x_col = 'filepath',
y_col = 'class',
classes = class_names,
target_size = (height, width),
batch_size = batch_size,
seed = 7,
shuffle = False,
class_mode = "sparse")
test_num = test_generator.samples
print(test_num)
for i in range(2):
    x, y = test_generator.next()
    print(x.shape, y.shape) #(32, 32, 32, 3) (32,)
    print(y)
# Predict
test_predict = model.predict_generator(test_generator,
workers = 5, # number of parallel workers
use_multiprocessing = False) # whether to parallelize with processes instead of threads
# Use the index of the highest probability as the predicted class
test_predict_class_indices = np.argmax(test_predict, axis = 1) # 300,000 integers, each the index of the max value
test_predict_class = [class_names[index]
for index in test_predict_class_indices]
# Write the predictions to the submission file
def generate_submissions(filename, predict_class):
    with open(filename, 'w') as f:
        f.write('id,label\n')
        for i in range(len(predict_class)):
            f.write('%d,%s\n' % (i+1, predict_class[i]))
output_file = "./cifar10/submission.csv"
generate_submissions(output_file, test_predict_class)