Python制作并读取二分类数据集(猫狗示例)
一、新建文件目录
1.新建文件目录datasets保存两个子目录(两个类别):
- datasets
  - cats
    - cat01.jpg
    - …
  - dogs
    - dog01.jpg
    - …
二、导入所需的第三方库
import os
import cv2
import numpy as np
import random
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
三、读取并生成对应的标签
1.输入图片文件夹的路径,将该文件夹(含子文件夹)下所有图片的完整路径保存到一个列表中:
def get_files(input_dir):
    """Recursively collect the paths of all image files under ``input_dir``.

    Args:
        input_dir: Root directory to search; sub-directories are walked too.

    Returns:
        A list with one path per matching file, built by joining the
        directory being walked with the file name. A file matches when its
        name ends in ``.jpg``, ``.png`` or ``.bmp``, compared
        case-insensitively. Directories and files are visited in sorted name
        order, so the result is deterministic. The list is empty when nothing
        matches.
    """
    image_extensions = ('.jpg', '.png', '.bmp')
    file_list = []
    for path, dir_names, file_names in os.walk(input_dir):
        # path: the directory currently being walked
        # dir_names: sub-directories of `path`; sorting the list in place
        #            fixes the order in which os.walk descends into them
        # file_names: names of the files located directly inside `path`
        dir_names.sort()
        for file_name in sorted(file_names):
            # Lower-case before matching so names such as 'IMG.JPG' are kept.
            if file_name.lower().endswith(image_extensions):
                # Store the full path, not just the bare file name.
                file_list.append(os.path.join(path, file_name))
    return file_list
2.读取图片并调整图片的大小,生成对应的label列表:
def read_img_label(file_list, label, size):
    """Load images, resize them to ``size`` x ``size`` and pair each with ``label``.

    Args:
        file_list: Iterable of image file paths.
        label: Value appended to the label list once per loaded image.
        size: Target edge length; passed to ``cv2.resize`` as ``(size, size)``.

    Returns:
        A tuple ``(imges, labels)`` of two equally long lists: the resized
        images and one copy of ``label`` for each of them. Files that
        ``cv2.imread`` cannot decode are skipped with a warning, so the two
        lists always stay aligned.
    """
    import warnings  # local import: only used on the unreadable-file path

    imges = []
    labels = []
    for img_path in file_list:
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports a missing or corrupt file by returning None
            # rather than raising; passing None on to cv2.resize would fail
            # with an error that does not mention the offending path.
            warnings.warn(f'Skipping unreadable image: {img_path}')
            continue
        # Resize to a square of the requested edge length.
        imges.append(cv2.resize(img, (size, size)))
        labels.append(label)
    return imges, labels
3.读取含有两个类别的数据集文件夹,设置标签值, 合并图片列表和标签列表:
def read_datasets(data_dir, class_1, class_2, size):
    """Read a two-class image dataset into an image array and a label array.

    Args:
        data_dir: Dataset root directory.
        class_1: Sub-directory name of the first class; its images get label 0.
        class_2: Sub-directory name of the second class; its images get label 1.
        size: Edge length forwarded to ``read_img_label`` for resizing.

    Returns:
        A tuple ``(img_array, label_array)`` of NumPy arrays, with all
        ``class_1`` samples first and all ``class_2`` samples after them.
    """
    all_images = []
    all_labels = []
    # enumerate assigns label 0 to class_1 and label 1 to class_2.
    for label_value, class_name in enumerate((class_1, class_2)):
        class_paths = get_files(os.path.join(data_dir, class_name))
        class_images, class_labels = read_img_label(class_paths, label_value, size)
        all_images.extend(class_images)
        all_labels.extend(class_labels)
    return np.array(all_images), np.array(all_labels)
四、划分成训练集和测试集
def load_data(data_dir, class_1, class_2, size, test_size=0.3, random_state=None):
    """Load the two-class dataset and split it into training and test sets.

    Args:
        data_dir: Dataset root directory.
        class_1: Sub-directory name of the first class (label 0).
        class_2: Sub-directory name of the second class (label 1).
        size: Edge length the images are resized to.
        test_size: Forwarded to ``train_test_split``; defaults to 0.3, the
            value that used to be hard-coded.
        random_state: Seed forwarded to ``train_test_split``. Pass a fixed
            value for a reproducible split. When ``None`` (the default), a
            seed is drawn with ``random.randint(0, 100)`` as before.

    Returns:
        ``(x_train, y_train), (x_test, y_test)``.
    """
    imges, labels = read_datasets(data_dir, class_1, class_2, size)
    # Make the (n_samples, size, size, 3) layout explicit; the reshape raises
    # if the loaded data does not have exactly that many elements.
    imges = imges.reshape(imges.shape[0], size, size, 3)
    if random_state is None:
        # Keep the historical default so existing callers see no change.
        random_state = random.randint(0, 100)
    x_train, x_test, y_train, y_test = train_test_split(
        imges, labels, test_size=test_size, random_state=random_state
    )
    return (x_train, y_train), (x_test, y_test)
五、读取数据集并进行归一化和独热编码
# Load the cats/dogs dataset, resized to 416 x 416, already split into
# training and test portions.
(x_train, y_train), (x_test, y_test) = load_data(
    data_dir='./datasets', class_1='cats', class_2='dogs', size=416
)

# Convert the pixel data to float32 and divide by 255.
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

# One-hot encode the integer labels for the two classes.
# NOTE(review): newer Keras releases reportedly no longer expose
# `keras.utils.np_utils`; verify against the installed version.
y_train = np_utils.to_categorical(y_train, num_classes=2)
y_test = np_utils.to_categorical(y_test, num_classes=2)