自定义的数据集:
从kaggle下载的猫狗数据集
import os
from PIL import Image
from torch.utils.data import Dataset
# Class-name -> integer label mapping; the sub-directory names on disk
# must match these keys exactly ("cats" / "dogs").
catdog_label = {"cats": 0, "dogs": 1}


class DogCatDataset(Dataset):
    """Dataset for the Kaggle cat-vs-dog classification task.

    Image paths are discovered once at construction time; the actual
    image files are loaded lazily, one at a time, in ``__getitem__``.
    """

    def __init__(self, data_dir, transform=None):
        """
        Args:
            data_dir (str): root directory of the dataset; it must contain
                one sub-directory per class ("cats", "dogs").
            transform (callable, optional): torchvision transform applied
                to each PIL image before it is returned.
        """
        # List of (image_path, int_label) pairs, fixed for the dataset's lifetime.
        self.data_info = self.get_img_info(data_dir)
        self.transform = transform

    def __getitem__(self, index):
        """Return the ``(image, label)`` pair at ``index``."""
        path_img, label = self.data_info[index]
        # Force RGB so grayscale/RGBA files all yield 3-channel images.
        img = Image.open(path_img).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img, label

    def __len__(self):
        """Number of images discovered under ``data_dir``."""
        return len(self.data_info)

    @staticmethod
    def get_img_info(data_dir):
        """Walk ``data_dir`` and collect ``(image_path, int_label)`` pairs.

        Only ``*.jpg`` files are collected; each file's label is taken
        from its class sub-directory name via ``catdog_label``.
        """
        data_info = []
        for root, dirs, _ in os.walk(data_dir):
            for sub_dir in dirs:  # one sub-directory per class
                sub_path = os.path.join(root, sub_dir)
                # sorted() makes the sample order deterministic across runs
                # (os.listdir order is platform-dependent).
                img_names = sorted(
                    name for name in os.listdir(sub_path)
                    if name.endswith('.jpg')
                )
                for img_name in img_names:
                    path_img = os.path.join(sub_path, img_name)
                    label = catdog_label[sub_dir]
                    data_info.append((path_img, int(label)))
        return data_info
dataset 实际是根据索引(路径)获取图片和标签
调用dataset实现数据读取:
from torchvision import transforms
from DogCatDataset import DogCatDataset
import os
from torch.utils.data import DataLoader
# Dataset root and the train / validation split directories.
split_dir = 'DogCatDataset'
train_dir = os.path.join(split_dir, "training_set")
valid_dir = os.path.join(split_dir, 'test_set')

# ImageNet channel statistics used for input normalization.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training applies augmentation (random crop / grayscale); validation
# only resizes and normalizes so evaluation is deterministic.
_train_tf = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomGrayscale(p=0.9),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])
_valid_tf = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])
data_transform = {'train': _train_tf, 'valid': _valid_tf}

# Datasets index the on-disk folders; loaders batch them
# (shuffling only the training split).
train_dataset = DogCatDataset(data_dir=train_dir, transform=data_transform['train'])
valid_dataset = DogCatDataset(data_dir=valid_dir, transform=data_transform['valid'])
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=16, shuffle=False)