VGG16 详解
我的pytorch代码实现:vgg16
我们在vgg16神经网络上训练了SIGNS数据集,这是一个分类的数据集,在我的github上有介绍怎么下载数据集以及如何训练。
VGG16是一个卷积神经网络(CNN)架构,它在2014年的ILSVRC(Imagenet)比赛中取得了优异成绩。它被认为是迄今为止最优秀的视觉模型之一。VGG16最独特的地方在于,它不是使用大量的超参数,而是专注于使用3x3过滤器的卷积层,步幅为1,并始终使用相同的填充和2x2过滤器的最大池层。它始终在整个架构中一致地遵循这种卷积和最大池层的排列方式。最后,它有3个全连接层,最后一层的输出经过softmax得到分类结果。
VGG16网络使用pytorch实现
class VGG16(nn.Module):
    """VGG16 convolutional network (Simonyan & Zisserman, 2014).

    Five stages of 3x3 stride-1 pad-1 convolutions (each followed by a
    ReLU), with a 2x2 stride-2 max-pool closing each stage, then an
    adaptive average pool to 7x7 and a fully connected classifier head.

    Args:
        num_classes: number of output classes (default 6, for the SIGNS
            dataset used in this tutorial).
    """

    def __init__(self, num_classes=6):
        super(VGG16, self).__init__()
        # Feature extractor. Channel widths per stage: 64,64 / 128,128 /
        # 256x3 / 512x3 / 512x3. Every conv preserves spatial size
        # (kernel 3, padding 1), so only the pools downsample.
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        # Adaptive pooling makes the classifier input size independent of
        # the input image resolution.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        # Classifier head: two 4096-wide FC+ReLU+Dropout stages and the
        # output layer.
        # Fix: the original ended with nn.Linear(4096, 1000) feeding
        # nn.Linear(1000, num_classes) with no nonlinearity in between.
        # Two consecutive affine maps compose into a single affine map, so
        # the extra layer added ~4M parameters without adding capacity; it
        # is replaced by a single Linear(4096, num_classes), matching the
        # standard VGG head.
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return raw class logits of shape (N, num_classes) for input x."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # (N, 512 * 7 * 7)
        x = self.classifier(x)
        return x
加载数据集
# Training-time transform pipeline: resize, augment with a random
# horizontal flip, then convert to a tensor.
train_transformer = transforms.Compose([
transforms.Resize(64), # NOTE(review): with an int argument Resize scales the SHORTER edge to 64; this yields 64x64 only for square images — confirm SIGNS images are square
transforms.RandomHorizontalFlip(), # random horizontal flip (data augmentation, training only)
transforms.ToTensor()]) # convert the PIL image to a torch tensor
# Evaluation-time transform pipeline: same as training but without the
# random flip, so evaluation is deterministic.
eval_transformer = transforms.Compose([
transforms.Resize(64), # NOTE(review): with an int argument Resize scales the SHORTER edge to 64, not necessarily to 64x64
transforms.ToTensor()]) # convert the PIL image to a torch tensor
class SIGNSDataset(Dataset):
    """SIGNS image dataset.

    Scans a directory for ``*.jpg`` files and serves (image, label)
    pairs, where the label is the leading digit of each file name.
    Implements the standard PyTorch ``Dataset`` protocol
    (``__len__`` / ``__getitem__``).
    """

    def __init__(self, data_dir, transform):
        """Collect the jpg paths under ``data_dir`` and store the transform.

        Args:
            data_dir: (string) directory containing the dataset
            transform: (torchvision.transforms) transformation to apply on image
        """
        entries = os.listdir(data_dir)
        # Keep only .jpg files, joined into full paths.
        self.filenames = [
            os.path.join(data_dir, name)
            for name in entries
            if name.endswith('.jpg')
        ]
        # The class label is encoded as the first character of the file name.
        self.labels = [
            int(os.path.split(path)[-1][0]) for path in self.filenames
        ]
        self.transform = transform

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Load, transform, and return the image at ``idx`` with its label.

        Args:
            idx: (int) index in [0, len(self) - 1]

        Returns:
            image: (Tensor) transformed image
            label: (int) corresponding label of the image
        """
        img = Image.open(self.filenames[idx])  # PIL image
        return self.transform(img), self.labels[idx]
def fetch_dataloader(types, data_dir, params):
    """Build a DataLoader for each requested split under ``data_dir``.

    Args:
        types: (list) any subset of 'train', 'val', 'test' depending on
            which data is required
        data_dir: (string) directory containing the dataset
        params: (Params) hyperparameters (batch_size, num_workers, cuda)

    Returns:
        data: (dict) maps each requested split name to its DataLoader
    """
    dataloaders = {}
    for split in ('train', 'val', 'test'):
        if split not in types:
            continue
        split_dir = os.path.join(data_dir, "{}_signs".format(split))
        is_train = split == 'train'
        # Training data gets the augmenting transform (random flip) and is
        # shuffled; evaluation data uses the deterministic transform.
        transform = train_transformer if is_train else eval_transformer
        dataloaders[split] = DataLoader(
            SIGNSDataset(split_dir, transform),
            batch_size=params.batch_size,
            shuffle=is_train,
            num_workers=params.num_workers,
            pin_memory=params.cuda,
        )
    return dataloaders
优化器
# Adam optimizer over all model parameters; the learning rate comes from
# the hyperparameter object.
optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)