For the underlying theory, see iwill323's CSDN post on efficient hardware/software design for deep learning: network compression, model pruning, weight sharing, and low-rank approximation.
Task and Dataset
Task Description
● Network compression: make the model smaller without losing performance
● Train a very small model to complete the HW3 task
Dataset
The data comes from the food-11 dataset used in HW3, with 11 classes in total.
● Training set: 9866 labeled images
● Validation set: 3430 labeled images
● Evaluation set: 3347 images
Imports
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset # "ConcatDataset" and "Subset" are possibly useful.
from torchvision.datasets import DatasetFolder, VisionDataset
from sklearn.model_selection import StratifiedShuffleSplit
from torchsummary import summary
from tqdm.auto import tqdm
import random
from d2l import torch as d2l
Helper Functions
same_seeds fixes the random seeds for reproducibility, and log writes the training progress both to stdout and to a log file.
def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def log(log_fw, text):  # define a logging function to trace the training process
    print(text)
    log_fw.write(str(text) + '\n')
    log_fw.flush()
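A minimal usage sketch of the two helpers (the log-file name here is my own placeholder, not part of the assignment):

same_seeds(0)                       # fix all seeds before any data or model code runs
log_fw = open('./output.log', 'w')  # this file handle is what log() expects
log(log_fw, 'seeds fixed, start preparing data')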
Data Processing
List the directories and the number of files in each
# running this will list all files under the input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    if len(filenames) > 0:
        print(f"{dirname}: {len(filenames)} files.")  # Show the .jpg file amounts in each split.

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml2022spring-hw13/food11-hw13: 1 files.
/kaggle/input/ml2022spring-hw13/food11-hw13/validation: 3430 files.
/kaggle/input/ml2022spring-hw13/food11-hw13/training: 9866 files.
/kaggle/input/ml2022spring-hw13/food11-hw13/evaluation: 3347 files.
transforms
test_tfm = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
train_tfm = transforms.Compose([
    # Add some useful transforms or augmentations here, based on your experience in HW3.
    transforms.RandomResizedCrop((224, 224), scale=(0.5, 1), ratio=(0.5, 2)),
    # You can change this, but be aware that the given teacher model's input size is (3, 224, 224).
    # Thus, an input size other than 224 might hurt the performance. Please be careful.
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomRotation(180),
    transforms.RandomAffine(30),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
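As a quick sanity check, both pipelines should map any RGB image to a (3, 224, 224) float tensor. The file path below is hypothetical; substitute any image from the training split:

img = Image.open('/kaggle/input/ml2022spring-hw13/food11-hw13/training/0_0.jpg')
print(train_tfm(img).shape)  # torch.Size([3, 224, 224])
print(test_tfm(img).shape)   # torch.Size([3, 224, 224])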
Dataset
In the classical machine-learning era, with datasets on the order of tens of thousands of samples, a common train/validation/test split is 6:2:2. When the data is very large (millions of samples), even 1% reserved for testing is over ten thousand samples, which is already enough, so more data can go to training; ratios like 98:1:1 or even 99.5:0.3:0.2 are common. See the Zhihu article 训练集、验证集和测试集 for reference.
In this assignment the training and validation sets come in roughly a 3:1 ratio. Since no test set needs to be held out, I wanted to give more data to training, so I modified the original code to pool the two sets and re-split them at a 9:1 train/validation ratio (9866 + 3430 = 13296 images in total, so the new split yields roughly 11966 training and 1330 validation images).
class FoodDataset(Dataset):
    def __init__(self, files, labels, tfm=test_tfm):
        super().__init__()
        self.files = files
        self.labels = labels
        print("One sample:", self.files[0])
        self.transform = tfm

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        if self.labels is not None:
            label = self.labels[idx]
        else:
            label = -1  # the test set has no labels
        return im, label
Data-loading Function
StratifiedShuffleSplit is used for stratified sampling, so every class keeps the same proportion in both splits; its effect is discussed in the last section.
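Before the full loader, here is a minimal self-contained sketch of what the stratified split guarantees, using toy labels rather than the real dataset:

# Toy data: 80 samples of class 0 and 20 of class 1.
toy_labels = np.array([0] * 80 + [1] * 20)
toy_files = np.arange(100)  # stand-ins for file names

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
train_idx, val_idx = next(iter(sss.split(toy_files, toy_labels)))

# The 10-sample validation split preserves the 4:1 class ratio: 8 vs 2.
print(np.bincount(toy_labels[val_idx]))  # [8 2]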
def loadData(dataset_dir, batch_train, batch_valid, num_workers, valid_ratio, train_tfm, test_tfm):
    # Prefix every file name with its directory, since the training and validation images live in different folders.
    train_path = os.path.join(dataset_dir, 'training')
    train_files = sorted([os.path.join(train_path, x) for x in os.listdir(train_path) if x.endswith(".jpg")])
    train_labels = [int(os.path.basename(x).split('_')[0]) for x in train_files]  # file names look like "<label>_<id>.jpg"
    val_path = os.path.join(dataset_dir, 'validation')
    val_files = sorted([os.path.join(val_path, x) for x in os.listdir(val_path) if x.endswith(".jpg")])
    val_labels = [int(os.path.basename(x).split('_')[0]) for x in val_files]

    # Pool both splits, then re-split them with stratified sampling.
    files = train_files + val_files
    labels = train_labels + val_labels
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=valid_ratio, random_state=0)
    splits = stratified_split.split(files, labels)
    train_split_id, val_split_id = next(iter(splits))  # train_split_id and val_split_id are arrays of indices

    df = pd.DataFrame({'files': files, 'labels': labels})
    train_files = df.iloc[train_split_id]['files'].values
    train_labels = df.iloc[train_split_id]['labels'].values
    train_set = FoodDataset(train_files, train_labels, tfm=train_tfm)
    train_loader = DataLoader(train_set, batch_size=batch_train, shuffle=True, num_workers=num_workers, pin_memory=True)

    val_files = df.iloc[val_split_id]['files'].values
    val_labels = df.iloc[val_split_id]['labels'].values
    valid_set = FoodDataset(val_files, val_labels, tfm=test_tfm)  # validation uses test_tfm (no augmentation); the original code mistakenly passed train_tfm here
    valid_loader = DataLoader(valid_set, batch_size=batch_valid, shuffle=False, num_workers=num_workers, pin_memory=True)

    print('training set size: {:d}, number of batches: {:.2f}'.format(len(train_set), len(train_set) / batch_train))
    print('validation set size: {:d}, number of batches: {:.2f}'.format(len(valid_set), len(valid_set) / batch_valid))
    return train_loader, valid_loader
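A hypothetical call, using the Kaggle path printed earlier; the batch sizes and worker count are placeholders, while valid_ratio=0.1 matches the 9:1 split discussed above:

dataset_dir = '/kaggle/input/ml2022spring-hw13/food11-hw13'
train_loader, valid_loader = loadData(
    dataset_dir,
    batch_train=64, batch_valid=64,  # placeholder batch sizes
    num_workers=2,                   # placeholder worker count
    valid_ratio=0.1,                 # 9:1 train/validation split
    train_tfm=train_tfm, test_tfm=test_tfm)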