项目简介:
本项目旨在通过使用深度学习技术实现对图像进行分类。我采用了DDPM(Denoising Diffusion Probabilistic Models,去噪扩散概率模型)和InceptionNeXt两个模型的结合:先用DDPM生成图像来扩充训练数据,再进行图像分类,以提高图像分类的准确性和性能。同时,我还使用了数据增强技术来增加训练数据的多样性,从而提升模型的泛化能力。
DDPM:
DDPM(Denoising Diffusion Probabilistic Models)是基于扩散过程的生成模型:正向过程逐步向数据样本添加高斯噪声,反向过程则从纯噪声出发,通过连续的去噪操作逐渐生成数据样本。在每个反向步骤中,DDPM 利用训练好的神经网络预测当前样本中的噪声并将其去除,再加入微小的随机扰动来得到下一个样本。通过多个步骤的迭代,DDPM 可以生成高质量、多样性的样本。DDPM 的一个关键优点是它能够生成高分辨率的图像样本,并在图像生成任务中取得了显著的成就。这种模型还可以用于图像修复、超分辨率、图像生成等应用领域。我将使用它对一部分数据集进行训练,生成少量图像来扩充数据集,以增加模型的鲁棒性。
网络构建:我使用的是基于PyTorch的DDPM框架:正向过程迭代地向图像添加噪声(对应代码中的perturb_x),网络学习预测所添加的噪声;生成时以随机噪声为输入,通过迭代地去除噪声(对应代码中的remove_noise与sample),逐步生成与目标图像更接近的样本。该过程还可以接受可选的类别标签y作为条件,从而成为条件生成过程。
建立U-net网络:
class UNet(nn.Module):
    """U-Net style encoder/decoder with optional time and class conditioning.

    The network is assembled from ``ResidualBlock``, ``Downsample`` and
    ``Upsample`` layers (defined elsewhere in the project). The output of
    ``forward`` has ``img_channels`` channels, i.e. the same channel count
    as the input image.

    Args:
        img_channels: number of channels of the input (and output) image.
        base_channels: channel count produced by the first convolution; every
            stage uses ``base_channels * mult``.
        channel_mults: per-stage channel multipliers; a ``Downsample`` is
            inserted after every stage except the last one.
        num_res_blocks: residual blocks per down stage (up stages use one
            more, to consume every stored skip connection).
        time_emb_dim: width of the time embedding; ``None`` disables the
            time MLP entirely.
        time_emb_scale: scale forwarded to ``PositionalEmbedding``.
        num_classes: forwarded to every ``ResidualBlock``; when not ``None``
            ``forward`` requires ``y``.
        activation: activation module applied before the output convolution
            and forwarded to every ``ResidualBlock``.
        dropout: dropout value forwarded to every ``ResidualBlock``.
        attention_resolutions: indices into ``channel_mults`` whose residual
            blocks are built with ``use_attention=True``.
        norm: normalisation identifier forwarded to ``get_norm`` and the
            residual blocks.
        num_groups: group count forwarded together with ``norm``.
        initial_pad: padding added on all four sides of the input and cropped
            from the output again.
    """

    def __init__(
        self, img_channels, base_channels=128, channel_mults=(1, 2, 4, 8),
        num_res_blocks=3, time_emb_dim=128 * 4, time_emb_scale=1.0, num_classes=None, activation=SiLU(),
        dropout=0.1, attention_resolutions=(1,), norm="gn", num_groups=32, initial_pad=0,
    ):
        super().__init__()
        self.activation = activation
        # Amount of padding applied to the input (0 disables padding).
        self.initial_pad = initial_pad
        # Number of classes used for class conditioning (None disables it).
        self.num_classes = num_classes

        # MLP that turns the time step into an embedding vector.
        self.time_mlp = nn.Sequential(
            PositionalEmbedding(base_channels, time_emb_scale),
            nn.Linear(base_channels, time_emb_dim),
            SiLU(),
            nn.Linear(time_emb_dim, time_emb_dim),
        ) if time_emb_dim is not None else None

        # First convolution applied to the input image.
        self.init_conv = nn.Conv2d(img_channels, base_channels, 3, padding=1)

        # self.downs holds the down-sampling path: ResidualBlocks extract
        # features, Downsample then reduces the spatial size.
        self.downs = nn.ModuleList()
        self.ups = nn.ModuleList()

        # `channels` records the channel count after every down-path layer so
        # the up path knows the width of each skip connection;
        # `now_channels` tracks the channel count of the running feature map.
        channels = [base_channels]
        now_channels = base_channels

        for i, mult in enumerate(channel_mults):
            out_channels = base_channels * mult

            for _ in range(num_res_blocks):
                self.downs.append(
                    ResidualBlock(
                        now_channels, out_channels, dropout,
                        time_emb_dim=time_emb_dim, num_classes=num_classes, activation=activation,
                        norm=norm, num_groups=num_groups, use_attention=i in attention_resolutions,
                    )
                )
                now_channels = out_channels
                channels.append(now_channels)

            # No down-sampling after the last stage.
            if i != len(channel_mults) - 1:
                self.downs.append(Downsample(now_channels))
                channels.append(now_channels)

        # Middle section: feature integration at the lowest resolution.
        self.mid = nn.ModuleList(
            [
                ResidualBlock(
                    now_channels, now_channels, dropout,
                    time_emb_dim=time_emb_dim, num_classes=num_classes, activation=activation,
                    norm=norm, num_groups=num_groups, use_attention=True,
                ),
                ResidualBlock(
                    now_channels, now_channels, dropout,
                    time_emb_dim=time_emb_dim, num_classes=num_classes, activation=activation,
                    norm=norm, num_groups=num_groups, use_attention=False,
                ),
            ]
        )

        # Up-sampling path: every ResidualBlock consumes one skip connection,
        # hence the input width is skip channels + current channels.
        for i, mult in reversed(list(enumerate(channel_mults))):
            out_channels = base_channels * mult

            for _ in range(num_res_blocks + 1):
                self.ups.append(ResidualBlock(
                    channels.pop() + now_channels, out_channels, dropout,
                    time_emb_dim=time_emb_dim, num_classes=num_classes, activation=activation,
                    norm=norm, num_groups=num_groups, use_attention=i in attention_resolutions,
                ))
                now_channels = out_channels

            # No up-sampling after the final (highest-resolution) stage.
            if i != 0:
                self.ups.append(Upsample(now_channels))

        # Internal invariant: every recorded skip width was consumed.
        assert len(channels) == 0

        self.out_norm = get_norm(norm, base_channels, num_groups)
        self.out_conv = nn.Conv2d(base_channels, img_channels, 3, padding=1)

    def forward(self, x, time=None, y=None):
        """Run the network on ``x``.

        Args:
            x: input image batch.
            time: time-step input for the time MLP; required when the time
                MLP is enabled.
            y: class labels; required when ``num_classes`` is not ``None``.

        Returns:
            Tensor with ``img_channels`` channels; when ``initial_pad`` is
            non-zero the padding is cropped off again.

        Raises:
            ValueError: if time or class conditioning is enabled but the
                corresponding argument was not passed.
        """
        # Optionally pad the input on all four sides.
        ip = self.initial_pad
        if ip != 0:
            x = F.pad(x, (ip,) * 4)

        # Compute the time embedding.
        if self.time_mlp is not None:
            if time is None:
                raise ValueError("time conditioning was specified but time is not passed")
            time_emb = self.time_mlp(time)
        else:
            time_emb = None

        if self.num_classes is not None and y is None:
            raise ValueError("class conditioning was specified but y is not passed")

        # First convolution on the input image.
        x = self.init_conv(x)

        # `skips` stores the intermediate feature maps of the down path.
        skips = [x]
        for layer in self.downs:
            x = layer(x, time_emb, y)
            skips.append(x)

        # Feature integration in the middle section.
        for layer in self.mid:
            x = layer(x, time_emb, y)

        # Up path: only ResidualBlocks take a skip connection; Upsample
        # layers are applied to the running feature map directly.
        for layer in self.ups:
            if isinstance(layer, ResidualBlock):
                x = torch.cat([x, skips.pop()], dim=1)
            x = layer(x, time_emb, y)

        # Output head.
        x = self.activation(self.out_norm(x))
        x = self.out_conv(x)

        if self.initial_pad != 0:
            return x[:, :, ip:-ip, ip:-ip]
        else:
            return x
Diffusion模型
class GaussianDiffusion(nn.Module):
    """Gaussian diffusion wrapper around a noise-estimating model.

    Holds the trainable ``model`` plus an exponential-moving-average copy
    (``ema_model``), precomputes the schedule-dependent coefficients as
    buffers, and provides the training loss (``forward``) and the iterative
    sampling procedure (``sample``).

    Args:
        model: network called as ``model(x, t, y)`` that estimates the noise
            contained in ``x``.
        img_size: ``(height, width)`` of the images.
        img_channels: number of image channels.
        num_classes: stored on the instance; not otherwise used here.
        betas: 1-D sequence (list, tuple or numpy array) with the noise
            schedule, one value per time step.
        loss_type: ``"l1"`` or ``"l2"``.
        ema_decay: decay passed to ``EMA``.
        ema_start: number of ``update_ema`` steps during which the EMA model
            is simply overwritten with the current weights.
        ema_update_rate: the EMA model is touched every this many steps.

    Raises:
        ValueError: on an unknown ``loss_type`` or an empty / non 1-D
            ``betas``.
    """

    def __init__(
        self, model, img_size, img_channels, num_classes=None, betas=(), loss_type="l2",
        ema_decay=0.9999, ema_start=2000, ema_update_rate=1,
    ):
        super().__init__()
        self.model = model
        self.ema_model = deepcopy(model)

        self.ema = EMA(ema_decay)
        self.ema_decay = ema_decay
        self.ema_start = ema_start
        self.ema_update_rate = ema_update_rate
        self.step = 0

        self.img_size = img_size
        self.img_channels = img_channels
        self.num_classes = num_classes

        # Only the l1 and l2 losses are supported.
        if loss_type not in ["l1", "l2"]:
            raise ValueError("__init__() got unknown loss type")
        self.loss_type = loss_type

        # Accept any sequence: a plain list would otherwise fail on
        # `1.0 - betas` below.
        betas = np.asarray(betas, dtype=np.float64)
        if betas.ndim != 1 or betas.size == 0:
            raise ValueError("betas must be a non-empty 1-D sequence")
        self.num_timesteps = len(betas)

        alphas = 1.0 - betas
        alphas_cumprod = np.cumprod(alphas)

        # Everything below is stored as a float32 torch buffer.
        to_torch = partial(torch.tensor, dtype=torch.float32)

        # Example values (1000-step linear schedule from 1e-4 to 0.02):
        # betas          [0.0001, 0.00011992, 0.00013984 ... , 0.02]
        self.register_buffer("betas", to_torch(betas))
        # alphas         [0.9999, 0.99988008, 0.99986016 ... , 0.98]
        self.register_buffer("alphas", to_torch(alphas))
        # alphas_cumprod [9.99900000e-01, 9.99780092e-01, 9.99640283e-01 ... , 4.03582977e-05]
        self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))

        # sqrt(alphas_cumprod)
        self.register_buffer("sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod)))
        # sqrt(1 - alphas_cumprod)
        self.register_buffer("sqrt_one_minus_alphas_cumprod", to_torch(np.sqrt(1 - alphas_cumprod)))
        # sqrt(1 / alphas)
        self.register_buffer("reciprocal_sqrt_alphas", to_torch(np.sqrt(1 / alphas)))

        # betas / sqrt(1 - alphas_cumprod): weight of the estimated noise
        # that is subtracted in `remove_noise`.
        self.register_buffer("remove_noise_coeff", to_torch(betas / np.sqrt(1 - alphas_cumprod)))
        # sqrt(betas): scale of the fresh noise added during sampling.
        self.register_buffer("sigma", to_torch(np.sqrt(betas)))

    def update_ema(self):
        """Advance the step counter and refresh the EMA model.

        Every ``ema_update_rate`` steps the EMA model is either overwritten
        with the current weights (while ``step < ema_start``) or updated via
        ``EMA.update_model_average``.
        """
        self.step += 1
        if self.step % self.ema_update_rate == 0:
            if self.step < self.ema_start:
                self.ema_model.load_state_dict(self.model.state_dict())
            else:
                self.ema.update_model_average(self.ema_model, self.model)

    @torch.no_grad()
    def remove_noise(self, x, t, y, use_ema=True):
        """Apply one denoising step to ``x`` at time steps ``t``.

        Computes ``(x - remove_noise_coeff[t] * eps) * reciprocal_sqrt_alphas[t]``
        where ``eps`` is the noise estimated by the EMA model (``use_ema``)
        or by the raw model.
        """
        noise_model = self.ema_model if use_ema else self.model
        return (
            (x - extract(self.remove_noise_coeff, t, x.shape) * noise_model(x, t, y)) *
            extract(self.reciprocal_sqrt_alphas, t, x.shape)
        )

    @torch.no_grad()
    def sample(self, batch_size, device, y=None, use_ema=True):
        """Generate ``batch_size`` images by iterating the denoising step.

        Starts from Gaussian noise and walks the time steps from
        ``num_timesteps - 1`` down to 0; fresh noise scaled by ``sigma`` is
        added after every step except the last.

        Returns:
            The final samples as a detached CPU tensor.

        Raises:
            ValueError: if ``y`` is given and its length differs from
                ``batch_size``.
        """
        if y is not None and batch_size != len(y):
            raise ValueError("sample batch size different from length of given y")

        x = torch.randn(batch_size, self.img_channels, *self.img_size, device=device)

        for t in range(self.num_timesteps - 1, -1, -1):
            t_batch = torch.tensor([t], device=device).repeat(batch_size)
            x = self.remove_noise(x, t_batch, y, use_ema)

            if t > 0:
                x += extract(self.sigma, t_batch, x.shape) * torch.randn_like(x)

        return x.cpu().detach()

    @torch.no_grad()
    def sample_diffusion_sequence(self, batch_size, device, y=None, use_ema=True):
        """Like ``sample`` but return every intermediate state.

        Returns:
            List of detached CPU tensors: the initial noise followed by the
            state after each of the ``num_timesteps`` steps.

        Raises:
            ValueError: if ``y`` is given and its length differs from
                ``batch_size``.
        """
        if y is not None and batch_size != len(y):
            raise ValueError("sample batch size different from length of given y")

        x = torch.randn(batch_size, self.img_channels, *self.img_size, device=device)
        diffusion_sequence = [x.cpu().detach()]

        for t in range(self.num_timesteps - 1, -1, -1):
            t_batch = torch.tensor([t], device=device).repeat(batch_size)
            x = self.remove_noise(x, t_batch, y, use_ema)

            if t > 0:
                x += extract(self.sigma, t_batch, x.shape) * torch.randn_like(x)

            diffusion_sequence.append(x.cpu().detach())

        return diffusion_sequence

    def perturb_x(self, x, t, noise):
        """Noise ``x`` to time steps ``t``.

        Returns ``sqrt_alphas_cumprod[t] * x + sqrt_one_minus_alphas_cumprod[t] * noise``.
        """
        return (
            extract(self.sqrt_alphas_cumprod, t, x.shape) * x +
            extract(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * noise
        )

    def get_losses(self, x, t, y):
        """Return the l1/l2 loss between sampled and estimated noise."""
        # x and noise share a shape, e.g. [batch_size, 3, 64, 64] with the
        # training configuration used in this project.
        noise = torch.randn_like(x)

        perturbed_x = self.perturb_x(x, t, noise)
        estimated_noise = self.model(perturbed_x, t, y)

        if self.loss_type == "l1":
            loss = F.l1_loss(estimated_noise, noise)
        elif self.loss_type == "l2":
            loss = F.mse_loss(estimated_noise, noise)
        return loss

    def forward(self, x, y=None):
        """Compute the training loss for a batch of images.

        A random time step is drawn per sample and passed to ``get_losses``.

        Raises:
            ValueError: if the image height or width differs from
                ``img_size``.
        """
        b, c, h, w = x.shape
        device = x.device

        if h != self.img_size[0]:
            raise ValueError("image height does not match diffusion parameters")
        # Width is the second entry of img_size (the original compared
        # against the height, which is wrong for non-square images).
        if w != self.img_size[1]:
            raise ValueError("image width does not match diffusion parameters")

        t = torch.randint(0, self.num_timesteps, (b,), device=device)
        return self.get_losses(x, t, y)
def generate_cosine_schedule(T, s=0.008):
    """Return a cosine noise schedule with ``T`` beta values.

    The cumulative signal level at step ``t`` is
    ``cos(((t / T + s) / (1 + s)) * pi / 2) ** 2`` normalised by its value at
    ``t = 0``; each beta is ``1 - level[t] / level[t - 1]``, capped at 0.999.

    Args:
        T: number of time steps.
        s: offset used inside the cosine.

    Returns:
        numpy array holding the ``T`` betas.
    """
    def squared_cosine(step):
        return (np.cos((step / T + s) / (1 + s) * np.pi / 2)) ** 2

    reference = squared_cosine(0)
    levels = [squared_cosine(step) / reference for step in range(T + 1)]

    # Consecutive (previous, current) pairs of the normalised levels.
    capped = [
        min(1 - current / previous, 0.999)
        for previous, current in zip(levels, levels[1:])
    ]
    return np.array(capped)
def generate_linear_schedule(T, low, high):
    """Return ``T`` betas evenly spaced from ``low`` to ``high`` (inclusive)."""
    schedule = np.linspace(start=low, stop=high, num=T)
    return schedule
训练函数:
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Training configuration
    # ------------------------------------------------------------------
    # Move the model to GPU(s).
    Cuda = True
    # Use DistributedDataParallel (expects LOCAL_RANK / RANK in the
    # environment) instead of DataParallel.
    distributed = False
    # Mixed-precision training with a GradScaler.
    fp16 = True
    # Optional checkpoint to initialise from ('' = train from scratch).
    diffusion_model_path = ""
    # Base channel count of the UNet.
    channel = 64
    # Beta schedule: "cosine", anything else selects the linear schedule.
    schedule = "linear"
    num_timesteps = 1000
    schedule_low = 1e-4
    schedule_high = 0.02
    # Image (height, width).
    input_shape = (64, 64)

    Init_Epoch = 0
    Epoch = 200
    batch_size = 16

    Init_lr = 2e-4
    Min_lr = Init_lr * 0.01

    optimizer_type = "adamw"
    momentum = 0.9
    weight_decay = 0
    lr_decay_type = "cos"

    save_period = 25
    save_dir = 'logs'
    num_workers = 0
    annotation_path = "train_lines.txt"

    # ------------------------------------------------------------------
    # Device / process-group setup
    # ------------------------------------------------------------------
    ngpus_per_node = torch.cuda.device_count()
    if distributed:
        dist.init_process_group(backend="nccl")
        local_rank = int(os.environ["LOCAL_RANK"])
        rank = int(os.environ["RANK"])
        device = torch.device("cuda", local_rank)
        if local_rank == 0:
            print(f"[{os.getpid()}] (rank = {rank}, local_rank = {local_rank}) training...")
            print("Gpu Device Count : ", ngpus_per_node)
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        local_rank = 0

    # ------------------------------------------------------------------
    # Noise schedule and model
    # ------------------------------------------------------------------
    if schedule == "cosine":
        betas = generate_cosine_schedule(num_timesteps)
    else:
        # The low/high bounds are specified for 1000 steps and rescaled for
        # other step counts.
        betas = generate_linear_schedule(
            num_timesteps,
            schedule_low * 1000 / num_timesteps,
            schedule_high * 1000 / num_timesteps,
        )

    diffusion_model = GaussianDiffusion(UNet(3, channel), input_shape, 3, betas=betas)

    # Load pretrained weights, keeping only entries that exist in the model
    # with a matching shape. The membership test avoids a KeyError when the
    # checkpoint contains keys the current model does not have.
    if diffusion_model_path != '':
        model_dict = diffusion_model.state_dict()
        pretrained_dict = torch.load(diffusion_model_path, map_location=device)
        pretrained_dict = {
            k: v for k, v in pretrained_dict.items()
            if k in model_dict and np.shape(model_dict[k]) == np.shape(v)
        }
        model_dict.update(pretrained_dict)
        diffusion_model.load_state_dict(model_dict)

    # Only the process with local_rank 0 records the loss history.
    if local_rank == 0:
        time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d_%H_%M_%S')
        log_dir = os.path.join(save_dir, "loss_" + str(time_str))
        loss_history = LossHistory(log_dir, [diffusion_model], input_shape=input_shape)
    else:
        loss_history = None

    if fp16:
        from torch.cuda.amp import GradScaler as GradScaler
        scaler = GradScaler()
    else:
        scaler = None

    diffusion_model_train = diffusion_model.train()

    if Cuda:
        if distributed:
            diffusion_model_train = diffusion_model_train.cuda(local_rank)
            diffusion_model_train = torch.nn.parallel.DistributedDataParallel(
                diffusion_model_train, device_ids=[local_rank], find_unused_parameters=True,
            )
        else:
            cudnn.benchmark = True
            diffusion_model_train = torch.nn.DataParallel(diffusion_model)
            diffusion_model_train = diffusion_model_train.cuda()

    # ------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------
    # One annotation line per training sample.
    with open(annotation_path) as f:
        lines = f.readlines()
    num_train = len(lines)

    if local_rank == 0:
        show_config(
            input_shape=input_shape, Init_Epoch=Init_Epoch, Epoch=Epoch, batch_size=batch_size,
            Init_lr=Init_lr, Min_lr=Min_lr, optimizer_type=optimizer_type, momentum=momentum,
            lr_decay_type=lr_decay_type, save_period=save_period, save_dir=save_dir,
            num_workers=num_workers, num_train=num_train,
        )

    # ------------------------------------------------------------------
    # Optimizer, LR schedule and training loop
    # ------------------------------------------------------------------
    # Look up the optimizer class first so only the selected optimizer is
    # instantiated (an unknown optimizer_type still raises KeyError).
    optimizer_classes = {
        'adam': optim.Adam,
        'adamw': optim.AdamW,
    }
    optimizer = optimizer_classes[optimizer_type](
        diffusion_model_train.parameters(), lr=Init_lr, betas=(momentum, 0.999), weight_decay=weight_decay,
    )

    lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr, Min_lr, Epoch)

    # Number of optimisation steps per epoch (computed before the
    # per-GPU batch-size split below).
    epoch_step = num_train // batch_size
    if epoch_step == 0:
        raise ValueError("数据集过小,无法进行训练,请扩充数据集。")

    train_dataset = DiffusionDataset(lines, input_shape)

    if distributed:
        # The sampler shuffles, so the DataLoader itself must not.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True,)
        batch_size = batch_size // ngpus_per_node
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    gen = DataLoader(
        train_dataset, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers, pin_memory=True,
        drop_last=True, collate_fn=Diffusion_dataset_collate, sampler=train_sampler,
    )

    for epoch in range(Init_Epoch, Epoch):
        if distributed:
            train_sampler.set_epoch(epoch)

        set_optimizer_lr(optimizer, lr_scheduler_func, epoch)

        fit_one_epoch(diffusion_model_train, diffusion_model, loss_history, optimizer,
                      epoch, epoch_step, gen, Epoch, Cuda, fp16, scaler, save_period, save_dir, local_rank)

        if distributed:
            dist.barrier()
训练结果:
将生成好的图像填充到我的植物数据集中,准备接下来的图像分类任务。