(二)CGAN实战-按条件生成MNIST手写数字
一、CGAN论文要点
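- 普通GAN的目标函数:
  $$\min_G \max_D V(D, G) = \mathbb{E}_{x \sim p_{data}(x)}[\log D(x)] + \mathbb{E}_{z \sim p_z(z)}[\log(1 - D(G(z)))]$$
- CGAN目标函数(生成器和判别器都额外以条件 $y$ 作为输入):
  $$\min_G \max_D V(D, G) = \mathbb{E}_{x \sim p_{data}(x)}[\log D(x \mid y)] + \mathbb{E}_{z \sim p_z(z)}[\log(1 - D(G(z \mid y)))]$$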
- CGAN网络结构示例:
二、MNIST实战思想
我主要是这样想的:
首先训练判别器:把手写数字图片(32*32)和其对应的标签(经过嵌入层,将该数字对应的标签嵌入成10维向量)一起输入判别器,作为真实样本;再将随机的latent vector(100维)和随机的标签一同输入生成器得到fake图片,并把fake图片连同生成它时所用的标签一同输入判别器,作为假样本。然后训练生成器:将随机的latent vector(100维)和随机的标签一同输入生成器产生图片,再将图片连同同一组标签一起输入判别器,根据判别器给出的评价来更新生成器。
MNIST数据集部分图片展示:已经预处理为32 * 32
三、完整代码(详细注释)
import argparse
import os
import numpy as np
import torchvision.transforms as transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.functional as F
import torch
import torchvision
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import matplotlib.animation as animation
from IPython.display import HTML
from torch.autograd import Variable
import skimage.io as io
# Hyper-parameters are declared through argparse so that names, types and
# defaults live in one place.  ``parse_args(args=[])`` deliberately ignores
# sys.argv, so the defaults below are always the values in effect.
parser = argparse.ArgumentParser()
parser.add_argument('--dataroot', default='../dataset', help='root path for dataset')
parser.add_argument('--epochs', type=int, default=200, help='number of epochs of training')
parser.add_argument('--batch_size', type=int, default=128, help='size of the batchs')
parser.add_argument('--lr', type=float, default=0.0002, help='Adam: learning rate')
parser.add_argument('--beta1', type=float, default=0.5, help='adam: decay of first order momentum of gradient')
# Fixed help text: beta2 is the decay of the SECOND-order moment estimate.
parser.add_argument('--beta2', type=float, default=0.999, help='adam: decay of second order momentum of gradient')
parser.add_argument('--n_gpu', type=int, default=1, help='number of gpu threads to use during batch generation')
parser.add_argument('--latent_dim', type=int, default=100, help='dimensionality of the latent space')
parser.add_argument('--n_classes', type=int, default=10, help='number of classes for dataset')
parser.add_argument('--img_size', type=int, default=32, help='size of each image dimension')
parser.add_argument('--channels', type=int, default=1, help='numbers of image channels')
parser.add_argument('--sample_interval', type=int, default=400, help='interval between image sampling')
args = parser.parse_args(args=[])
print(args)

# (channels, height, width) of a single image.
img_shape = (args.channels, args.img_size, args.img_size)

# Fix the seeds so runs are repeatable.
np.random.seed(999)
torch.manual_seed(999)

# The device string must be exactly 'cuda:0'; the previous 'cuda: 0' (with a
# space) is rejected as an invalid device string by current PyTorch releases.
device = torch.device('cuda:0' if (torch.cuda.is_available() and args.n_gpu > 0) else 'cpu')
# Create the dataset: resize to img_size x img_size, convert to a tensor and
# normalise with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.Resize(args.img_size),
    transforms.ToTensor(),
    # MNIST is single-channel, so mean and std each hold exactly one value.
    # One-element tuples are used because ``(0.5)`` is just a float.
    transforms.Normalize((0.5,), (0.5,)),
])
# download=False: the dataset is expected to already exist under args.dataroot.
dataset = torchvision.datasets.MNIST(root=args.dataroot, train=True, transform=transform, download=False)
dataloader = DataLoader(dataset=dataset, batch_size=args.batch_size, shuffle=True, num_workers=2)

# Optional preview of one batch:
# sample = next(iter(dataloader))
# plt.figure(figsize = (8, 8))
# plt.axis('off')
# plt.title('MNIST SAMPLE')
# plt.imshow(np.transpose(torchvision.utils.make_grid(sample[0][:64], normalize = True), (1,2,0)))
# ======================
# Generator
#=======================
class Generator(nn.Module):
    """Conditional generator: maps a noise vector plus a class label to an image.

    The label is passed through an embedding layer, concatenated with the
    flattened noise vector and fed to a stack of fully connected layers whose
    last activation is ``Tanh``.

    Args:
        latent_dim: Length of the noise vector. Defaults to ``args.latent_dim``.
        n_classes: Number of distinct labels. Defaults to ``args.n_classes``.
        image_shape: ``(C, H, W)`` of the produced image. Defaults to the
            module-level ``img_shape``.
        embedding_dim: Size of the label embedding (10 in the original code).
    """

    def __init__(self, latent_dim=None, n_classes=None, image_shape=None, embedding_dim=10):
        super(Generator, self).__init__()
        # Fall back to the script-level configuration when a value is not
        # given, so the original ``Generator()`` call keeps working.
        if latent_dim is None:
            latent_dim = args.latent_dim
        if n_classes is None:
            n_classes = args.n_classes
        self.img_shape = tuple(img_shape if image_shape is None else image_shape)

        # num_embeddings is the vocabulary size (one entry per class);
        # embedding_dim is the length of the vector each label is mapped to.
        self.embedding = nn.Embedding(num_embeddings=n_classes, embedding_dim=embedding_dim)
        self.model = nn.Sequential(
            # Input features = noise length + label embedding length.
            nn.Linear(latent_dim + embedding_dim, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(0.2, inplace=True),
            # np.prod multiplies all entries of the shape: C * H * W outputs.
            nn.Linear(1024, int(np.prod(self.img_shape))),
            nn.Tanh(),
        )

    def forward(self, noise, labels):
        """Generate one image per (noise, label) pair.

        Args:
            noise: Tensor whose first dimension is the batch; the remaining
                dimensions are flattened and must hold ``latent_dim`` values.
            labels: Integer class indices (LongTensor) whose first dimension
                is the batch and which hold one index per sample.

        Returns:
            Tensor of shape ``(batch_size, *image_shape)``.
        """
        # Embed the labels and flatten to [batch_size, embedding_dim].
        label_emb = self.embedding(labels)
        label_emb = label_emb.view(label_emb.size(0), -1)
        # Concatenate label embedding and noise: [batch_size, embedding_dim + latent_dim].
        gen_input = torch.cat((label_emb, noise.view(noise.size(0), -1)), dim=-1)
        img = self.model(gen_input)  # [batch_size, C * H * W]
        # Restore the image layout.
        img = img.view(img.size(0), *self.img_shape)
        return img
# ======================
# Discriminator
#=======================
class Discrimminator(nn.Module):
    """Conditional discriminator: scores an (image, class label) pair.

    The label is passed through an embedding layer, concatenated with the
    flattened image and fed to a stack of fully connected layers ending in a
    ``Sigmoid``, so every score lies in [0, 1].

    Args:
        n_classes: Number of distinct labels. Defaults to ``args.n_classes``.
        image_shape: ``(C, H, W)`` of the input image. Defaults to the
            module-level ``img_shape``.
        embedding_dim: Size of the label embedding (10 in the original code).
    """

    def __init__(self, n_classes=None, image_shape=None, embedding_dim=10):
        super(Discrimminator, self).__init__()
        # Fall back to the script-level configuration when a value is not
        # given, so the original ``Discrimminator()`` call keeps working.
        if n_classes is None:
            n_classes = args.n_classes
        shape = tuple(img_shape if image_shape is None else image_shape)
        in_features = int(np.prod(shape)) + embedding_dim

        self.embedding = nn.Embedding(num_embeddings=n_classes, embedding_dim=embedding_dim)
        self.model = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.Dropout(p=0.5, inplace=True),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.Dropout(p=0.5, inplace=True),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 128),
            nn.Dropout(p=0.5, inplace=True),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, img, labels):
        """Score a batch of images conditioned on their labels.

        Args:
            img: Tensor whose first dimension is the batch; the remaining
                dimensions are flattened.
            labels: Integer class indices (LongTensor) whose first dimension
                is the batch and which hold one index per sample.

        Returns:
            Tensor of shape ``(batch_size, 1)`` with values in [0, 1].
        """
        # Embed the labels and flatten to [batch_size, embedding_dim].
        label_emb = self.embedding(labels)
        label_emb = label_emb.view(label_emb.size(0), -1)
        # Concatenate flattened image and label embedding along the feature axis.
        d_input = torch.cat((img.view(img.size(0), -1), label_emb), dim=-1)
        result = self.model(d_input)
        return result
# Build both networks on the selected device and print their structure.
netG = Generator().to(device)
netD = Discrimminator().to(device)
print(netG)
print(netD)

# Show the network structure in TensorBoard.
writer = SummaryWriter('runs/CGAN/discriminator')
noise = torch.randn(128, args.latent_dim, 1, 1, device=device)
# torch.randint already returns int64 indices on `device`; the former
# torch.cuda.LongTensor(...) conversion was redundant and failed without CUDA.
noise_with_labels = torch.randint(0, args.n_classes, size=(128, 1, 1), device=device)
dataiter = iter(dataloader)
images, labels = next(dataiter)
images = images.to(device)
labels = labels.long().to(device)

# Author's note: only the first graph stored in a log folder is displayed, so
# the generator and discriminator graphs are meant to go to separate folders.
# writer.add_graph(netG, (noise, noise_with_labels))
writer.add_graph(netD, (images, labels))
# Flush instead of using `with writer`: leaving a `with` block closes the
# writer, which is still used for scalar logging during training.
writer.flush()
# Target values for the adversarial loss.
real = 1.0
fake = 0.0
criterion = nn.BCELoss()

# Optimizers
optimizer_G = torch.optim.Adam(netG.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
optimizer_D = torch.optim.Adam(netD.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))

# Make sure the output folders exist before anything is saved into them.
os.makedirs('Picture', exist_ok=True)
os.makedirs('Model', exist_ok=True)

# Grids of generated samples collected during training; it has to exist
# before the loop appends to it.
img_list = []

print("Starting Training Loop...")
# ----------
# Training
# ----------
for epoch in range(args.epochs):
    # These hold the values of the LAST batch of the epoch (used for logging).
    D_real = 0.0
    D_G_1 = 0.0
    D_G_2 = 0.0
    d_loss = 0.0
    g_loss = 0.0
    for i, (imgs, labels) in enumerate(dataloader):
        b_size = imgs.shape[0]

        # Adversarial ground truths, one value per sample.
        real_label = torch.full((b_size,), real, device=device)
        fake_label = torch.full((b_size,), fake, device=device)

        # Generate fake images from random noise and random labels.
        # torch.randint already yields int64 indices on `device`.
        noise = torch.randn(b_size, args.latent_dim, 1, 1, device=device)
        noise_with_labels = torch.randint(0, args.n_classes, size=(b_size, 1, 1), device=device)
        fake_images = netG(noise, noise_with_labels)

        real_imgs = imgs.to(device)
        labels = labels.long().to(device)

        # -----------------
        # Train Discriminator
        # -----------------
        netD.zero_grad()

        # Loss for real images.  The discriminator returns [b_size, 1]; it is
        # flattened so that its shape matches the [b_size] targets, which
        # BCELoss requires.
        output_real = netD(real_imgs, labels).view(-1)
        d_real_loss = criterion(output_real, real_label)
        D_real = output_real.mean().item()

        # Loss for fake images.  detach() stops this backward pass from
        # running through the generator, which is not updated in this step.
        output_fake = netD(fake_images.detach(), noise_with_labels).view(-1)
        d_fake_loss = criterion(output_fake, fake_label)
        D_G_1 = output_fake.mean().item()

        # Total discriminator loss
        d_loss = (d_real_loss + d_fake_loss) / 2
        d_loss.backward()
        optimizer_D.step()

        # -----------------
        # Train Generator
        # -----------------
        netG.zero_grad()
        # The generator has not changed since `fake_images` was produced, so
        # the same batch (graph still attached) is scored by the updated
        # discriminator; the generator wants it classified as real.
        output = netD(fake_images, noise_with_labels).view(-1)
        g_loss = criterion(output, real_label)
        D_G_2 = output.mean().item()
        g_loss.backward()
        optimizer_G.step()

    # Output training stats (values of the last batch of this epoch).
    g_loss_value = float(g_loss)
    d_loss_value = float(d_loss)
    print('[{}/{}] Loss_G:{:.4f} Loss_D:{:.4f} D_to_True:{:.4f} D_to_fake:{:.4f} D_to_G:{:.4f}'.format(
        epoch, args.epochs, g_loss_value, d_loss_value, D_real, D_G_1, D_G_2))
    writer.add_scalar('G_LOSS', g_loss_value, global_step=epoch)
    writer.add_scalar('D_LOSS', d_loss_value, global_step=epoch)

    if epoch % 5 == 0:
        with torch.no_grad():
            # Save a grid of generated digits: 10 rows, each row running
            # through the labels 0 .. n_classes - 1.
            n_rows = 10
            z = torch.randn(n_rows * args.n_classes, args.latent_dim, 1, 1, device=device)
            labels = torch.arange(args.n_classes, device=device).repeat(n_rows)
            labels = torch.reshape(labels, (-1, 1, 1))
            gen_images = netG(z, labels)  # [n_rows * n_classes, C, H, W]
            save_image(gen_images.data, 'Picture/{}.png'.format(epoch), nrow=args.n_classes, normalize=True)
            img_list.append(torchvision.utils.make_grid(gen_images, normalize=True).cpu())

    # Checkpointing: only the parameters (state_dict) are saved.
    torch.save(netG.state_dict(), 'Model/netG_epoch_{}.pth'.format(epoch))
    torch.save(netD.state_dict(), 'Model/netD_epoch_{}.pth'.format(epoch))
# Show the network structures in TensorBoard.
noise = torch.randn(64, args.latent_dim, 1, 1, device=device)
# torch.randint already yields int64 indices on `device`, so no
# torch.cuda.LongTensor conversion (which fails without CUDA) is needed.
noise_with_labels = torch.randint(0, args.n_classes, size=(64, 1, 1), device=device)
dataiter = iter(dataloader)
images, labels = next(dataiter)
# The inputs must live on the same device as the networks.
images = images.to(device)
labels = labels.long().to(device)

# add_graph expects ALL model inputs packed into one tuple; passed separately,
# the second tensor would be taken as the `verbose` argument.
# The generator graph gets its own log folder, following the author's note
# that only the first graph stored in a folder is displayed.
with SummaryWriter('runs/CGAN/generator') as w:
    w.add_graph(netG, (noise, noise_with_labels))
# Leaving this `with` block closes `writer`; training is finished here.
with writer as w:
    w.add_graph(netD, (images, labels))
# Animation demo: replay the sample grids saved every 5 epochs.
# Load the first 15 saved grids (epochs 0, 5, ..., 70) from disk.
img_list = []
for i in range(15):
    temp = io.imread('./Picture/{}.png'.format(i * 5))
    img_list.append(temp)

# ArtistAnimation expects a list of frames, each frame being a list of artists.
fig = plt.figure(figsize=(8, 8))
plt.axis("off")
ims = [[plt.imshow(frame, animated=True)] for frame in img_list]
ani = animation.ArtistAnimation(fig, ims, interval=1000, repeat_delay=1000, blit=True)
# Render as an interactive HTML/JS widget (displayed when run in a notebook).
HTML(ani.to_jshtml())
四、结果展示
到75轮差不多已经收敛。
epoch = 5时:
epoch = 25时:
epoch = 75时:
部分训练:
五、遇到的问题及解决
一、nn.Linear()层的输入不能是(batch_size, C, H, W)的形式,最后一个维度必须和Linear层的输入的特征节点数匹配(要先进行reshape),例如:
# nn.Linear(256, 128) only acts on the last dimension, which must hold
# exactly 256 features.
model = nn.Sequential(nn.Linear(256, 128))
# 128 * 3 * 32 * 32 values regrouped into rows of 256 features -> 1536 rows.
a = torch.randn(128, 3, 32, 32).reshape(-1, 256)
b = model(a)
b.shape
输出: torch.Size([1536, 128])
二、nn.Embedding()层的输入必须是LongTensor类型(dtype为torch.int64)的索引,并且要和模型位于同一设备上:数据在GPU上时,其类型为torch.cuda.LongTensor;数据在CPU上时,其类型为torch.LongTensor。总之,类型和设备都要对应好(可以用 tensor.long().to(device) 一次完成转换),例如:
----第一种情况:在cpu中运行的数据----
# The device string must be 'cuda:0' (no space); it is not used further in
# this CPU example.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = nn.Sequential(
    nn.Embedding(10, 3)
)
# Model and indices both stay on the CPU.
a = torch.tensor([1, 2, 3])
output = model(a)
output.size()
输出:torch.Size([3, 3])
----第二种情况:在GPU中运行的数据----
# The device string must be 'cuda:0' (no space).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = nn.Sequential(
    nn.Embedding(10, 3)
)
model.to(device)
# Create the indices on the model's device and make sure they are int64.
# .long() works on both CPU and GPU, so the CPU fallback of `device` keeps
# working, whereas torch.cuda.LongTensor(...) requires CUDA.
a = torch.tensor([1, 2, 3, 4], device=device).long()
output = model(a)
output.size()
输出:torch.Size([4, 3])
有一个bug:
# Bug reproduction: converting an ndarray to a tensor and then to
# torch.cuda.LongTensor raises:
# TypeError: expected Long (got Int)
# NOTE(review): presumably np.array([1,2,3]) is int32 on the author's platform,
# so from_numpy() yields an IntTensor; an explicit labels.long() should avoid
# the error -- TODO confirm.
labels = np.array([1,2,3])
labels = torch.from_numpy(labels)
labels = labels.to(device)
labels = torch.cuda.LongTensor(labels)
下面就不会报错:
# Working counterpart: the tensor is built directly from a Python list.
# NOTE(review): presumably torch.tensor() on Python ints already gives int64,
# which is why the conversion below succeeds -- TODO confirm.
labels = torch.tensor([1,2,3])
labels = labels.to(device)
labels = torch.cuda.LongTensor(labels)
三、动画演示:
# Load the first 15 saved sample grids (epochs 0, 5, ..., 70) from disk.
img_list = []
for i in range(15):
    temp = io.imread('./Picture/{}.png'.format(i * 5))
    img_list.append(temp)

# Animation demo: `ims` is a list of frames, each frame being a list of artists.
fig = plt.figure(figsize=(8, 8))
plt.axis("off")
ims = [[plt.imshow(frame, animated=True)] for frame in img_list]
ani = animation.ArtistAnimation(fig, ims, interval=1000, repeat_delay=1000, blit=True)
# Render as an interactive HTML/JS widget (displayed when run in a notebook).
HTML(ani.to_jshtml())
四、报错:TypeError: __init__() takes 1 positional argument but 3 were given
这个错误的原因是没有先实例化网络,或者没有通过实例化后的网络对象来调用,而是直接用类名进行forward。
神经网络构建的开头部分:
class AutoEncoder(nn.Module):
    """Minimal module skeleton whose constructor takes no arguments besides self."""

    def __init__(self):
        # Initialise nn.Module before any sub-modules are registered.
        super(AutoEncoder, self).__init__()
正常实例化一个简单的神经网络应该像下面这样,注意实例化时不传参数,这和__init__的定义是一致的(__init__只有一个self参数):
auto=AutoEncoder()
如果把类当成了对象直接调用,就会是这样:
out=AutoEncoder(input)
正确做法是:
out = auto(input)