循环GAN(CycleGAN)是一种生成对抗网络(GAN)的变体,它能够学习两个不同域(例如,照片和艺术作品)之间的映射,而无需成对的训练数据。CycleGAN由两个生成器(G_X和G_Y)和两个判别器(D_X和D_Y)组成,其中G_X将图像从域X转换到域Y,而G_Y将图像从域Y转换回域X。
以下是CycleGAN的主要组成部分:
-
生成器(Generator):
- G_X: 将图像从域X转换到域Y。
- G_Y: 将图像从域Y转换回域X。
-
判别器(Discriminator):
- D_X: 区分域X中的真实图像和由G_Y生成(从域Y转换而来)的图像。
- D_Y: 区分域Y中的真实图像和由G_X生成(从域X转换而来)的图像。
-
循环一致性损失(Cycle Consistency Loss):
- 确保生成器能够将图像从源域转换到目标域,然后将图像从目标域转换回源域,同时保持原始图像的原始特征。
-
身份保持损失(Identity Preservation Loss):
- 确保生成器在转换图像时能够保持图像的原始特征。
CycleGAN的主要优点是它能够学习两个不同域之间的映射,而无需成对的训练数据。这使得CycleGAN在图像到图像的转换任务中非常有用,例如,将真实照片转换为艺术作品,或将艺术作品转换为真实照片。
在训练过程中,CycleGAN使用生成器和判别器之间的对抗性来提高生成图像的质量。生成器试图生成逼真的图像来欺骗判别器,而判别器试图区分真实图像和生成图像。通过这种方式,生成器逐渐学习到生成逼真图像的技巧,从而提高了模型性能。
导入包和部署TPU
导入包
import os, cv2, re,logging, warnings, functools, PIL, shutil
from kaggle_datasets import KaggleDatasets
import random
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import BinaryCrossentropy
import numpy as np
import pandas as pd
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import matplotlib
部署TPU
挺奇怪的,在广州用TPU一直要排队,搁家里都不用排,直接就用
# Detect a TPU attached to this runtime; fall back to the default strategy
# (CPU/GPU) when none is available.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU {tpu.master()}')
except ValueError:
    # TPUClusterResolver raises ValueError when no TPU is configured.
    tpu = None
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()
# Number of synchronized replicas (TPU cores, or 1 on the default strategy).
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')
导入数据和构建Cycle gan
导入数据
# Let tf.data tune parallelism and prefetch buffer sizes automatically.
AUTO= tf.data.experimental.AUTOTUNE
# TFRecord shards for the two image domains (Monet paintings vs. photos).
MONET_FILENAMES = tf.io.gfile.glob('/kaggle/input/gan-getting-started/monet_tfrec/*.tfrec')
PHOTO_FILENAMES = tf.io.gfile.glob('/kaggle/input/gan-getting-started/photo_tfrec/*.tfrec')
def count_data_items(filenames):
    """Return the total number of records across TFRecord shards.

    Each shard filename encodes its record count as ``...-<count>.tfrec``
    (e.g. ``monet09-262.tfrec`` holds 262 images), so the total is the sum
    of those suffixes — no need to iterate the records themselves.

    Args:
        filenames: sequence of shard path strings.

    Returns:
        Total record count (NumPy integer scalar).

    Raises:
        ValueError: if a filename does not carry a ``-<count>.`` suffix.
    """
    pattern = re.compile(r"-([0-9]*)\.")  # compiled once, reused per filename
    counts = []
    for filename in filenames:
        match = pattern.search(filename)
        # Guard against unexpected names: the original unchecked
        # `.search(...).group(1)` would crash with AttributeError instead.
        if match is None or not match.group(1):
            raise ValueError(f"cannot parse record count from filename: {filename!r}")
        counts.append(int(match.group(1)))
    # Debug aid: show a few of the shard names being counted.
    for filename in filenames[:20]:
        print("Sample filenames:", filename)
    return np.sum(counts)
# Total images per domain, parsed from the shard filenames.
n_monet_samples = count_data_items(MONET_FILENAMES)
n_photo_samples = count_data_items(PHOTO_FILENAMES)
定义各种函数和类
用于解码和进行归一化
def decode_and_rescale(image):
    """Decode a JPEG byte string into a 256x256x3 float tensor in [-1, 1]."""
    decoded = tf.image.decode_jpeg(image, channels=3)
    # Map uint8 pixel values [0, 255] onto [-1, 1] to match the tanh output
    # range of the generators.
    scaled = tf.cast(decoded, tf.float32) / 127.5 - 1
    return tf.reshape(scaled, [256, 256, 3])
转换格式
def read_tfrecord(example):
    """Parse one serialized Example and return its decoded, rescaled image."""
    # All three features are stored as byte strings; only 'image' is consumed.
    feature_spec = {
        name: tf.io.FixedLenFeature([], tf.string)
        for name in ('image_name', 'image', 'target')
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_and_rescale(parsed['image'])
载入数据集
def load_dataset(filenames, labeled=True, ordered=False):
    """Build a tf.data pipeline of decoded images from TFRecord shards.

    NOTE(review): `labeled` and `ordered` are accepted but never used; they
    are kept only so existing call sites (which pass labeled=True) keep working.
    """
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(read_tfrecord, num_parallel_calls=AUTO)
    return dataset
定义单元归一化类
#自定义
class InstanceNormalization(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-5):
super(InstanceNormalization, self).__init__()
self.epsilon = epsilon
def build(self, input_shape):
self.scale = self.add_weight(
name='scale',
shape=input_shape[-1:],
initializer=tf.random_normal_initializer(1., 0.02),
trainable=True)
self.offset = self.add_weight(
name='offset',
shape=input_shape[-1:],
initializer='zeros',
trainable=True)
def call(self, x):
mean, variance = tf.nn.moments(x, axes=[1, 2], keepdims=True)
inv = tf.math.rsqrt(variance + self.epsilon)
normalized = (x - mean) * inv
return self.scale * normalized + self.offset
这个因为现在都升级到keras3了,有些代码已经不兼容了,我参考的notebook都4年前的东西了,本来这个是有简写为tfa的模块可以完成这个工作的,但是我下载之后用的时候一直报错,只能放弃,刚好看到其他notebook有这个类,就直接拿过来用了
可微数据增强
# Differentiable data augmentation (DiffAugment): random transforms applied to
# discriminator inputs; every op is differentiable so generator gradients flow
# through the augmented images.
with strategy.scope():
    def DiffAugment(x, policy='', channels_first=False):
        # Apply each transform registered under the comma-separated `policy`
        # names (e.g. "color,translation,cutout") to the NHWC batch x.
        if policy:
            if channels_first:
                x = tf.transpose(x, [0, 2, 3, 1])
            for p in policy.split(','):
                for f in AUGMENT_FNS[p]:
                    x = f(x)
            if channels_first:
                x = tf.transpose(x, [0, 3, 1, 2])
        return x
    def rand_brightness(x):
        # Per-image additive brightness shift drawn from [-0.5, 0.5).
        magnitude = tf.random.uniform([tf.shape(x)[0], 1, 1, 1]) - 0.5
        x = x + magnitude
        return x
    def rand_saturation(x):
        # Scale each pixel's deviation from its channel mean by [0, 2).
        magnitude = tf.random.uniform([tf.shape(x)[0], 1, 1, 1]) * 2
        # 0.333... = 1/3, the mean over the 3 color channels.
        x_mean = tf.reduce_sum(x, axis=3, keepdims=True) * 0.3333333333333333333
        x = (x - x_mean) * magnitude + x_mean
        return x
    def rand_contrast(x):
        # Scale deviation from the whole-image mean by [0.5, 1.5).
        magnitude = tf.random.uniform([tf.shape(x)[0], 1, 1, 1]) + 0.5
        # 5.086e-6 ~= 1/(256*256*3) -- assumes 256x256 RGB inputs; TODO confirm.
        x_mean = tf.reduce_sum(x, axis=[1, 2, 3], keepdims=True) * 5.086e-6
        x = (x - x_mean) * magnitude + x_mean
        return x
    def rand_translation(x, ratio=0.125):
        # Random integer shift of up to `ratio` of each spatial dimension,
        # implemented as pad-then-gather with clipped index grids so the op
        # stays differentiable with respect to x.
        batch_size = tf.shape(x)[0]
        image_size = tf.shape(x)[1:3]
        shift = tf.cast(tf.cast(image_size, tf.float32) * ratio + 0.5, tf.int32)
        translation_x = tf.random.uniform([batch_size, 1], -shift[0], shift[0] + 1, dtype=tf.int32)
        translation_y = tf.random.uniform([batch_size, 1], -shift[1], shift[1] + 1, dtype=tf.int32)
        # +1 compensates for the 1-pixel pad added below; clipping keeps
        # indices inside the padded image.
        grid_x = tf.clip_by_value(tf.expand_dims(tf.range(image_size[0], dtype=tf.int32), 0) + translation_x + 1, 0, image_size[0] + 1)
        grid_y = tf.clip_by_value(tf.expand_dims(tf.range(image_size[1], dtype=tf.int32), 0) + translation_y + 1, 0, image_size[1] + 1)
        # Shift rows, then transpose and shift columns the same way.
        x = tf.gather_nd(tf.pad(x, [[0, 0], [1, 1], [0, 0], [0, 0]]), tf.expand_dims(grid_x, -1), batch_dims=1)
        x = tf.transpose(tf.gather_nd(tf.pad(tf.transpose(x, [0, 2, 1, 3]), [[0, 0], [1, 1], [0, 0], [0, 0]]), tf.expand_dims(grid_y, -1), batch_dims=1), [0, 2, 1, 3])
        return x
    def rand_cutout(x, ratio=0.5):
        # Zero out one randomly placed square patch (side = ratio * image side)
        # per image, via a scatter_nd-built binary mask.
        batch_size = tf.shape(x)[0]
        image_size = tf.shape(x)[1:3]
        cutout_size = tf.cast(tf.cast(image_size, tf.float32) * ratio + 0.5, tf.int32)
        offset_x = tf.random.uniform([tf.shape(x)[0], 1, 1], maxval=image_size[0] + (1 - cutout_size[0] % 2), dtype=tf.int32)
        offset_y = tf.random.uniform([tf.shape(x)[0], 1, 1], maxval=image_size[1] + (1 - cutout_size[1] % 2), dtype=tf.int32)
        grid_batch, grid_x, grid_y = tf.meshgrid(tf.range(batch_size, dtype=tf.int32), tf.range(cutout_size[0], dtype=tf.int32), tf.range(cutout_size[1], dtype=tf.int32), indexing='ij')
        cutout_grid = tf.stack([grid_batch, grid_x + offset_x - cutout_size[0] // 2, grid_y + offset_y - cutout_size[1] // 2], axis=-1)
        mask_shape = tf.stack([batch_size, image_size[0], image_size[1]])
        # Clamp patch coordinates into the image so scatter_nd never writes
        # out of bounds (patches near the border are truncated).
        cutout_grid = tf.maximum(cutout_grid, 0)
        cutout_grid = tf.minimum(cutout_grid, tf.reshape(mask_shape - 1, [1, 1, 1, 3]))
        mask = tf.maximum(1 - tf.scatter_nd(cutout_grid, tf.ones([batch_size, cutout_size[0], cutout_size[1]], dtype=tf.float32), mask_shape), 0)
        x = x * tf.expand_dims(mask, axis=3)
        return x
    def data_augment_flip(image):
        # Plain horizontal-flip augmentation (not part of the DiffAugment policy).
        image = tf.image.random_flip_left_right(image)
        return image
    # Registry mapping policy names to the transforms DiffAugment applies.
    AUGMENT_FNS = {
        'color': [rand_brightness, rand_saturation, rand_contrast],
        'translation': [rand_translation],
        'cutout': [rand_cutout],
    }
def aug_fn(image):
    """Apply the full DiffAugment policy used on discriminator inputs."""
    policy = "color,translation,cutout"
    return DiffAugment(image, policy)
def data_augment_color(image):
    """Random horizontal flip followed by DiffAugment color jitter only."""
    flipped = tf.image.random_flip_left_right(image)
    return DiffAugment(flipped, "color")
def data_augment_flip(image):
    """Random left/right flip; the augmentation actually used for training below.

    NOTE(review): this redefines the identically-named function created inside
    the strategy scope above; both bodies are the same, so behavior is unchanged,
    but one of the two definitions could be dropped.
    """
    image = tf.image.random_flip_left_right(image)
    return image
可微数据增强(Differentiable Data Augmentation, DiffAugment)是一种用于GAN训练的技术:在把真实图像和生成图像送入判别器之前,对两者施加相同的随机增强(如颜色抖动、平移、随机遮挡),并且这些增强操作全部可微,使梯度能够经由增强后的图像反向传播回生成器。
它的主要作用是缓解判别器在小数据集上的过拟合,从而稳定对抗训练并提升生成质量,而不需要修改网络结构或损失函数本身。
这个可以自动进行数据增强策略的优化,用一般的数据增强我评分只到50分,我看排名靠前的基本都用了,但我自己用了没什么提升,效果还不如普通的数据增强策略
获得完整数据集用于训练
def get_gan_dataset(monet_files, photo_files, augment=None, repeat=True, shuffle=True, batch_size=1):
    """Build the zipped (monet, photo) training dataset.

    Both domains go through the same pipeline: load -> optional repeat ->
    optional shuffle -> batch (dropping the remainder so every batch is full,
    as TPUs require) -> optional augmentation -> prefetch.
    """
    def build_pipeline(files):
        # Shared per-domain pipeline; closes over the keyword arguments.
        ds = load_dataset(files)
        if repeat:
            ds = ds.repeat()
        if shuffle:
            ds = ds.shuffle(2048)
        ds = ds.batch(batch_size, drop_remainder=True)
        if augment:
            ds = ds.map(augment, num_parallel_calls=AUTO)
        return ds.prefetch(AUTO)

    return tf.data.Dataset.zip((build_pipeline(monet_files), build_pipeline(photo_files)))
下采样和上采样函数
def downsample(filters, size, apply_instancenorm=True):
    """Encoder block that halves spatial resolution.

    Conv2D(stride 2, no bias) -> [InstanceNormalization] -> LeakyReLU.

    Args:
        filters: number of output channels.
        size: square kernel size.
        apply_instancenorm: set False for the first encoder block, which
            skips normalization.

    Returns:
        A keras.Sequential implementing the block.
    """
    initializer = tf.random_normal_initializer(0., 0.02)
    # (removed unused local `gamma_init`; it was created but never referenced)
    result = keras.Sequential()
    result.add(layers.Conv2D(filters, size, strides=2, padding='same',
                             kernel_initializer=initializer, use_bias=False))
    if apply_instancenorm:
        result.add(InstanceNormalization())
    result.add(layers.LeakyReLU())
    return result
def upsample(filters, size, apply_dropout=False):
    """Decoder block that doubles spatial resolution.

    Conv2DTranspose(stride 2, no bias) -> InstanceNormalization ->
    [Dropout(0.5)] -> ReLU.
    """
    weight_init = tf.random_normal_initializer(0., 0.02)
    block = keras.Sequential()
    block.add(layers.Conv2DTranspose(filters, size, strides=2,
                                     padding='same',
                                     kernel_initializer=weight_init,
                                     use_bias=False))
    block.add(InstanceNormalization())
    if apply_dropout:
        block.add(layers.Dropout(0.5))
    block.add(layers.ReLU())
    return block
生成器和判别器
def Generator():
    """U-Net generator: 8 downsampling blocks, 7 upsampling blocks with skip
    connections, and a final stride-2 transposed conv with tanh producing a
    256x256xOUTPUT_CHANNELS image in [-1, 1] (matching input normalization).

    NOTE(review): relies on the module-level global OUTPUT_CHANNELS being
    defined before this function is called.
    """
    inputs = layers.Input(shape=[256,256,3])
    down_stack = [
        downsample(64, 4, apply_instancenorm=False), # (bs, 128, 128, 64)
        downsample(128, 4), # (bs, 64, 64, 128)
        downsample(256, 4), # (bs, 32, 32, 256)
        downsample(512, 4), # (bs, 16, 16, 512)
        downsample(512, 4), # (bs, 8, 8, 512)
        downsample(512, 4), # (bs, 4, 4, 512)
        downsample(512, 4), # (bs, 2, 2, 512)
        downsample(512, 4), ]  # (bs, 1, 1, 512) bottleneck
    up_stack = [
        upsample(512, 4, apply_dropout=True), # (bs, 2, 2, 1024)
        upsample(512, 4, apply_dropout=True), # (bs, 4, 4, 1024)
        upsample(512, 4, apply_dropout=True), # (bs, 8, 8, 1024)
        upsample(512, 4), # (bs, 16, 16, 1024)
        upsample(256, 4), # (bs, 32, 32, 512)
        upsample(128, 4), # (bs, 64, 64, 256)
        upsample(64, 4), # (bs, 128, 128, 128)
    ]
    initializer = tf.random_normal_initializer(0., 0.02)
    last = layers.Conv2DTranspose(OUTPUT_CHANNELS, 4,
                                  strides=2,
                                  padding='same',
                                  kernel_initializer=initializer,
                                  activation='tanh') # (bs, 256, 256, 3)
    x = inputs
    skips = []
    # Encoder pass: record every activation for the decoder's skip connections.
    for down in down_stack:
        x = down(x)
        skips.append(x)
    # Pair decoder blocks with encoder outputs, deepest first; the bottleneck
    # output (last element) is the decoder input, not a skip.
    skips = reversed(skips[:-1])
    for up, skip in zip(up_stack, skips):
        x = up(x)
        x = layers.Concatenate()([x, skip])
    x = last(x)
    return keras.Model(inputs=inputs, outputs=x)
def Discriminator():
    """PatchGAN discriminator: maps a 256x256x3 image to a 30x30x1 grid of
    per-patch real/fake logits.

    No final activation is applied; the loss functions below are constructed
    with from_logits=True.

    Returns:
        A keras.Model from image input to the logit grid.
    """
    initializer = tf.random_normal_initializer(0., 0.02)
    # (removed unused local `gamma_init`; it was created but never referenced)
    inp = layers.Input(shape=[256, 256, 3], name='input_image')
    x = inp
    down1 = downsample(64, 4, False)(x)        # (bs, 128, 128, 64)
    down2 = downsample(128, 4)(down1)          # (bs, 64, 64, 128)
    down3 = downsample(256, 4)(down2)          # (bs, 32, 32, 256)
    zero_pad1 = layers.ZeroPadding2D()(down3)  # (bs, 34, 34, 256)
    conv = layers.Conv2D(512, 4, strides=1,
                         kernel_initializer=initializer,
                         use_bias=False)(zero_pad1)  # (bs, 31, 31, 512)
    norm1 = InstanceNormalization()(conv)
    leaky_relu = layers.LeakyReLU()(norm1)
    zero_pad2 = layers.ZeroPadding2D()(leaky_relu)  # (bs, 33, 33, 512)
    last = layers.Conv2D(1, 4, strides=1,
                         kernel_initializer=initializer)(zero_pad2)  # (bs, 30, 30, 1)
    return tf.keras.Model(inputs=inp, outputs=last)
导入数据和设置参数
导入数据
# Batch-of-1 datasets used later for inference/submission (not training).
monet_ds = load_dataset(MONET_FILENAMES, labeled=True).batch(1)
photo_ds = load_dataset(PHOTO_FILENAMES, labeled=True).batch(1)
定义生成器和判别器
# Global batch size (split across TPU replicas by the strategy).
BATCH_SIZE = 128
# NOTE(review): EPOCHS_NUM is defined but the fit() call below hard-codes
# epochs=4 -- confirm which value is intended.
EPOCHS_NUM = 28
# Training pipeline with flip-only augmentation; shuffling disabled here.
full_dataset = get_gan_dataset(MONET_FILENAMES, PHOTO_FILENAMES, augment=data_augment_flip, repeat=True, shuffle=False, batch_size=BATCH_SIZE)
# Output channels of the generators (RGB).
OUTPUT_CHANNELS = 3
# Build all four networks under the strategy scope so their variables are
# created on the TPU replicas.
with strategy.scope():
    monet_generator = Generator() # transforms photos to Monet-esque paintings
    photo_generator = Generator() # transforms Monet paintings to be more like photos
    monet_discriminator = Discriminator() # differentiates real Monet paintings and generated Monet paintings
    photo_discriminator = Discriminator()
Cycle gan需要两个判别器和两个生成器,同时有四种损失,包括判别器损失,生成器损失,循环一致性损失,身份损失,域与域之间的联系靠的循环一致性损失
-
判别器损失(Discriminator Loss):
- 二进制交叉熵损失(Binary Cross-Entropy Loss): 判别器的目标是区分真实数据和生成数据。它对真实数据预测为1,对生成数据预测为0。判别器的损失函数通常使用二进制交叉熵,其输出值在0到1之间,接近1表示判别器预测的是真实数据,接近0表示判别器预测的是生成数据。
-
生成器损失(Generator Loss):
- 二进制交叉熵损失(Binary Cross-Entropy Loss): 生成器的目标与判别器相反,即欺骗判别器,使其把生成数据判定为真实数据。因此生成器损失以"判别器对生成图像的输出与标签1之间的交叉熵"来计算:判别器越倾向于把生成图像判为真实,生成器损失越小;反之损失越大。
-
循环一致性损失(Cycle Consistency Loss):
- L1范数损失(L1 Loss): 循环一致性损失用于确保生成器在转换图像时能够保持图像的原始特征。它计算生成器将图像从源域转换到目标域,然后将图像从目标域转换回源域的损失,确保两个转换过程的输出尽可能接近原始输入。
-
身份保持损失(Identity Preservation Loss):
- L1范数损失(L1 Loss): 身份保持损失用于约束生成器在输入已经属于其输出域的图像时不做多余改动。例如把莫奈画作输入"照片→莫奈"的生成器,输出应与原图基本一致;该损失计算这种输出与原始输入之间的L1距离。
这是从gpt直接抄的,本来还有个论文专门解释的,我当时忘了收藏
定义cycle gan
class CycleGan(keras.Model):
    """CycleGAN: two generators and two discriminators trained jointly.

    The custom `train_step` computes, for each translation direction, an
    adversarial generator loss, a shared cycle-consistency loss, and an
    identity loss, and applies DiffAugment to the Monet discriminator's
    inputs (real and generated alike).
    """
    def __init__(
        self,
        monet_generator,
        photo_generator,
        monet_discriminator,
        photo_discriminator,
        lambda_cycle=10,  # weight of the cycle-consistency / identity terms
    ):
        super(CycleGan, self).__init__()
        self.m_gen = monet_generator    # photo -> Monet
        self.p_gen = photo_generator    # Monet -> photo
        self.m_disc = monet_discriminator
        self.p_disc = photo_discriminator
        self.lambda_cycle = lambda_cycle
    def compile(
        self,
        m_gen_optimizer,
        p_gen_optimizer,
        m_disc_optimizer,
        p_disc_optimizer,
        gen_loss_fn,
        disc_loss_fn,
        cycle_loss_fn,
        identity_loss_fn
    ):
        """Store one optimizer per network plus the four loss callables."""
        super(CycleGan, self).compile()
        self.m_gen_optimizer = m_gen_optimizer
        self.p_gen_optimizer = p_gen_optimizer
        self.m_disc_optimizer = m_disc_optimizer
        self.p_disc_optimizer = p_disc_optimizer
        self.gen_loss_fn = gen_loss_fn
        self.disc_loss_fn = disc_loss_fn
        self.cycle_loss_fn = cycle_loss_fn
        self.identity_loss_fn = identity_loss_fn
    def train_step(self, batch_data):
        """One optimization step over a (monet, photo) batch pair."""
        real_monet, real_photo = batch_data
        batch_size = tf.shape(real_monet)[0]
        # persistent=True: the same tape must supply gradients for all four
        # networks after the forward pass.
        with tf.GradientTape(persistent=True) as tape:
            # Forward cycles: photo -> monet -> photo, monet -> photo -> monet.
            fake_monet = self.m_gen(real_photo, training=True)
            cycled_photo = self.p_gen(fake_monet, training=True)
            fake_photo = self.p_gen(real_monet, training=True)
            cycled_monet = self.m_gen(fake_photo, training=True)
            # Identity mappings: each generator fed an image already in its
            # own output domain should leave it unchanged.
            same_monet = self.m_gen(real_monet, training=True)
            same_photo = self.p_gen(real_photo, training=True)
            # DiffAugment real and fake Monet images in one concatenated batch
            # so both halves receive the SAME random transforms, then split.
            both_monet = tf.concat([real_monet, fake_monet], axis=0)
            aug_monet = aug_fn(both_monet)
            aug_real_monet = aug_monet[:batch_size]
            aug_fake_monet = aug_monet[batch_size:]
            disc_real_monet = self.m_disc(aug_real_monet, training=True) # aug_real_monet
            disc_real_photo = self.p_disc(real_photo, training=True)
            disc_fake_monet = self.m_disc(aug_fake_monet, training=True) # aug_fake_monet
            disc_fake_photo = self.p_disc(fake_photo, training=True)
            monet_gen_loss = self.gen_loss_fn(disc_fake_monet)
            photo_gen_loss = self.gen_loss_fn(disc_fake_photo)
            # The cycle loss is shared between both generator objectives.
            total_cycle_loss = self.cycle_loss_fn(real_monet, cycled_monet, self.lambda_cycle) + self.cycle_loss_fn(real_photo, cycled_photo, self.lambda_cycle)
            total_monet_gen_loss = monet_gen_loss + total_cycle_loss + self.identity_loss_fn(real_monet, same_monet, self.lambda_cycle)
            total_photo_gen_loss = photo_gen_loss + total_cycle_loss + self.identity_loss_fn(real_photo, same_photo, self.lambda_cycle)
            monet_disc_loss = self.disc_loss_fn(disc_real_monet, disc_fake_monet)
            photo_disc_loss = self.disc_loss_fn(disc_real_photo, disc_fake_photo)
        # One gradient computation and optimizer step per network.
        monet_generator_gradients = tape.gradient(total_monet_gen_loss,
                                                  self.m_gen.trainable_variables)
        photo_generator_gradients = tape.gradient(total_photo_gen_loss,
                                                  self.p_gen.trainable_variables)
        monet_discriminator_gradients = tape.gradient(monet_disc_loss,
                                                      self.m_disc.trainable_variables)
        photo_discriminator_gradients = tape.gradient(photo_disc_loss,
                                                      self.p_disc.trainable_variables)
        self.m_gen_optimizer.apply_gradients(zip(monet_generator_gradients,
                                                 self.m_gen.trainable_variables))
        self.p_gen_optimizer.apply_gradients(zip(photo_generator_gradients,
                                                 self.p_gen.trainable_variables))
        self.m_disc_optimizer.apply_gradients(zip(monet_discriminator_gradients,
                                                  self.m_disc.trainable_variables))
        self.p_disc_optimizer.apply_gradients(zip(photo_discriminator_gradients,
                                                  self.p_disc.trainable_variables))
        return {
            "monet_gen_loss": total_monet_gen_loss,
            "photo_gen_loss": total_photo_gen_loss,
            "monet_disc_loss": monet_disc_loss,
            "photo_disc_loss": photo_disc_loss
        }
定义四种损失并开始训练
# Losses, optimizers, model assembly and training, all created under the
# distribution strategy so variables live on the TPU replicas.
with strategy.scope():
    def discriminator_loss(real, generated):
        # BCE on logits: real patches labeled 1, generated patches labeled 0.
        real_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(tf.ones_like(real), real)
        generated_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(tf.zeros_like(generated), generated)
        total_disc_loss = real_loss + generated_loss
        # 0.5 factor: standard CycleGAN weighting of the discriminator loss.
        return total_disc_loss * 0.5
    def generator_loss(generated):
        # Generator wants the discriminator to output "real" (1) for fakes.
        return tf.keras.losses.BinaryCrossentropy(from_logits=True,
                                                  reduction=tf.keras.losses.Reduction.NONE)(tf.ones_like(generated), generated)
    def calc_cycle_loss(real_image, cycled_image, LAMBDA):
        # L1 distance between the original and twice-translated image.
        loss1 = tf.reduce_mean(tf.abs(real_image - cycled_image))
        return LAMBDA * loss1
    def identity_loss(real_image, same_image, LAMBDA):
        # L1 penalty when a generator is fed its own output-domain image;
        # weighted at half the cycle weight.
        loss = tf.reduce_mean(tf.abs(real_image - same_image))
        return LAMBDA * 0.5 * loss
    # One Adam optimizer (lr=1e-4, beta1=0.5) per network.
    monet_generator_optimizer = tf.keras.optimizers.Adam(1e-4, beta_1=0.5)
    photo_generator_optimizer = tf.keras.optimizers.Adam(1e-4, beta_1=0.5)
    monet_discriminator_optimizer = tf.keras.optimizers.Adam(1e-4, beta_1=0.5)
    photo_discriminator_optimizer = tf.keras.optimizers.Adam(1e-4, beta_1=0.5)
    cycle_gan_model = CycleGan(
        monet_generator, photo_generator, monet_discriminator, photo_discriminator)
    cycle_gan_model.compile(
        m_gen_optimizer = monet_generator_optimizer,
        p_gen_optimizer = photo_generator_optimizer,
        m_disc_optimizer = monet_discriminator_optimizer,
        p_disc_optimizer = photo_discriminator_optimizer,
        gen_loss_fn = generator_loss,
        disc_loss_fn = discriminator_loss,
        cycle_loss_fn = calc_cycle_loss,
        identity_loss_fn = identity_loss
    )
    # NOTE(review): epochs=4 here, although EPOCHS_NUM = 28 was defined above;
    # confirm the intended epoch count.
    cycle_gan_model.fit(full_dataset,epochs=4,steps_per_epoch=(max(n_monet_samples, n_photo_samples)//4),)
提交notebook
# Translate every photo to a Monet-style image and package the results for
# submission. (`!` is an IPython shell escape — notebook-only syntax.)
! mkdir ../images
i = 1
for photo in photo_ds:
    # NOTE(review): entering strategy.scope() per iteration for inference
    # looks unnecessary — confirm it is intentional.
    with strategy.scope():
        photo_to_monet= monet_generator(photo, training=False)[0].numpy()
        # Undo the [-1, 1] normalization back to uint8 pixel values.
        photo_to_monet = (photo_to_monet * 127.5 + 127.5).astype(np.uint8)
        im = PIL.Image.fromarray(photo_to_monet)
        im.save("../images/" + str(i) + ".jpg")
        i += 1
# ../images resolves to /kaggle/images when run from /kaggle/working, which
# matches the archive root below.
shutil.make_archive("/kaggle/working/images", 'zip', "/kaggle/images")
用普通数据增强成绩如下
之后换成了可微数据增强,也没什么提升
大概运行一个小时吧
写的比较好的应该下面这个大佬,但运行他的代码结果也就50
https://www.kaggle.com/code/unfriendlyai/diffaugment-is-all-you-need
其实还有很多继续提高的,但我懒得看论文不想弄了
这是用CLIPTraVeLGAN的