tensorflow 自编码器

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/u012526003/article/details/79221380

Autoencoders

  • Autoencoder可以对输入数据进行无监督的学习,coding得到的结果的维度往往很低,因此自编码器可以用于数据的降维;自编码器也可以用于特征检测;同时也可以生成许多与训练数据相似的新数据,这可以被称为生成模型(generative model)。

setup code

# 不显示python使用过程中的警告
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os


def reset_graph(seed=42):
    """Start from a clean default graph with reproducible random numbers.

    Clears TensorFlow's default graph, then seeds both the (new) default
    graph and NumPy's global random generator with ``seed``.
    """
    # NumPy's generator is independent of the TensorFlow graph, so it can be
    # seeded first; the graph-level seed must be set after the reset.
    np.random.seed(seed)
    tf.reset_default_graph()
    tf.set_random_seed(seed)

# Smoke test: open a session and evaluate a constant to check that TensorFlow runs.
# (GPU-option variant: tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))
with tf.Session() as sess:
    one = tf.constant(1)
    print(sess.run(one))
1

数据表示形式

  • 实际生活中,许多数据都是有很强的相关性,我们可以对其处理、降维等
  • 自编码器与多层感知器十分类似,唯一的区别就是:自编码器的输出与输入的神经元节点个数是相同的;encoder-decoder串联的话其实就相当于一个重建的过程,首先对输入进行处理,然后对其进行复原。
  • 自编码器是对数据进行降维了,因此它是不完全的,在对数据进行表征时,它选择最重要的特征并记忆,丢弃一些不重要的特征

使用PCA对数据进行不完全的表示

  • 如果对于autoencoder来说,激活函数是线性的,同时loss是MSE,则autoencoder中的encoder等效于PCA。
import numpy.random as rnd
from sklearn.preprocessing import StandardScaler

# Build a seeded, noisy 3-D toy dataset whose third coordinate is (up to noise)
# a linear combination of the first two, then standardize it.
rnd.seed(4)
m = 200
w1, w2 = 0.1, 0.3
noise = 0.1

# The random draws are made in a fixed order (angles, then one noise vector per
# coordinate) so that the seeded dataset stays reproducible.
angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5
noise_0 = noise * rnd.randn(m) / 2
noise_1 = noise * rnd.randn(m) / 2
noise_2 = noise * rnd.randn(m)

coord_0 = np.cos(angles) + np.sin(angles) / 2 + noise_0
coord_1 = np.sin(angles) * 0.7 + noise_1
coord_2 = coord_0 * w1 + coord_1 * w2 + noise_2
data = np.column_stack([coord_0, coord_1, coord_2])

# Fit the scaler on the first 100 samples only and reuse its statistics for the rest.
train_rows, test_rows = data[:100], data[100:]
scaler = StandardScaler()
X_train = scaler.fit_transform(train_rows)
X_test = scaler.transform(test_rows)
# Linear autoencoder 3 -> 2 -> 3 trained with an MSE loss (no activation
# functions), i.e. the PCA-equivalent setting.
n_inputs = 3
n_hidden = 2
n_outputs = n_inputs # the output layer has as many units as the input layer, which is what characterizes an autoencoder

learning_rate = 0.01

X = tf.placeholder( tf.float32, shape=[None, n_inputs] )
hidden = tf.layers.dense( X, n_hidden, activation=None )
outputs = tf.layers.dense( hidden, n_outputs, activation=None )

# Mean squared error between the reconstruction and the network's own input.
reconstruction_loss = tf.reduce_mean( tf.square( outputs - X ) )

optimizer = tf.train.AdamOptimizer( learning_rate )
training_op = optimizer.minimize( reconstruction_loss )

init = tf.global_variables_initializer()

n_iterations = 1000
# The hidden-layer activations are the 2-D codings we want to look at.
codings = hidden

with tf.Session() as sess:
    init.run()
    # Full-batch training: every step feeds the whole training set.
    for iteration in range(n_iterations):
        training_op.run(feed_dict={X: X_train})
    codings_val = codings.eval(feed_dict={X: X_test}) # the codings are the result of interest here; `outputs` was only needed to compute the loss

# Scatter plot of the 2-D codings of the test set.
plt.figure(figsize=(4,3))
plt.plot(codings_val[:,0], codings_val[:, 1], "b.")
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.show()

这里写图片描述

# 画图
def plot_image(image, shape=(28, 28)):
    """Draw a single grayscale image on the current matplotlib axes.

    Args:
        image: array holding the pixel values; it is reshaped to ``shape``,
            so a flat vector of ``shape[0] * shape[1]`` values works.
        shape: target (rows, columns) of the image. An immutable tuple is
            used as the default so the default can never be modified by accident.
    """
    plt.imshow(image.reshape(shape), cmap="Greys", interpolation="nearest")
    plt.axis("off")

def plot_multiple_images(images, n_rows, n_cols, pad=2):
    """Tile several equally sized images into one padded grid and draw it.

    Args:
        images: array of shape (n_images, height, width); at least
            ``n_rows * n_cols`` images are read, in row-major order.
        n_rows: number of image rows in the grid.
        n_cols: number of image columns in the grid.
        pad: width in pixels of the border around and between the images.
    """
    images = images - images.min()  # make the minimum == 0, so the padding looks white
    # Axis 1 is the image height (rows) and axis 2 the width (columns). Keeping
    # them apart matters for non-square images: the canvas and the slices
    # below must use the height vertically and the width horizontally.
    h, w = images.shape[1:]
    image = np.zeros(((h + pad) * n_rows + pad, (w + pad) * n_cols + pad))
    for y in range(n_rows):
        for x in range(n_cols):
            top = y * (h + pad) + pad
            left = x * (w + pad) + pad
            image[top:top + h, left:left + w] = images[y * n_cols + x]
    plt.imshow(image, cmap="Greys", interpolation="nearest")
    plt.axis("off")

栈式自编码器(Stacked Autoencoders)

  • 如果autoencoder含有多个隐含层,则可以称为栈式自编码器
  • 栈式自编码器一般都是有对称的结构(相对于自编码器的coding结果对称)
from functools import partial
import sys
from tensorflow.examples.tutorials.mnist import input_data

# Stacked autoencoder on MNIST (784 -> 300 -> 150 -> 300 -> 784), trained end to end
# with an MSE reconstruction loss plus L2 regularization on every kernel.
reset_graph()

mnist = input_data.read_data_sets("./dataset/mnist/")

n_inputs = 28*28
n_hidden1 = 300
n_hidden2 = 150
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01
l2_reg = 0.0001

X = tf.placeholder(tf.float32, shape=[None, n_inputs])

he_init = tf.contrib.layers.variance_scaling_initializer()
l2_regularizer = tf.contrib.layers.l2_regularizer(l2_reg)

# Shared settings for the fully connected layers, bound once with `partial`.
my_dense_layer = partial(tf.layers.dense, activation=tf.nn.relu, kernel_initializer=he_init, kernel_regularizer=l2_regularizer)

hidden1 = my_dense_layer(X, n_hidden1)
hidden2 = my_dense_layer(hidden1, n_hidden2)
hidden3 = my_dense_layer(hidden2, n_hidden3)
# The reconstruction layer is linear, like the output layer of every other
# autoencoder in this file: a ReLU here clips the output and blocks the
# gradient wherever the pre-activation is negative.
outputs = my_dense_layer(hidden3, n_outputs, activation=None)

reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

# The kernel regularizers registered their L2 penalty terms in this collection;
# get_collection returns them as a list of tensors.
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
# Final objective: reconstruction error plus all regularization terms.
loss = tf.add_n([reconstruction_loss] + reg_losses)

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 10
batch_size = 500

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # Progress indicator rewritten in place on a single line.
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # The reported MSE is measured on the last mini-batch of the epoch only.
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./models/ae/stack_ae.ckpt")
Extracting ./dataset/mnist/train-images-idx3-ubyte.gz
Extracting ./dataset/mnist/train-labels-idx1-ubyte.gz
Extracting ./dataset/mnist/t10k-images-idx3-ubyte.gz
Extracting ./dataset/mnist/t10k-labels-idx1-ubyte.gz
0 Train MSE: 0.043224763
1 Train MSE: 0.042281948
2 Train MSE: 0.041092556
3 Train MSE: 0.039506104
4 Train MSE: 0.04099624
5 Train MSE: 0.04035872
6 Train MSE: 0.039791297
7 Train MSE: 0.040091157
8 Train MSE: 0.042521376
9 Train MSE: 0.0400601
  • 注:在使用tf.add_n()时,一直遇到:TypeError: 'list' object is not callable的问题,估计是一开始写代码的时候定义了一个名为add_n的变量,把tf.add_n函数覆盖了,重启一下内核就可以了
# Sanity check for tf.add_n: summing the constants 1 and 2.
reset_graph()
a, b = tf.constant(1), tf.constant(2)
c = tf.add_n([a, b])
with tf.Session() as sess:
    total = sess.run(c)
    print(total)
3

绑定权重

  • 如果自编码器被设计为对称的结构,假设总共有 $N$ 层(不包括输入层),第 $L$ 层的权重是 $W_L$(第 1 层为第一个隐含层,第 $N/2$ 层是 coding 层,第 $N$ 层为输出层),则解码器的权重可以直接绑定为 $W_{N-L+1} = W_L^T$(其中 $L = 1, 2, \dots, N/2$)
def show_reconstructed_digits(X, outputs, model_path = None, n_test_digits = 2):
    """Plot test digits side by side with their reconstructions.

    Uses the module-level ``mnist`` dataset and, when ``model_path`` is
    given, the module-level ``saver`` to restore the variables first. When
    ``model_path`` is falsy nothing is restored or initialized.

    Args:
        X: input placeholder of the autoencoder graph.
        outputs: tensor producing the reconstruction of ``X``.
        model_path: optional checkpoint path to restore before evaluating.
        n_test_digits: number of leading test images to show, one per row.
    """
    with tf.Session() as sess:
        if model_path:
            saver.restore(sess, model_path)
        X_test = mnist.test.images[:n_test_digits]
        outputs_val = sess.run(outputs, feed_dict={X: X_test})

    plt.figure(figsize=(8, 3 * n_test_digits))
    for digit_index in range(n_test_digits):
        # Each row holds the original on the left and its reconstruction on the right.
        left_cell = digit_index * 2 + 1
        right_cell = left_cell + 1
        plt.subplot(n_test_digits, 2, left_cell)
        plot_image(X_test[digit_index])
        plt.subplot(n_test_digits, 2, right_cell)
        plot_image(outputs_val[digit_index])
# Restore the stacked-autoencoder checkpoint and plot test digits next to their reconstructions.
show_reconstructed_digits(X, outputs, "./models/ae/stack_ae.ckpt")
plt.show()
INFO:tensorflow:Restoring parameters from ./models/ae/stack_ae.ckpt

这里写图片描述

  • 因为我们使用转置的方法获取对称的隐含层的权重,因此无法使用tf.layers.dense,需要自己实现网络层的计算方式(线性计算+激活函数)
# Stacked autoencoder with tied weights: the decoder reuses the transposed
# encoder kernels, so the layers are written out by hand (matmul + bias +
# activation) instead of using tf.layers.dense.
reset_graph()

mnist = input_data.read_data_sets("./dataset/mnist/")

n_inputs = 28*28
n_hidden1 = 300
n_hidden2 = 150
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01
l2_reg = 0.0005

activation = tf.nn.relu
regularizer = tf.contrib.layers.l2_regularizer( l2_reg )
initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder( tf.float32, shape=[None, n_inputs] )

# Only the two encoder kernels get their own initial values and variables.
weights1_init = initializer( [n_inputs, n_hidden1] )
weights2_init = initializer( [n_hidden1, n_hidden2] )

weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
weights3 = tf.transpose(weights2, name="weights3")  # tied weights
weights4 = tf.transpose(weights1, name="weights4")  # tied weights

# Biases are not tied: every layer has its own bias variable.
biases1 = tf.Variable(tf.zeros(n_hidden1), name="biases1")
biases2 = tf.Variable(tf.zeros(n_hidden2), name="biases2")
biases3 = tf.Variable(tf.zeros(n_hidden3), name="biases3")
biases4 = tf.Variable(tf.zeros(n_outputs), name="biases4")

hidden1 = activation(tf.matmul(X, weights1) + biases1)
hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
hidden3 = activation(tf.matmul(hidden2, weights3) + biases3)
# Linear output layer (no activation).
outputs = tf.matmul(hidden3, weights4) + biases4

reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))
# weights3 and weights4 are transposes of weights2 and weights1, so only the
# two variables are regularized.
reg_loss = regularizer(weights1) + regularizer(weights2)
loss = reconstruction_loss + reg_loss

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

saver = tf.train.Saver()

n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # Progress indicator rewritten in place on a single line.
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # The reported MSE is measured on the last mini-batch of the epoch only.
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./models/ae/stack_ae_tying_weights.ckpt")
Extracting ./dataset/mnist/train-images-idx3-ubyte.gz
Extracting ./dataset/mnist/train-labels-idx1-ubyte.gz
Extracting ./dataset/mnist/t10k-images-idx3-ubyte.gz
Extracting ./dataset/mnist/t10k-labels-idx1-ubyte.gz
0 Train MSE: 0.028156107
1 Train MSE: 0.0266964
2 Train MSE: 0.027346307
3 Train MSE: 0.026245965
4 Train MSE: 0.025696924
5 Train MSE: 0.027034879
6 Train MSE: 0.026534757
7 Train MSE: 0.02712253
8 Train MSE: 0.031350538
9 Train MSE: 0.030764775
# Restore the tied-weights checkpoint and plot test digits next to their reconstructions.
show_reconstructed_digits(X, outputs, "./models/ae/stack_ae_tying_weights.ckpt")
plt.show()
INFO:tensorflow:Restoring parameters from ./models/ae/stack_ae_tying_weights.ckpt

这里写图片描述

  • 之前的一次训练一个层数很多的autoencoder的速度十分慢,一次只训练一个浅层的autoencoder会快很多,然后将这些训练好的autoencoder整合为一个autoencoder,相当于一层一层构建autoencoder,首先利用autoencoder重构得到第一个隐含层的输出,然后将这个输出作为第二个autoencoder的输入,继续训练,以此类推。
  • 最简单的封装这种多相训练算法的方法就是在不同的graph中训练不同的autoencoder,之后得到所有的weights与bias,利用这些变量就可以构建得到最终的autoencoder
# Clear the default graph and re-seed the random generators.
reset_graph()

def train_autoencoder(X_train, n_neurons, n_epochs, batch_size,
                      learning_rate = 0.01, l2_reg = 0.0005,
                      activation=tf.nn.elu, seed=42):
    """Train a single-hidden-layer autoencoder in its own graph.

    A fresh ``tf.Graph`` is created on every call, so several autoencoders
    can be trained one after another without sharing any variables.

    Args:
        X_train: 2-D array of training samples, one row per sample.
        n_neurons: number of units in the hidden (coding) layer.
        n_epochs: number of epochs; each one runs
            ``len(X_train) // batch_size`` training steps.
        batch_size: number of distinct rows drawn at random for each step.
        learning_rate: learning rate of the Adam optimizer.
        l2_reg: scale of the L2 regularizer applied to both kernels.
        activation: activation of the hidden layer (the output layer is linear).
        seed: graph-level TensorFlow random seed.

    Returns:
        A tuple ``(hidden_val, hidden_kernel, hidden_bias, outputs_kernel,
        outputs_bias)``: the hidden-layer activations for all of ``X_train``
        followed by the trained parameter values of both layers.

    Raises:
        ValueError: if training is requested but ``batch_size`` is larger
            than the number of samples, so that no batch can be drawn.
    """
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(seed)

        n_inputs = X_train.shape[1]

        X = tf.placeholder(tf.float32, shape=[None, n_inputs])

        my_dense_layer = partial(
            tf.layers.dense,
            activation=activation,
            kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))

        # The layer names fix the variable names looked up in `params` below.
        hidden = my_dense_layer(X, n_neurons, name="hidden")
        outputs = my_dense_layer(hidden, n_inputs, activation=None, name="outputs")

        reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.add_n([reconstruction_loss] + reg_losses)

        optimizer = tf.train.AdamOptimizer(learning_rate)
        training_op = optimizer.minimize(loss)

        init = tf.global_variables_initializer()

    n_samples = len(X_train)
    n_batches = n_samples // batch_size  # identical for every epoch
    if n_epochs > 0 and n_batches == 0:
        # Without this check the loss evaluation after the (empty) batch loop
        # would fail with an unhelpful NameError on X_batch.
        raise ValueError(
            "batch_size ({}) must not exceed the number of training samples ({})".format(
                batch_size, n_samples))

    with tf.Session(graph=graph) as sess:
        init.run()
        for epoch in range(n_epochs):
            for iteration in range(n_batches):
                print("\r{}%".format(100 * iteration // n_batches), end="")
                sys.stdout.flush()
                # Random mini-batch: the first batch_size entries of a fresh permutation.
                indices = np.random.permutation(n_samples)[:batch_size]
                X_batch = X_train[indices]
                sess.run(training_op, feed_dict={X: X_batch})
            # The reported MSE is measured on the last mini-batch of the epoch only.
            loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
            print("\r{}".format(epoch), "Train MSE:", loss_train)
        params = {var.name: var.eval()
                  for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)}
        hidden_val = hidden.eval(feed_dict={X: X_train})
        return hidden_val, params["hidden/kernel:0"], params["hidden/bias:0"], params["outputs/kernel:0"], params["outputs/bias:0"]
# Layer-wise pre-training: the first autoencoder is trained on the raw MNIST
# images, the second one on the hidden-layer output of the first.
# W1/b1 and W4/b4 are the parameters of the outer pair of layers,
# W2/b2 and W3/b3 those of the inner pair.
hidden_output, W1, b1, W4, b4 = train_autoencoder(mnist.train.images, n_neurons=300, n_epochs=4, batch_size=150)
_, W2, b2, W3, b3 = train_autoencoder(hidden_output, n_neurons=150, n_epochs=4, batch_size=150)
0 Train MSE: 0.018122246
1 Train MSE: 0.018951437
2 Train MSE: 0.019684358
3 Train MSE: 0.019918667
0 Train MSE: 0.004202352
1 Train MSE: 0.0042918506
2 Train MSE: 0.0044221305
3 Train MSE: 0.004516779
# Build the final stacked autoencoder from the parameters obtained above.
reset_graph()

n_inputs = 28*28

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
# W1..W4 and b1..b4 are used directly in the matmul/add ops, so this graph
# defines no tf.Variable of its own.
hidden1 = tf.nn.elu(tf.matmul(X, W1) + b1)
hidden2 = tf.nn.elu(tf.matmul(hidden1, W2) + b2)
hidden3 = tf.nn.elu(tf.matmul(hidden2, W3) + b3)
outputs = tf.matmul(hidden3, W4) + b4

# No checkpoint path is passed, so nothing is restored from disk.
show_reconstructed_digits(X, outputs)
plt.show()

这里写图片描述

  • 上面是在不同的graph中构建了不同的autoencoder,也可以在同一个graph中构建,即构建2个loss,首先训练第一个隐含层的参数,然后再训练第二个隐含层的参数
  • 在训练不同的autoencoder时,一个能加快训练速度的方法是:固定一开始得到的所有的训练参数,计算出经过这一层的输出,然后将其作为输入,用于训练下一个autoencoder,相当于修改feed_dict

对特征进行可视化

  • 因为自编码器可以学习特征,因此我们可以对特征进行可视化
# Train several autoencoders inside a single graph, one phase after another.
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01
l2_reg = 0.0001

activation = tf.nn.elu
regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
initializer = tf.contrib.layers.variance_scaling_initializer()

X = tf.placeholder(tf.float32, shape=[None, n_inputs])

weights1_init = initializer([n_inputs, n_hidden1])
weights2_init = initializer([n_hidden1, n_hidden2])
weights3_init = initializer([n_hidden2, n_hidden3])
weights4_init = initializer([n_hidden3, n_outputs])

weights1 = tf.Variable(weights1_init, dtype=tf.float32, name="weights1")
weights2 = tf.Variable(weights2_init, dtype=tf.float32, name="weights2")
weights3 = tf.Variable(weights3_init, dtype=tf.float32, name="weights3")
weights4 = tf.Variable(weights4_init, dtype=tf.float32, name="weights4")

biases1 = tf.Variable(tf.zeros(n_hidden1), name="biases1")
biases2 = tf.Variable(tf.zeros(n_hidden2), name="biases2")
biases3 = tf.Variable(tf.zeros(n_hidden3), name="biases3")
biases4 = tf.Variable(tf.zeros(n_outputs), name="biases4")

# Full stacked autoencoder; its loss is only used for the final test evaluation.
hidden1 = activation(tf.matmul(X, weights1) + biases1)
hidden2 = activation(tf.matmul(hidden1, weights2) + biases2)
hidden3 = activation(tf.matmul(hidden2, weights3) + biases3)
outputs = tf.matmul(hidden3, weights4) + biases4

reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))

# One optimizer object is shared by both phases.
optimizer = tf.train.AdamOptimizer(learning_rate)

# Phase 1: train the outer autoencoder (layers 1 and 4) to reconstruct the inputs.
with tf.name_scope("phase1"):
    phase1_outputs = tf.matmul(hidden1, weights4) + biases4  # bypass hidden2 and hidden3
    phase1_reconstruction_loss = tf.reduce_mean(tf.square(phase1_outputs - X))
    phase1_reg_loss = regularizer(weights1) + regularizer(weights4)
    phase1_loss = phase1_reconstruction_loss + phase1_reg_loss
    phase1_training_op = optimizer.minimize(phase1_loss)

# Phase 2: train the inner autoencoder (layers 2 and 3) to reconstruct hidden1.
with tf.name_scope("phase2"):
    phase2_reconstruction_loss = tf.reduce_mean(tf.square(hidden3 - hidden1))
    phase2_reg_loss = regularizer(weights2) + regularizer(weights3)
    phase2_loss = phase2_reconstruction_loss + phase2_reg_loss
    # Restricting var_list keeps weights1/biases1 out of the phase-2 update.
    train_vars = [weights2, biases2, weights3, biases3]
    phase2_training_op = optimizer.minimize(phase2_loss, var_list=train_vars) # freeze hidden1

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Per-phase settings, indexed by the phase number in the loop below.
training_ops = [phase1_training_op, phase2_training_op]
reconstruction_losses = [phase1_reconstruction_loss, phase2_reconstruction_loss]
n_epochs = [4, 4]
batch_sizes = [150, 150]

with tf.Session() as sess:
    init.run()
    for phase in range(2):
        print("Training phase #{}".format(phase + 1))
        for epoch in range(n_epochs[phase]):
            n_batches = mnist.train.num_examples // batch_sizes[phase]
            for iteration in range(n_batches):
                # Progress indicator rewritten in place on a single line.
                print("\r{}%".format(100 * iteration // n_batches), end="")
                sys.stdout.flush()
                X_batch, y_batch = mnist.train.next_batch(batch_sizes[phase])
                sess.run(training_ops[phase], feed_dict={X: X_batch})
            # The reported MSE is the current phase's loss on the last mini-batch.
            loss_train = reconstruction_losses[phase].eval(feed_dict={X: X_batch})
            print("\r{}".format(epoch), "Train MSE:", loss_train)
            saver.save(sess, "./models/ae/stack_ae_one_at_a_time.ckpt")
    # End-to-end reconstruction error of the complete network on the test set.
    loss_test = reconstruction_loss.eval(feed_dict={X: mnist.test.images})
    print("Test MSE:", loss_test)
Training phase #1
0 Train MSE: 0.0075954874
1 Train MSE: 0.0076178126
2 Train MSE: 0.0075386846
3 Train MSE: 0.007713743
Training phase #2
0 Train MSE: 0.32253775
1 Train MSE: 0.008439677
2 Train MSE: 0.0027948823
3 Train MSE: 0.0022397852
Test MSE: 0.009754321
# Visualize what the first hidden layer has learned: restore the checkpoint
# and draw the incoming weights of the first five hidden neurons as images.
with tf.Session() as sess:
    saver.restore(sess, "./models/ae/stack_ae_one_at_a_time.ckpt")
    weights1_val = sess.run(weights1)

n_shown = 5
neuron_weights = weights1_val.T  # one row per hidden neuron
for cell, neuron in enumerate(neuron_weights[:n_shown], start=1):
    plt.subplot(1, n_shown, cell)
    plot_image(neuron)

plt.show()
INFO:tensorflow:Restoring parameters from ./models/ae/stack_ae_one_at_a_time.ckpt

这里写图片描述

使用自编码器进行非监督的预训练

  • 如果labeled数据很少,那么训练的过程中,模型比较容易过拟合(在参数较多的情况下),我们可以找到一个类似的任务,然后拷贝它的低层的训练参数到该任务中
  • 此外,我们也可以用自编码器对unlabeled数据进行训练,然后将浅层的隐含层参数拷贝到用labeled数据训练的网络中,相当于预训练,这也可以减少需要训练的参数量
  • 在之前的自编码器中,隐含层节点的个数小于输入层节点的个数(不完全的自编码器),这会使得数据的信息有部分缺失;如果隐含层节点个数大于输入层节点个数,那学习到的自编码器是过完全的自编码器(overcomplete autoencoder)

降噪自编码器(Denoising Autoencoders)

  • 可以使用自编码器进行降噪,可以通过对输入加入高斯噪声或者对输入使用dropout来引入噪声
# Denoising autoencoder: corrupt the input with additive Gaussian noise.
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01

noise_level = 1.0

X = tf.placeholder(tf.float32, shape=[None, n_inputs])
# The noise op is part of the graph, so fresh noise is drawn on every run.
X_noisy = X + noise_level * tf.random_normal(tf.shape(X))

hidden1 = tf.layers.dense(X_noisy, n_hidden1, activation=tf.nn.relu,
                          name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, # not shown in the book
                          name="hidden2")                            # not shown
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, # not shown
                          name="hidden3")                            # not shown
outputs = tf.layers.dense(hidden3, n_outputs, name="outputs")        # not shown

# The target is the clean input X, not the corrupted X_noisy.
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(reconstruction_loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # Progress indicator rewritten in place on a single line.
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # The reported MSE is measured on the last mini-batch of the epoch only.
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./models/ae/stacked_denoising_gaussian.ckpt")
0 Train MSE: 0.04392284
1 Train MSE: 0.04212565
2 Train MSE: 0.04013202
3 Train MSE: 0.042316306
4 Train MSE: 0.04260728
5 Train MSE: 0.039247368
6 Train MSE: 0.040977154
7 Train MSE: 0.040510636
8 Train MSE: 0.039376777
9 Train MSE: 0.03940287
# Denoising autoencoder: corrupt the input with dropout instead of Gaussian noise.
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 150  # codings
n_hidden3 = n_hidden1
n_outputs = n_inputs

learning_rate = 0.01
dropout_rate = 0.3

# Dropout is only applied while `training` is True; the placeholder defaults to
# False, so the flag has to be fed explicitly during the training steps.
training = tf.placeholder_with_default( False, shape=(), name="training" )
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
X_drop = tf.layers.dropout( X, dropout_rate, training=training )

hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu,
                          name="hidden1")
hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, # not shown in the book
                          name="hidden2")                            # not shown
hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, # not shown
                          name="hidden3")                            # not shown
outputs = tf.layers.dense(hidden3, n_outputs, name="outputs")        # not shown

# The target is the clean input X, not the corrupted X_drop.
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(reconstruction_loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 10
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # Progress indicator rewritten in place on a single line.
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            # Switch dropout on for the training step; without `training: True`
            # the input is never corrupted and the model is a plain autoencoder.
            sess.run(training_op, feed_dict={X: X_batch, training: True})
        # Evaluation runs with dropout off (placeholder default), on the last mini-batch.
        loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", loss_train)
        saver.save(sess, "./models/ae/stacked_denoising_dropout.ckpt")
0 Train MSE: 0.031744514
1 Train MSE: 0.02759777
2 Train MSE: 0.027245231
3 Train MSE: 0.026180187
4 Train MSE: 0.027719578
5 Train MSE: 0.027861215
6 Train MSE: 0.024314487
7 Train MSE: 0.026936982
8 Train MSE: 0.024768423
9 Train MSE: 0.027037865
# Restore the dropout denoising checkpoint and plot test digits next to their reconstructions.
show_reconstructed_digits(X, outputs, "./models/ae/stacked_denoising_dropout.ckpt")
plt.show()
INFO:tensorflow:Restoring parameters from ./models/ae/stacked_denoising_dropout.ckpt

这里写图片描述

稀疏自编码器(Sparse Autoencoders)

  • 使得神经元大部分节点被限制的自编码器称为稀疏自编码器。我们可以在代价函数中加入稀疏惩罚,如果目标的稀疏率为0.1,但是所有神经元的平均激活率为0.3,就需要对当前的参数进行处理,减少激活的神经元个数。
  • 一般稀疏惩罚可以用KL距离进行度量(Kullback-Leibler Divergence),给定两个离散的概率分布 $P$ 和 $Q$,KL距离的计算方法为
    $$D_{KL}(P \,\|\, Q) = \sum_i P(i) \log \frac{P(i)}{Q(i)}$$

    假设在coding layer中,神经元的目标激活概率为 $p$,实际的激活概率(训练中计算出来)为 $q$,则KL距离为
    $$D_{KL}(p \,\|\, q) = p \log \frac{p}{q} + (1 - p) \log \frac{1 - p}{1 - q}$$

    coding layer中所有神经元的稀疏损失相加,在加到之前的代价函数中,我们就得到最终的代价函数。
  • 可以引入一个超参数,用于限制我们稀疏损失在代价函数中所占的比重
  • 在这里需要注意的是:因为coding layer的值需要在(0,1),因此采用了sigmoid的激活函数
# Compare the KL-divergence and MSE penalties as the actual sparsity q moves
# away from the target sparsity p.
p = 0.1
q = np.linspace(0.001, 0.999, 500)

active_term = p * np.log(p / q)
inactive_term = (1 - p) * np.log((1 - p) / (1 - q))
kl_div = active_term + inactive_term

deviation = p - q
mse = deviation * deviation

# Dotted vertical marker at the target sparsity, with its label.
plt.plot([p, p], [0, 0.3], "k:")
plt.text(0.05, 0.32, "Target\nsparsity", fontsize=14)

# The two cost curves, drawn in legend order.
for cost, line_style, curve_label in ((kl_div, "b-", "KL divergence"),
                                      (mse, "r--", "MSE")):
    plt.plot(q, cost, line_style, label=curve_label)

plt.legend(loc="upper left")
plt.xlabel("Actual sparsity")
plt.ylabel("Cost", rotation=0)
plt.axis([0, 1, 0, 0.95])
plt.show()

这里写图片描述

# Sparse autoencoder: one hidden layer with more units (1000) than inputs (784).
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 1000  # sparse codings
n_outputs = n_inputs

def kl_divergence(p, q):
    """Return the element-wise KL divergence between Bernoulli(p) and Bernoulli(q).

    Computes ``p * log(p / q) + (1 - p) * log((1 - p) / (1 - q))`` with
    TensorFlow ops; no clipping is applied, so both ``p`` and ``q`` have to
    lie strictly between 0 and 1.
    """
    active_term = p * tf.log(p / q)
    inactive_term = (1 - p) * tf.log((1 - p) / (1 - q))
    return active_term + inactive_term

learning_rate = 0.01
sparsity_target = 0.1
sparsity_weight = 0.2  # weight of the sparsity penalty in the total loss

X = tf.placeholder(tf.float32, shape=[None, n_inputs])

# Sigmoid keeps the hidden activations in (0, 1), so their batch mean can be
# passed to kl_divergence as a probability.
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.sigmoid)
outputs = tf.layers.dense(hidden1, n_outputs)

hidden1_mean = tf.reduce_mean(hidden1, axis=0) # batch mean
# One KL term per hidden unit, summed over all units.
sparsity_loss = tf.reduce_sum(kl_divergence(sparsity_target, hidden1_mean))
reconstruction_loss = tf.reduce_mean(tf.square(outputs - X)) # MSE
loss = reconstruction_loss + sparsity_weight * sparsity_loss

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 50
batch_size = 1000

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # Progress indicator rewritten in place on a single line.
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # All three reported values are measured on the last mini-batch of the epoch.
        reconstruction_loss_val, sparsity_loss_val, loss_val = sess.run([reconstruction_loss, sparsity_loss, loss], feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train MSE:", reconstruction_loss_val, "\tSparsity loss:", sparsity_loss_val, "\tTotal loss:", loss_val)
        saver.save(sess, "./models/ae/model_sparse.ckpt")
0 Train MSE: 0.1363607  Sparsity loss: 0.45424536   Total loss: 0.22720978
1 Train MSE: 0.058487475    Sparsity loss: 0.0065721134     Total loss: 0.0598019
2 Train MSE: 0.052577537    Sparsity loss: 0.02152061   Total loss: 0.05688166
3 Train MSE: 0.047282524    Sparsity loss: 0.025944868  Total loss: 0.052471496
4 Train MSE: 0.043635875    Sparsity loss: 0.036555905  Total loss: 0.050947055
5 Train MSE: 0.040454544    Sparsity loss: 0.013679091  Total loss: 0.043190364
6 Train MSE: 0.039003808    Sparsity loss: 0.0288186    Total loss: 0.04476753
7 Train MSE: 0.03634573     Sparsity loss: 0.053503223  Total loss: 0.04704638
8 Train MSE: 0.032519776    Sparsity loss: 0.014649414  Total loss: 0.035449658
9 Train MSE: 0.029499026    Sparsity loss: 0.04859985   Total loss: 0.039218996
10 Train MSE: 0.02671413    Sparsity loss: 0.05883885   Total loss: 0.0384819
11 Train MSE: 0.024794476   Sparsity loss: 0.105040595  Total loss: 0.045802593
12 Train MSE: 0.023290204   Sparsity loss: 0.107862495  Total loss: 0.044862702
13 Train MSE: 0.022106858   Sparsity loss: 0.022782959  Total loss: 0.02666345
14 Train MSE: 0.020763604   Sparsity loss: 0.038643688  Total loss: 0.028492343
15 Train MSE: 0.020432802   Sparsity loss: 0.06936067   Total loss: 0.034304935
16 Train MSE: 0.019542685   Sparsity loss: 0.17338103   Total loss: 0.05421889
17 Train MSE: 0.017766118   Sparsity loss: 0.15682302   Total loss: 0.049130723
18 Train MSE: 0.017933922   Sparsity loss: 0.08688893   Total loss: 0.035311706
19 Train MSE: 0.01876632    Sparsity loss: 0.19676635   Total loss: 0.058119588
20 Train MSE: 0.01672198    Sparsity loss: 0.056824684  Total loss: 0.028086917
21 Train MSE: 0.016808983   Sparsity loss: 0.032306433  Total loss: 0.02327027
22 Train MSE: 0.016985897   Sparsity loss: 0.1484245    Total loss: 0.0466708
23 Train MSE: 0.01533399    Sparsity loss: 0.15314564   Total loss: 0.045963116
24 Train MSE: 0.015439643   Sparsity loss: 0.08969944   Total loss: 0.033379532
25 Train MSE: 0.0150727425  Sparsity loss: 0.0977269    Total loss: 0.034618124
26 Train MSE: 0.016957765   Sparsity loss: 0.21695115   Total loss: 0.060347997
27 Train MSE: 0.014440946   Sparsity loss: 0.04255914   Total loss: 0.022952775
28 Train MSE: 0.015100989   Sparsity loss: 0.08643228   Total loss: 0.032387443
29 Train MSE: 0.013699243   Sparsity loss: 0.29950532   Total loss: 0.07360031
30 Train MSE: 0.014233368   Sparsity loss: 0.09912021   Total loss: 0.03405741
31 Train MSE: 0.015360349   Sparsity loss: 0.15585451   Total loss: 0.046531253
32 Train MSE: 0.012930505   Sparsity loss: 0.07084398   Total loss: 0.0270993
33 Train MSE: 0.0151651725  Sparsity loss: 0.39751768   Total loss: 0.09466871
34 Train MSE: 0.013862544   Sparsity loss: 0.14919572   Total loss: 0.043701686
35 Train MSE: 0.012845697   Sparsity loss: 0.06789903   Total loss: 0.026425503
36 Train MSE: 0.013587774   Sparsity loss: 0.05884434   Total loss: 0.025356643
37 Train MSE: 0.015991984   Sparsity loss: 0.23719792   Total loss: 0.06343157
38 Train MSE: 0.012888592   Sparsity loss: 0.086833194  Total loss: 0.03025523
39 Train MSE: 0.012454429   Sparsity loss: 0.07360138   Total loss: 0.027174704
40 Train MSE: 0.012573832   Sparsity loss: 0.17452234   Total loss: 0.0474783
41 Train MSE: 0.011945258   Sparsity loss: 0.08876098   Total loss: 0.029697455
42 Train MSE: 0.012003577   Sparsity loss: 0.04540896   Total loss: 0.02108537
43 Train MSE: 0.01353173    Sparsity loss: 0.12448001   Total loss: 0.038427733
44 Train MSE: 0.012057042   Sparsity loss: 0.07625218   Total loss: 0.027307477
45 Train MSE: 0.011600876   Sparsity loss: 0.05237763   Total loss: 0.022076402
46 Train MSE: 0.012067402   Sparsity loss: 0.08493041   Total loss: 0.029053485
47 Train MSE: 0.011740665   Sparsity loss: 0.041903745  Total loss: 0.020121414
48 Train MSE: 0.011878712   Sparsity loss: 0.16246115   Total loss: 0.04437094
49 Train MSE: 0.013019736   Sparsity loss: 0.2827945    Total loss: 0.06957864

变分自编码器(Variational Autoencoders)

  • 变分自编码器有几个重要特征
    • 变分自编码器是概率自编码器,它们的输出是随机的,而非降噪自编码器那样,在训练时加入随机噪声,而输出是确定的。
    • 大部分的变分自编码器是生成自编码器,它们能够生成与训练集类似的新样本。
  • 上述的这些特征使得他们与RBM(受限玻尔兹曼机)很相似,但是他们的训练难度更小,训练速度更快
  • 变分自编码器的工作过程:将输入视为符合高斯分布,在训练的过程中,使得数据不断向coding space聚集,这个区域类似于一个球形区域,所有的点类似于高斯点云,根据球心和半径可以得到高斯分布的均值 μ 和标准差 σ,最终可以利用高斯分布重新生成许多新的样本
# Variational autoencoder on MNIST: 784 -> 500 -> 500 -> 20 (codings) -> 500 -> 500 -> 784.
reset_graph()

from functools import partial

n_inputs = 28 * 28
n_hidden1 = 500
n_hidden2 = 500
n_hidden3 = 20  # codings
n_hidden4 = n_hidden2
n_hidden5 = n_hidden1
n_outputs = n_inputs
learning_rate = 0.001

initializer = tf.contrib.layers.variance_scaling_initializer()

# Shared settings for every dense layer; individual calls override the activation.
my_dense_layer = partial(
    tf.layers.dense,
    activation=tf.nn.elu,
    kernel_initializer=initializer)

X = tf.placeholder(tf.float32, [None, n_inputs])
hidden1 = my_dense_layer(X, n_hidden1)
hidden2 = my_dense_layer(hidden1, n_hidden2)
# Two linear heads on top of hidden2 give the mean and sigma of the coding.
hidden3_mean = my_dense_layer(hidden2, n_hidden3, activation=None)
hidden3_sigma = my_dense_layer(hidden2, n_hidden3, activation=None)
# The coding is sampled as mean + sigma * standard-normal noise.
noise = tf.random_normal(tf.shape(hidden3_sigma), dtype=tf.float32)
hidden3 = hidden3_mean + hidden3_sigma * noise
hidden4 = my_dense_layer(hidden3, n_hidden4)
hidden5 = my_dense_layer(hidden4, n_hidden5)
logits = my_dense_layer(hidden5, n_outputs, activation=None)
outputs = tf.sigmoid(logits)

# Sigmoid cross-entropy as the reconstruction cost, summed (not averaged)
# over all pixels and samples.
xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=X, logits=logits)
reconstruction_loss = tf.reduce_sum(xentropy)

eps = 1e-10 # smoothing term to avoid computing log(0) which is NaN
# KL divergence between N(mean, sigma^2) and the standard normal N(0, 1),
# summed over all coding units and samples.
latent_loss = 0.5 * tf.reduce_sum(
    tf.square(hidden3_sigma) + tf.square(hidden3_mean)
    - 1 - tf.log(eps + tf.square(hidden3_sigma)))

loss = reconstruction_loss + latent_loss

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 50
batch_size = 500
n_digits = 60  # number of new digits to generate after training

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches = mnist.train.num_examples // batch_size
        for iteration in range(n_batches):
            # Progress indicator rewritten in place on a single line.
            print("\r{}%".format(100 * iteration // n_batches), end="")
            sys.stdout.flush()
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch})
        # All three reported values are measured on the last mini-batch of the epoch.
        loss_val, reconstruction_loss_val, latent_loss_val = sess.run([loss, reconstruction_loss, latent_loss], feed_dict={X: X_batch})
        print("\r{}".format(epoch), "Train total loss:", loss_val, "\tReconstruction loss:", reconstruction_loss_val, "\tLatent loss:", latent_loss_val)
        saver.save(sess, "./models/ae/model_variational.ckpt")
    # Generate new digits: draw codings from a standard normal distribution and
    # feed them directly into hidden3, so only the decoder part is evaluated.
    codings_rnd = np.random.normal(size=[n_digits, n_hidden3])
    outputs_val = outputs.eval(feed_dict={hidden3: codings_rnd})
0 Train total loss: 108858.53   Reconstruction loss: 92765.95   Latent loss: 16092.581
1 Train total loss: 95518.59    Reconstruction loss: 80019.37   Latent loss: 15499.229
2 Train total loss: 94250.984   Reconstruction loss: 76492.32   Latent loss: 17758.668
3 Train total loss: 85775.62    Reconstruction loss: 74041.52   Latent loss: 11734.095
4 Train total loss: 100383.3    Reconstruction loss: 82706.01   Latent loss: 17677.287
5 Train total loss: 83227.99    Reconstruction loss: 72003.82   Latent loss: 11224.172
6 Train total loss: 81381.85    Reconstruction loss: 67594.96   Latent loss: 13786.893
7 Train total loss: 77068.84    Reconstruction loss: 64921.156  Latent loss: 12147.686
8 Train total loss: 73167.36    Reconstruction loss: 62095.137  Latent loss: 11072.225
9 Train total loss: 88950.766   Reconstruction loss: 72938.25   Latent loss: 16012.513
10 Train total loss: 74265.125  Reconstruction loss: 62513.266  Latent loss: 11751.863
11 Train total loss: 64812.906  Reconstruction loss: 53150.906  Latent loss: 11662.002
12 Train total loss: 62642.883  Reconstruction loss: 51022.383  Latent loss: 11620.502
13 Train total loss: 62580.246  Reconstruction loss: 52444.883  Latent loss: 10135.364
14 Train total loss: 58084.96   Reconstruction loss: 47577.715  Latent loss: 10507.246
15 Train total loss: 56837.977  Reconstruction loss: 46601.695  Latent loss: 10236.281
16 Train total loss: 57115.25   Reconstruction loss: 46603.555  Latent loss: 10511.693
17 Train total loss: 55932.973  Reconstruction loss: 45348.086  Latent loss: 10584.887
18 Train total loss: 54929.297  Reconstruction loss: 44535.008  Latent loss: 10394.289
19 Train total loss: 55165.54   Reconstruction loss: 44593.773  Latent loss: 10571.765
20 Train total loss: 54084.86   Reconstruction loss: 43277.656  Latent loss: 10807.201
21 Train total loss: 55043.47   Reconstruction loss: 44159.88   Latent loss: 10883.591
22 Train total loss: 54556.668  Reconstruction loss: 43821.36   Latent loss: 10735.308
23 Train total loss: 55389.6    Reconstruction loss: 44427.105  Latent loss: 10962.494
24 Train total loss: 54632.0    Reconstruction loss: 43592.832  Latent loss: 11039.168
25 Train total loss: 53095.383  Reconstruction loss: 42151.17   Latent loss: 10944.211
26 Train total loss: 53648.316  Reconstruction loss: 42733.97   Latent loss: 10914.347
27 Train total loss: 52447.293  Reconstruction loss: 41648.016  Latent loss: 10799.278
28 Train total loss: 54149.805  Reconstruction loss: 43099.484  Latent loss: 11050.318
29 Train total loss: 53252.58   Reconstruction loss: 41960.66   Latent loss: 11291.918
30 Train total loss: 51194.31   Reconstruction loss: 40089.0    Latent loss: 11105.309
31 Train total loss: 52787.195  Reconstruction loss: 41694.754  Latent loss: 11092.441
32 Train total loss: 53847.363  Reconstruction loss: 42305.344  Latent loss: 11542.02
33 Train total loss: 52628.3    Reconstruction loss: 41171.062  Latent loss: 11457.239
34 Train total loss: 52067.97   Reconstruction loss: 40626.29   Latent loss: 11441.68
35 Train total loss: 52868.15   Reconstruction loss: 41568.508  Latent loss: 11299.641
36 Train total loss: 53024.797  Reconstruction loss: 41649.418  Latent loss: 11375.377
37 Train total loss: 53452.785  Reconstruction loss: 41585.477  Latent loss: 11867.309
38 Train total loss: 59480.195  Reconstruction loss: 47845.863  Latent loss: 11634.334
39 Train total loss: 59978.227  Reconstruction loss: 47661.91   Latent loss: 12316.317
40 Train total loss: 52442.887  Reconstruction loss: 41086.754  Latent loss: 11356.132
41 Train total loss: 52291.797  Reconstruction loss: 40508.383  Latent loss: 11783.412
42 Train total loss: 51086.45   Reconstruction loss: 39521.875  Latent loss: 11564.573
43 Train total loss: 51753.11   Reconstruction loss: 39992.688  Latent loss: 11760.422
44 Train total loss: 52843.297  Reconstruction loss: 41421.945  Latent loss: 11421.353
45 Train total loss: 53560.875  Reconstruction loss: 41686.742  Latent loss: 11874.133
46 Train total loss: 52755.484  Reconstruction loss: 41099.125  Latent loss: 11656.357
47 Train total loss: 51850.03   Reconstruction loss: 40175.383  Latent loss: 11674.65
48 Train total loss: 50693.93   Reconstruction loss: 39233.89   Latent loss: 11460.04
49 Train total loss: 51181.336  Reconstruction loss: 39801.117  Latent loss: 11380.221
# Draw the generated digits, ten per row, filling the subplot grid in row-major order.
plt.figure(figsize=(8, 50))
for digit_idx in range(n_digits):
    generated_digit = outputs_val[digit_idx]
    plt.subplot(n_digits, 10, digit_idx + 1)
    plot_image(generated_digit)
plt.show()

这里写图片描述

展开阅读全文

没有更多推荐了,返回首页