RBM受限波尔兹曼机学习
原文地址:
作者:hjimce
[python] view plain copy
- #coding=utf-8
- import timeit
- try:
- import PIL.Image as Image
- except ImportError:
- import Image
- import numpy
- import theano
- import theano.tensor as T
- import os
- from theano.tensor.shared_randomstreams import RandomStreams
- from utils import tile_raster_images
- from logistic_sgd import load_data
- #RBM网络结构设置,主要是参数初始化
- class RBM(object):
- def __init__(self,input=None,n_visible=784,n_hidden=500,W=None,hbias=None,vbias=None,numpy_rng=None,theano_rng=None):
- self.n_visible = n_visible
- self.n_hidden = n_hidden
- if numpy_rng is None:
- numpy_rng = numpy.random.RandomState(1234)
- if theano_rng is None:
- theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
- #参数初始化公式 -4*sqrt(6./(n_visible+n_hidden))
- if W is None:
- initial_W = numpy.asarray(
- numpy_rng.uniform(
- low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
- high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
- size=(n_visible, n_hidden)
- ),
- dtype=theano.config.floatX
- )
- #在GPU上,多线程权重共享
- W = theano.shared(value=initial_W, name='W', borrow=True)
- #隐藏层偏置b初始化为0
- if hbias is None:
- hbias = theano.shared(value=numpy.zeros(n_hidden,dtype=theano.config.floatX),name='hbias',borrow=True)
- #可见层偏置项b初始化为0
- if vbias is None:
- vbias = theano.shared(value=numpy.zeros(n_visible,dtype=theano.config.floatX),name='vbias',borrow=True)
- #网络输入
- self.input = input
- if not input:
- self.input = T.matrix('input')
- self.W = W
- self.hbias = hbias
- self.vbias = vbias
- self.theano_rng = theano_rng
- #网络的所有参数,我们把它们放在一个列表中,以便后续访问
- self.params = [self.W, self.hbias, self.vbias]
- #能量函数的定义,主要是用于计算可视层的能量
- def free_energy(self, v_sample):
- ''''' Function to compute the free energy '''
- wx_b = T.dot(v_sample, self.W) + self.hbias
- vbias_term = T.dot(v_sample, self.vbias)
- hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
- return -hidden_term - vbias_term
- #前向传导 从可视层到隐藏层,计算p(h=1|v)
- def propup(self, vis):
- pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias
- return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]
- #根据可视层状态 计算隐藏层状态
- def sample_h_given_v(self, v0_sample):
- #计算隐藏层每个神经元为状态1的概率,即 p(h=1|v)
- pre_sigmoid_h1, h1_mean = self.propup(v0_sample)
- #根据给定的概率,进行采样,就像抛硬币一样。n表示抛硬币的次数 ,每个神经元随机采样一次,就可以得到每个神经元的状态了
- #p是生成1的概率,binomial函数生成的是一个0、1数
- h1_sample = self.theano_rng.binomial(size=h1_mean.shape,n=1,p=h1_mean,dtype=theano.config.floatX)
- return [pre_sigmoid_h1, h1_mean, h1_sample]
- #后向传导,从隐藏层到可视层,计算p(v=1|h)
- def propdown(self, hid):
- pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias
- return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]
- #根据隐藏层的状态计算可视层状态
- def sample_v_given_h(self, h0_sample):
- pre_sigmoid_v1, v1_mean = self.propdown(h0_sample)
- v1_sample = self.theano_rng.binomial(size=v1_mean.shape,
- n=1, p=v1_mean,
- dtype=theano.config.floatX)
- return [pre_sigmoid_v1, v1_mean, v1_sample]
- #计算从隐藏层-》可视层-》隐藏层的一个状态转移过程,相当于一次的Gibbs sampling采样
- def gibbs_hvh(self, h0_sample):
- pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
- pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
- return [pre_sigmoid_v1, v1_mean, v1_sample,
- pre_sigmoid_h1, h1_mean, h1_sample]
- #计算从可视层-》隐藏层-》可视层的状态转移过程,相当于一次的Gibbs sampling采样
- def gibbs_vhv(self, v0_sample):
- pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
- pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
- return [pre_sigmoid_h1, h1_mean, h1_sample,
- pre_sigmoid_v1, v1_mean, v1_sample]
- # k用于设置Gibbs sampling采样次数,也就是相当于来回跑了多少次(来回一趟算一次)
- def get_cost_updates(self, lr=0.1, persistent=None, k=1):
- # 当我们输入数据的时候,首先根据输入x,计算隐藏层的概率分布,概率分布的采样结果
- pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)
- # decide how to initialize persistent chain:
- # for CD, we use the newly generate hidden sample
- # for PCD, we initialize from the old state of the chain
- if persistent is None:
- chain_start = ph_sample
- else:
- chain_start = persistent
- #让函数来回跑k次
- (
- [
- pre_sigmoid_nvs,
- nv_means,
- nv_samples,
- pre_sigmoid_nhs,
- nh_means,
- nh_samples
- ],
- updates
- ) = theano.scan(self.gibbs_hvh,outputs_info=[None, None, None, None, None, chain_start],n_steps=k)
- #拿到最后一次循环的状态
- chain_end = nv_samples[-1]
- #构造损失函数
- cost = T.mean(self.free_energy(self.input)) - T.mean(
- self.free_energy(chain_end))
- #计算梯度,然后进行梯度下降更新
- gparams = T.grad(cost, self.params, consider_constant=[chain_end])
- for gparam, param in zip(gparams, self.params):
- updates[param] = param - gparam * T.cast(lr,dtype=theano.config.floatX)
- if persistent:
- # Note that this works only if persistent is a shared variable
- updates[persistent] = nh_samples[-1]
- # pseudo-likelihood is a better proxy for PCD
- monitoring_cost = self.get_pseudo_likelihood_cost(updates)
- else:
- # reconstruction cross-entropy is a better proxy for CD
- monitoring_cost = self.get_reconstruction_cost(updates,pre_sigmoid_nvs[-1])
- return monitoring_cost, updates
- # end-snippet-4
- def get_pseudo_likelihood_cost(self, updates):
- """Stochastic approximation to the pseudo-likelihood"""
- # index of bit i in expression p(x_i | x_{\i})
- bit_i_idx = theano.shared(value=0, name='bit_i_idx')
- # binarize the input image by rounding to nearest integer
- xi = T.round(self.input)
- # calculate free energy for the given bit configuration
- fe_xi = self.free_energy(xi)
- # flip bit x_i of matrix xi and preserve all other bits x_{\i}
- # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns
- # the result to xi_flip, instead of working in place on xi.
- xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])
- # calculate free energy with bit flipped
- fe_xi_flip = self.free_energy(xi_flip)
- # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i})))
- cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip -
- fe_xi)))
- # increment bit_i_idx % number as part of updates
- updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible
- return cost
- def get_reconstruction_cost(self, updates, pre_sigmoid_nv):
- """Approximation to the reconstruction error
- Note that this function requires the pre-sigmoid activation as
- input. To understand why this is so you need to understand a
- bit about how Theano works. Whenever you compile a Theano
- function, the computational graph that you pass as input gets
- optimized for speed and stability. This is done by changing
- several parts of the subgraphs with others. One such
- optimization expresses terms of the form log(sigmoid(x)) in
- terms of softplus. We need this optimization for the
- cross-entropy since sigmoid of numbers larger than 30. (or
- even less then that) turn to 1. and numbers smaller than
- -30. turn to 0 which in terms will force theano to compute
- log(0) and therefore we will get either -inf or NaN as
- cost. If the value is expressed in terms of softplus we do not
- get this undesirable behaviour. This optimization usually
- works fine, but here we have a special case. The sigmoid is
- applied inside the scan op, while the log is
- outside. Therefore Theano will only see log(scan(..)) instead
- of log(sigmoid(..)) and will not apply the wanted
- optimization. We can not go and replace the sigmoid in scan
- with something else also, because this only needs to be done
- on the last step. Therefore the easiest and more efficient way
- is to get also the pre-sigmoid activation as an output of
- scan, and apply both the log and sigmoid outside scan such
- that Theano can catch and optimize the expression.
- """
- cross_entropy = T.mean(
- T.sum(
- self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) +
- (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)),
- axis=1
- )
- )
- return cross_entropy
- #
- def test_rbm(learning_rate=0.1, training_epochs=15,
- dataset='mnist.pkl.gz', batch_size=20,
- n_chains=20, n_samples=10, output_folder='rbm_plots',
- n_hidden=500):
- datasets = load_data(dataset)
- train_set_x, train_set_y = datasets[0]
- test_set_x, test_set_y = datasets[2]
- #计算训练的批数
- n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
- index = T.lscalar() # index to a [mini]batch
- x = T.matrix('x') # the data is presented as rasterized images
- rng = numpy.random.RandomState(123)
- theano_rng = RandomStreams(rng.randint(2 ** 30))
- persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),dtype=theano.config.floatX),borrow=True)
- #网络构建 ,可视层神经元个数为28*28,隐藏层神经元为500
- rbm = RBM(input=x, n_visible=28 * 28,n_hidden=n_hidden, numpy_rng=rng, theano_rng=theano_rng)
- # get the cost and the gradient corresponding to one step of CD-15
- cost, updates = rbm.get_cost_updates(lr=learning_rate,persistent=persistent_chain, k=15)
- #################################
- # Training the RBM #
- #################################
- if not os.path.isdir(output_folder):
- os.makedirs(output_folder)
- os.chdir(output_folder)
- # start-snippet-5
- # it is ok for a theano function to have no output
- # the purpose of train_rbm is solely to update the RBM parameters
- train_rbm = theano.function([index],cost,updates=updates,givens={x: train_set_x[index * batch_size: (index + 1) * batch_size]},name='train_rbm')
- plotting_time = 0.
- start_time = timeit.default_timer()
- # go through training epochs
- for epoch in xrange(training_epochs):
- # go through the training set
- mean_cost = []
- for batch_index in xrange(n_train_batches):
- mean_cost += [train_rbm(batch_index)]
- print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost)
- # Plot filters after each training epoch
- plotting_start = timeit.default_timer()
- # Construct image from the weight matrix
- image = Image.fromarray(
- tile_raster_images(
- X=rbm.W.get_value(borrow=True).T,
- img_shape=(28, 28),
- tile_shape=(10, 10),
- tile_spacing=(1, 1)
- )
- )
- image.save('filters_at_epoch_%i.png' % epoch)
- plotting_stop = timeit.default_timer()
- plotting_time += (plotting_stop - plotting_start)
- end_time = timeit.default_timer()
- pretraining_time = (end_time - start_time) - plotting_time
- print ('Training took %f minutes' % (pretraining_time / 60.))
- # end-snippet-5 start-snippet-6
- #################################
- # Sampling from the RBM #
- #################################
- # find out the number of test samples
- number_of_test_samples = test_set_x.get_value(borrow=True).shape[0]
- # pick random test examples, with which to initialize the persistent chain
- test_idx = rng.randint(number_of_test_samples - n_chains)
- persistent_vis_chain = theano.shared(
- numpy.asarray(
- test_set_x.get_value(borrow=True)[test_idx:test_idx + n_chains],
- dtype=theano.config.floatX
- )
- )
- # end-snippet-6 start-snippet-7
- plot_every = 1000
- # define one step of Gibbs sampling (mf = mean-field) define a
- # function that does `plot_every` steps before returning the
- # sample for plotting
- (
- [
- presig_hids,
- hid_mfs,
- hid_samples,
- presig_vis,
- vis_mfs,
- vis_samples
- ],
- updates
- ) = theano.scan(
- rbm.gibbs_vhv,
- outputs_info=[None, None, None, None, None, persistent_vis_chain],
- n_steps=plot_every
- )
- # add to updates the shared variable that takes care of our persistent
- # chain :.
- updates.update({persistent_vis_chain: vis_samples[-1]})
- # construct the function that implements our persistent chain.
- # we generate the "mean field" activations for plotting and the actual
- # samples for reinitializing the state of our persistent chain
- sample_fn = theano.function(
- [],
- [
- vis_mfs[-1],
- vis_samples[-1]
- ],
- updates=updates,
- name='sample_fn'
- )
- # create a space to store the image for plotting ( we need to leave
- # room for the tile_spacing as well)
- image_data = numpy.zeros(
- (29 * n_samples + 1, 29 * n_chains - 1),
- dtype='uint8'
- )
- for idx in xrange(n_samples):
- # generate `plot_every` intermediate samples that we discard,
- # because successive samples in the chain are too correlated
- vis_mf, vis_sample = sample_fn()
- print ' ... plotting sample ', idx
- image_data[29 * idx:29 * idx + 28, :] = tile_raster_images(
- X=vis_mf,
- img_shape=(28, 28),
- tile_shape=(1, n_chains),
- tile_spacing=(1, 1)
- )
- # construct image
- image = Image.fromarray(image_data)
- image.save('samples.png')
- # end-snippet-7
- os.chdir('../')
- if __name__ == '__main__':
- test_rbm()
from: http://blog.csdn.net/hjimce/article/details/48949197