# 第一步

% Training-mode switches for the hybrid (generative + discriminative) RBM.
  discrim = 1;    % 1 enables the discriminative learning objective, 0 disables it
freeenerg = 1;  % compute the free energy even when discrim is 0

1、生成模型的训练目标函数

2、判别模型的训练目标函数

3、混合模型的训练目标函数

% Hold out the last valid_size rows of the dataset as the validation split.
data_valid = data(end - valid_size + 1 : end, : );
label_valid = labels(end - valid_size + 1 : end, : );

%convert y to the one-of-K (one-hot) notation over num_class classes.
%logical() replaces boolean(): boolean() is a Simulink/Fixed-Point function
%and is not available in base MATLAB; logical() is the base-MATLAB equivalent.
lab = logical(set_y(labels(1:end - valid_size,:), num_class));
%convert the training portion of the dataset into mini-batches
[batch_data, batch_labels , batch_size] = ...
createBatches(data(1:end - valid_size,:), lab, num_batch);

# 第二步

% Per-epoch accumulators, reset at the start of every epoch.
  sum_rec_error = 0;
sum_class_error = 0;
sum_tot_free_energ = 0;

% Switch to the final (larger) momentum after the warm-up epochs;
% the epoch-5 threshold is hard-coded — TODO confirm against the config.
 if  i_epoch == 5
momentum = final_momentum;
end;

%simulated annealing: the learning rate gamma decays hyperbolically with
%the epoch number, controlled by the time constant tau.
gamma = (init_gamma) / (1 + (i_epoch - 1) / tau);

# 第三步

% Fetch the current mini-batch; transpose so each COLUMN is one sample.
  x = batch_data{iBatch}';
y = batch_labels{iBatch}';
% The last batch may be smaller, so recompute the actual batch size.
batch_size=size(x,2);

% Positive phase: hidden activation probabilities given data and labels.
 hprob = prob_h_given_xy(x, y, w, u, b_h);


function [ prob ] = prob_h_given_xy( x, y, w, u, b_h )
% PROB_H_GIVEN_XY  p(h=1 | x, y) for a classification RBM.
%   x   : visible units, one column per sample
%   y   : one-hot class labels, one column per sample
%   w   : hidden-by-visible weight matrix
%   u   : hidden-by-class weight matrix
%   b_h : hidden bias (column vector)
% Returns the element-wise logistic sigmoid of (w*x + u*y + b_h).
% The sigmoid is written out directly instead of sigmf(z, [1 0]) — sigmf
% needs the Fuzzy Logic Toolbox and is identical to 1./(1+exp(-z)).
	prob = 1 ./ (1 + exp(-bsxfun(@plus, w * x + u * y, b_h)));
end

% Draw a binary hidden state from the activation probabilities.
h = sampling(hprob);

function [ samp ] = sampling( p )
% SAMPLING  Draw Bernoulli samples from a matrix of probabilities.
%   p : matrix of activation probabilities in [0, 1]
% Returns a logical matrix with samp(i,j) = 1 with probability p(i,j):
% each entry is compared against an independent uniform draw in [0,1).
	samp = p > rand(size(p));
end

function [ prob ] = prob_x_given_h( h, w, b_x )
% PROB_X_GIVEN_H  p(x=1 | h) for the RBM.
%   h   : hidden units, one column per sample
%   w   : hidden-by-visible weight matrix (transposed here to map back
%         from hidden space to visible space)
%   b_x : visible bias (column vector)
% NOTE(review): relies on the project helper sig(); presumably the
% logistic sigmoid, matching prob_h_given_xy — confirm its definition.
	prob = sig(bsxfun(@plus, w' * h, b_x));
end



function [ prob ] = prob_y_given_h( h, u , b_y )
% PROB_Y_GIVEN_H  p(y | h): softmax over classes given the hidden state.
%   h   : hidden units, one column per sample
%   u   : hidden-by-class weight matrix
%   b_y : class bias (column vector, one entry per class)
% Returns a (num_class x num_samples) matrix whose columns sum to 1.
	logits = bsxfun(@plus, u' * h, b_y);
	% Subtract the per-column maximum before exponentiating so large
	% logits cannot overflow exp(); the softmax value is unchanged.
	prob_not_norm = exp(bsxfun(@minus, logits, max(logits, [], 1)));
	% Pin the summing dimension so a single-class edge case cannot
	% silently collapse the wrong axis.
	norm_factor = sum(prob_not_norm, 1);
	prob = bsxfun(@rdivide, prob_not_norm, norm_factor);
end


% Negative phase (CD-1): reconstruct visibles and labels from the
% sampled hidden state, then resample.
xrprob = prob_x_given_h( h, w, b_x );
yrprob = prob_y_given_h( h, u, b_y );
xr = sampling(xrprob);
yr = sampling_y(yrprob);

% Hidden probabilities driven by the reconstruction.
hrprob = prob_h_given_xy( xr, yr, w, u, b_h);

% Generative (CD) gradients: positive-phase statistics minus
% negative-phase statistics, averaged over the batch.
g_w_gen = (hprob * x' - hrprob * xrprob') / batch_size;
g_u_gen = (hprob * y' - hrprob * yrprob') / batch_size;
g_b_x_gen = (sum(x,2) - sum(xrprob,2)) / batch_size;
g_b_y_gen = (sum(y,2) - sum(yrprob,2)) / batch_size;
g_b_h_gen = (sum(hprob,2) - sum(hrprob,2)) / batch_size;

# 第四步

% Discriminative branch: only entered when the discriminative objective
% or the free-energy bookkeeping is enabled. NOTE(review): this if is
% closed outside this excerpt.
    if (discrim || freeenerg)
% Scalar zeros here rely on MATLAB scalar expansion when the matrix
% accumulations below are added to them.
g_b_h_disc_acc = 0;
g_w_disc_acc = 0;
g_u_disc_acc = zeros(h_size,num_class);

①首先发现式子中始终有w*x这一项，所以提出来计算一下，避免后面的重复计算

      %auxiliary variable: w*x appears in every per-class term below, so
      %compute it once here instead of num_class times inside the loop
w_times_x = w * x;
②接下来计算自由能量函数

 for iClass= 1:num_class
o_t_y(:,:,iClass) = bsxfun(@plus, b_h + u(:,iClass), w_times_x );
neg_free_energ(iClass,:) = b_y(iClass) + sum(log(1 + exp(o_t_y(:,:,iClass))));
end;

o_t_y 的维度是 800×716×30，三个维度分别对应隐层神经元个数、一批样本数和标签总数：相当于对每个标签各算出一个 800×716 的隐层输入矩阵。后面的计算也正是如此——用一个 for 循环沿第三个维度逐一遍历这 30 个 800×716 矩阵，以计算每个样本属于各个标签的（非归一化）概率。

neg_free_energ大小是30*716，分别代表标签总数、样本总数

% Accumulate the free energy of the TRUE labels only: y is a logical
% one-hot matrix, so neg_free_energ(y) picks one entry per sample.
sum_tot_free_energ = sum_tot_free_energ + sum(-neg_free_energ(y));

③接下来就是计算discrim对应的参数更新梯度了

        % Softmax over classes: p(y=c | x) is proportional to exp(-F(x,c)).
        % Shift by the per-column maximum before exponentiating (the
        % log-sum-exp trick) so large free energies cannot overflow exp().
        % The original subtracted mean2(...) — an Image Processing Toolbox
        % function, and a scalar shift that does not bound the exponent.
        p_y_given_x = exp(bsxfun(@minus, neg_free_energ, max(neg_free_energ, [], 1)));

p_y_given_x = bsxfun(@rdivide, p_y_given_x, sum(p_y_given_x, 1));
④最后就是计算梯度了

% NOTE(review): the iClass-indexed lines below are the BODY of the
% "for iClass = 1:num_class" loop whose header appears further down in
% this excerpt — the post's ordering is scrambled; restore it when
% assembling runnable code.

        % Difference between the one-hot target and the model posterior;
        % summing it over samples gives the class-bias gradient.
        dif_y_p_y = y - p_y_given_x;
g_b_y_disc_acc = sum(dif_y_p_y,2);

% Hidden-unit activation probabilities assuming the label is iClass.
sig_o_t_y = sig(o_t_y(:,:,iClass));
% Keep only the columns (samples) whose true label IS iClass.
sig_o_t_y_selected = sig_o_t_y(:,y(iClass,:)); 

% Positive term: samples of class iClass; negative term: all samples
% weighted by the posterior p(y=iClass | x).
g_b_h_disc_acc = g_b_h_disc_acc + sum(sig_o_t_y_selected, 2) - sum(bsxfun(@times, sig_o_t_y, p_y_given_x(iClass,:)),2);
%update the gradient of w for the class 'iClass'
g_w_disc_acc = g_w_disc_acc + sig_o_t_y_selected * x(:,y(iClass,:))' - bsxfun(@times, sig_o_t_y, p_y_given_x(iClass,:)) * x';
%update the gradient of u for the class 'iClass'
g_u_disc_acc(:,iClass) = sum(bsxfun(@times, sig_o_t_y, dif_y_p_y(iClass,:)), 2);

% Loop over classes; NOTE(review): its body is the iClass-indexed block
% shown ABOVE in this excerpt (the post's ordering is scrambled).
  for iClass= 1:num_class

⑤全部计算完了，就更新一下判别模型中的几个参数吧

% Average the accumulated discriminative gradients over the mini-batch.
g_b_y_disc = g_b_y_disc_acc / batch_size;
g_b_h_disc = g_b_h_disc_acc / batch_size;
g_w_disc = g_w_disc_acc / batch_size;
g_u_disc = g_u_disc_acc / batch_size;

# 第五步

% Momentum updates. Each delta blends the previous step (momentum) with
% the new gradient scaled by gamma; the hybrid objective mixes the
% discriminative gradient (gated by discrim) with the generative one
% (weighted by alpha). Weight decay (weight_cost) applies only to the
% weight matrices w and u, not to the biases.
 delta_w = delta_w * momentum + ...
gamma * (discrim * g_w_disc + alpha * g_w_gen - weight_cost * w);
delta_u = delta_u * momentum + ...
gamma * (discrim * g_u_disc + alpha * g_u_gen - weight_cost * u);
% The visible bias has no discriminative term — only the generative
% objective touches b_x.
delta_b_x = delta_b_x * momentum + ...
gamma * alpha * g_b_x_gen;
delta_b_y = delta_b_y * momentum + ...
gamma * (discrim * g_b_y_disc + alpha * g_b_y_gen);
delta_b_h = delta_b_h * momentum + ...
gamma * (discrim * g_b_h_disc + alpha * g_b_h_gen);
% Apply the steps.
w = w + delta_w;
u = u + delta_u;
b_x = b_x + delta_b_x;
b_y = b_y + delta_b_y;
b_h = b_h + delta_b_h;

# 第六步

function [ err ] = predict( testdata, testlabels, num_class, b_y, b_h, w, u )
% PREDICT  Classification error (%) of the RBM on a labelled test set.
%   testdata   : (num_cases x num_features) matrix, one row per sample
%   testlabels : (num_cases x 1) integer class labels in 1..num_class
%   num_class  : number of classes
%   b_y, b_h   : class and hidden bias vectors
%   w, u       : hidden-by-visible and hidden-by-class weight matrices
% Each sample is assigned the class with the largest negative free
% energy; err is the percentage of misclassified samples.

n_cases = size(testdata, 1);

% Shared term: hidden-unit input from the visible layer, one column per
% test case, computed once outside the class loop.
wx = w * testdata';

% Negative free energy of every (class, case) pair.
neg_free_energ = zeros(num_class, n_cases);
for c = 1:num_class
    activation = bsxfun(@plus, b_h + u(:,c), wx);
    neg_free_energ(c,:) = b_y(c) + sum(log(1 + exp(activation)));
end

% Best class per column, scored against the ground-truth labels.
[~, prediction] = max(neg_free_energ);
err = 100 * sum(prediction ~= testlabels') / n_cases;

end