%%Machine learning From Andrew
%%By youknowwho3_3 in CSDN #GirlsHelpGirls#DOUBANEZU
%%Regularized Linear Regression and Bias vs. Variance
%implement regularized linear regression and use it to study
%models with different bias-variance properties
%1. Regularized Linear Regression
% 1.1 Visualizing the dataset
% 1.2 Regularized linear regression cost function
% 1.3 Regularized linear regression gradient
% 1.4 Fitting linear regression
%2. Bias-variance
% 2.1 Learning curves
%3. Polynomial regression
% 3.1 Learning Polynomial Regression
% 3.2 Optional (ungraded) exercise: Adjusting the regularization parameter
% 3.3 Selecting lambda using a cross validation set
% 3.4 Optional (ungraded) exercise: Computing test set error
% 3.5 Optional (ungraded) exercise: Plotting learning curves with randomly selected examples
%%
%%1. Regularized Linear Regression
%%1.1 Visualizing the dataset
% Load from ex5data1:
% You will have X, y, Xval, yval, Xtest, ytest in your environment
load ('ex5data1.mat');
%size(X)     = 12x1
%size(Xtest) = 21x1
%size(Xval)  = 21x1
%size(y)     = 12x1
%size(ytest) = 21x1
%size(yval)  = 21x1
% m = Number of examples
m = size(X, 1);
% Plot training data
figure;
plot(X, y, 'rx', 'MarkerSize', 10, 'LineWidth', 1.5);
xlabel('Change in water level (x)');
ylabel('Water flowing out of the dam (y)');
%%1.2 Regularized linear regression cost function
theta = [1; 1]; % size(theta) = 2x1
J = linearRegCostFunction([ones(m, 1) X], y, theta, 1);
fprintf('Expected cost at theta = [1; 1]: 303.993192\n');
fprintf('Cost at theta = [1; 1]: %f\n', J);
%%1.3 Regularized linear regression gradient
[J, Grad] = linearRegCostFunction([ones(m, 1) X], y, theta, 1);
fprintf('Expected gradient at theta = [1; 1]: [-15.30; 598.250]\n');
fprintf('Gradient at theta = [1 ; 1]: [%f; %f]\n',Grad(1), Grad(2));
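% Optional sanity check (not part of the original exercise steps): compare the
% analytic gradient against a central finite-difference approximation of the
% cost. The step size eps_fd and the variable names below are illustrative choices.
numGrad = zeros(size(theta));
eps_fd = 1e-4;
for k = 1:numel(theta)
    e = zeros(size(theta));
    e(k) = eps_fd;
    J_plus  = linearRegCostFunction([ones(m, 1) X], y, theta + e, 1);
    J_minus = linearRegCostFunction([ones(m, 1) X], y, theta - e, 1);
    numGrad(k) = (J_plus - J_minus) / (2*eps_fd);
end
fprintf('Max |analytic - numerical| gradient difference: %g\n', max(abs(Grad - numGrad)));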
%%1.4 Fitting linear regression
% Train linear regression with lambda = 0
lambda = 0;
[theta] = trainLinearReg([ones(m, 1) X], y, lambda);
% Plot fit over the data
figure;
plot(X, y, 'rx', 'MarkerSize', 10, 'LineWidth', 1.5);
xlabel('Change in water level (x)');
ylabel('Water flowing out of the dam (y)');
hold on;
plot(X, [ones(m, 1) X]*theta,'b--', 'LineWidth', 2)
%draw the fitted line as a blue dashed line
hold off;
%%2. Bias-variance
%%2.1 Learning curves
lambda = 0;
[error_train, error_val] = learningCurve([ones(m, 1) X], y, [ones(size(Xval, 1), 1) Xval], yval, lambda);
figure;
plot(1:m, error_train, 1:m, error_val);
title('Learning curve for linear regression')
legend('Train', 'Cross Validation')
xlabel('Number of training examples')
ylabel('Error')
axis([0 13 0 150])
fprintf('# Training Examples\tTrain Error\tCross Validation Error\n');
for i = 1:m
    fprintf(' \t%d\t\t%f\t%f\n', i, error_train(i), error_val(i));
end
%%3. Polynomial regression
p = 8;
% Map X onto Polynomial Features and Normalize
X_poly = polyFeatures(X, p);
[X_poly, mu, sigma] = featureNormalize(X_poly); % Normalize
X_poly = [ones(m, 1), X_poly]; % Add Ones
% Map X_poly_test and normalize (using mu and sigma)
X_poly_test = polyFeatures(Xtest, p);
X_poly_test = X_poly_test-mu; % uses implicit expansion instead of bsxfun
X_poly_test = X_poly_test./sigma; % uses implicit expansion instead of bsxfun
X_poly_test = [ones(size(X_poly_test, 1), 1), X_poly_test]; % Add Ones
% Map X_poly_val and normalize (using mu and sigma)
X_poly_val = polyFeatures(Xval, p);
X_poly_val = X_poly_val-mu; % uses implicit expansion instead of bsxfun
X_poly_val = X_poly_val./sigma; % uses implicit expansion instead of bsxfun
X_poly_val = [ones(size(X_poly_val, 1), 1), X_poly_val]; % Add Ones
fprintf('Normalized Training Example 1:\n');
fprintf(' %f \n', X_poly(1, :));
% Train the model
lambda = 0;
[theta] = trainLinearReg(X_poly, y, lambda);
% Plot training data and fit
figure;
plot(X, y, 'rx', 'MarkerSize', 10, 'LineWidth', 1.5);
plotFit(min(X), max(X), mu, sigma, theta, p);
xlabel('Change in water level (x)');
ylabel('Water flowing out of the dam (y)');
title (sprintf('Polynomial Regression Fit (lambda = %f)', lambda));
[error_train, error_val] = learningCurve(X_poly, y, X_poly_val, yval, lambda);
figure;
plot(1:m, error_train, 1:m, error_val);
title(sprintf('Polynomial Regression Learning Curve (lambda = %f)', lambda));
xlabel('Number of training examples')
ylabel('Error')
axis([0 13 0 100])
legend('Train', 'Cross Validation')
[lambda_vec, error_train, error_val] = validationCurve(X_poly, y, X_poly_val, yval);
figure;
plot(lambda_vec, error_train, lambda_vec, error_val);
legend('Train', 'Cross Validation');
xlabel('lambda');
ylabel('Error');
fprintf('lambda\t\tTrain Error\tValidation Error\n');
for i = 1:length(lambda_vec)
    fprintf('%f\t%f\t%f\n', lambda_vec(i), error_train(i), error_val(i));
end
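%%3.4 Optional (ungraded) exercise: Computing test set error
% A minimal sketch: pick the lambda with the lowest cross-validation error,
% retrain on the training set with that lambda, and report the unregularized
% error on the test set. The variable names (best_idx, best_lambda, error_test)
% are illustrative; the handout reports a test error of about 3.8599 for
% lambda = 3, though the exact value depends on your implementation.
[~, best_idx] = min(error_val);
best_lambda = lambda_vec(best_idx);
theta = trainLinearReg(X_poly, y, best_lambda);
error_test = linearRegCostFunction(X_poly_test, ytest, theta, 0);
fprintf('Best lambda = %f, Test error = %f\n', best_lambda, error_test);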
%function validationCurve.m
function [lambda_vec, error_train, error_val]= validationCurve(X, y, Xval, yval)
% Selected values of lambda (you should not change this)
lambda_vec = [0 0.001 0.003 0.01 0.03 0.1 0.3 1 3 10]';
% You need to return these variables correctly.
error_train = zeros(length(lambda_vec), 1);
error_val = zeros(length(lambda_vec), 1);
for i = 1:length(lambda_vec)
    theta = trainLinearReg(X, y, lambda_vec(i));
    error_train(i) = linearRegCostFunction(X, y, theta, 0);
    error_val(i) = linearRegCostFunction(Xval, yval, theta, 0);
end
end
% Hypothesis of polynomial regression:
%   h_theta(x) = theta_0 + theta_1*x + theta_2*x^2 + ... + theta_p*x^p
%function polyFeatures.m
function X_poly = polyFeatures(X, p)
%POLYFEATURES Maps X (a column vector) onto its first p polynomial powers.
% Specifically, when a training set X of size m x 1 is passed into the function,
% it should return an m x p matrix X_poly, where column 1 holds the original
% values of X, column 2 holds the values of X.^2, column 3 holds the values of
% X.^3, and so on.
% Note that you don't have to account for the zero-th power in this function.
% The main script applies this mapping to the training set, the test set, and
% the cross validation set.
X_poly = X;
for i = 2:p
    X_poly = [X_poly, X.^i];
end
end
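% Worked example of the mapping (arbitrary values, shown as a comment):
%   polyFeatures([2; 3], 3) returns
%     [2  4   8;
%      3  9  27]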
% Regularized linear regression cost function:
%   J(theta) = 1/(2m) * sum_i (h_theta(x^(i)) - y^(i))^2 + lambda/(2m) * sum_{j>=1} theta_j^2
% Its partial derivatives (the bias term theta_0 is not regularized):
%   dJ/dtheta_0 = 1/m * sum_i (h_theta(x^(i)) - y^(i)) * x_0^(i)
%   dJ/dtheta_j = 1/m * sum_i (h_theta(x^(i)) - y^(i)) * x_j^(i) + (lambda/m)*theta_j,  for j >= 1
%function linearRegCostFunction
function [J,Grad] = linearRegCostFunction(X, y, theta, lambda)
m = length(y);  % m = 12, size(X) = 12x2
h = X*theta;    % size(h) = 12x1
J = 0;
Grad = zeros(size(theta));
J = 1/(2*m)*sum((h-y).^2) + lambda/(2*m)*sum(theta(2:end).^2);
% size((h-y).^2) = 12x1, so sum over the elements
% theta(2:end): rows 2 through the end of theta (the bias term theta(1) is not regularized)
Grad(1) = (1/m)*(X(:,1)'*(h-y)); % scalar == 1x1
Grad(2:end) = (1/m)*(X(:,2:end)'*(h-y)) + (lambda/m)*theta(2:end); % n x 1
%Grad = 1/m*(h-y)'*X + lambda/m*theta(2:end);
% The one-liner above does not work: theta(2:end) drops the bias term while X still
% contains every column, so the regularization term would be misaligned.
Grad = Grad(:);
end
%function learningCurve
function [error_train,error_val] = learningCurve(X, y, Xval, yval, lambda)
m = size(X, 1);
error_train = zeros(m, 1);
error_val = zeros(m, 1);
for i = 1:m % train on the first i examples only
    theta = trainLinearReg(X(1:i,:), y(1:i), lambda);
    % training error on the examples used for training (no regularization term)
    error_train(i) = linearRegCostFunction(X(1:i,:), y(1:i), theta, 0);
    % validation error is always computed on the full validation set
    error_val(i) = linearRegCostFunction(Xval, yval, theta, 0);
end
end
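%%3.5 Optional (ungraded) exercise: learning curves with randomly selected examples
% A minimal sketch, not the handout's reference implementation: for each
% training-set size i, draw i random training examples and i random
% cross-validation examples, train on the former, and average both errors over
% several repetitions (reps = 50 here is an arbitrary choice). The function
% name learningCurveRandom is our own; call it in place of learningCurve above.
function [error_train, error_val] = learningCurveRandom(X, y, Xval, yval, lambda)
m = size(X, 1);
mval = size(Xval, 1);               % assumes the validation set has at least m examples
reps = 50;                          % number of random draws to average over
error_train = zeros(m, 1);
error_val = zeros(m, 1);
for i = 1:m
    for r = 1:reps
        idx_train = randperm(m, i);     % i random training examples
        idx_val = randperm(mval, i);    % i random cross-validation examples
        theta = trainLinearReg(X(idx_train, :), y(idx_train), lambda);
        error_train(i) = error_train(i) + linearRegCostFunction(X(idx_train, :), y(idx_train), theta, 0);
        error_val(i) = error_val(i) + linearRegCostFunction(Xval(idx_val, :), yval(idx_val), theta, 0);
    end
    error_train(i) = error_train(i) / reps;
    error_val(i) = error_val(i) / reps;
end
end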
function [theta] = trainLinearReg(X, y, lambda)
%TRAINLINEARREG Trains linear regression given a dataset (X, y) and a
%regularization parameter lambda
% [theta] = TRAINLINEARREG (X, y, lambda) trains linear regression using
% the dataset (X, y) and regularization parameter lambda. Returns the
% trained parameters theta.
%
% Initialize Theta
%size(X)=12*2
initial_theta = zeros(size(X, 2), 1);
% Create "short hand" for the cost function to be minimized
costFunction = @(t)linearRegCostFunction(X, y, t, lambda);
% Now, costFunction is a function that takes in only one argument
options = optimset('MaxIter', 200, 'GradObj', 'on');
% Minimize using fmincg
theta = fmincg(costFunction, initial_theta, options);
end
function [X, fX, i] = fmincg(f, X, options, P1, P2, P3, P4, P5)
% Minimize a continuous differentiable multivariate function. Starting point
% is given by "X" (D by 1), and the function named in the string "f", must
% return a function value and a vector of partial derivatives. The Polack-
% Ribiere flavour of conjugate gradients is used to compute search directions,
% and a line search using quadratic and cubic polynomial approximations and the
% Wolfe-Powell stopping criteria is used together with the slope ratio method
% for guessing initial step sizes. Additionally a bunch of checks are made to
% make sure that exploration is taking place and that extrapolation will not
% be unboundedly large. The "length" gives the length of the run: if it is
% positive, it gives the maximum number of line searches, if negative its
% absolute gives the maximum allowed number of function evaluations. You can
% (optionally) give "length" a second component, which will indicate the
% reduction in function value to be expected in the first line-search (defaults
% to 1.0). The function returns when either its length is up, or if no further
% progress can be made (ie, we are at a minimum, or so close that due to
% numerical problems, we cannot get any closer). If the function terminates
% within a few iterations, it could be an indication that the function value
% and derivatives are not consistent (ie, there may be a bug in the
% implementation of your "f" function). The function returns the found
% solution "X", a vector of function values "fX" indicating the progress made
% and "i" the number of iterations (line searches or function evaluations,
% depending on the sign of "length") used.
%
% Usage: [X, fX, i] = fmincg(f, X, options, P1, P2, P3, P4, P5)
%
% See also: checkgrad
%
% Copyright (C) 2001 and 2002 by Carl Edward Rasmussen. Date 2002-02-13
%
%
% (C) Copyright 1999, 2000 & 2001, Carl Edward Rasmussen
%
% Permission is granted for anyone to copy, use, or modify these
% programs and accompanying documents for purposes of research or
% education, provided this copyright notice is retained, and note is
% made of any changes that have been made.
%
% These programs and documents are distributed without any warranty,
% express or implied. As the programs were written for research
% purposes only, they have not been tested to the degree that would be
% advisable in any important application. All use of these programs is
% entirely at the user's own risk.
%
% [ml-class] Changes Made:
% 1) Function name and argument specifications
% 2) Output display
%
% Read options
if exist('options', 'var') && ~isempty(options) && isfield(options, 'MaxIter')
length = options.MaxIter;
else
length = 100;
end
RHO = 0.01; % a bunch of constants for line searches
SIG = 0.5; % RHO and SIG are the constants in the Wolfe-Powell conditions
INT = 0.1; % don't reevaluate within 0.1 of the limit of the current bracket
EXT = 3.0; % extrapolate maximum 3 times the current bracket
MAX = 20; % max 20 function evaluations per line search
RATIO = 100; % maximum allowed slope ratio
argstr = ['feval(f, X']; % compose string used to call function
for i = 1:(nargin - 3)
argstr = [argstr, ',P', int2str(i)];
end
argstr = [argstr, ')'];
if max(size(length)) == 2
red=length(2);
length=length(1);
else red=1;
end
S=['Iteration '];
i = 0; % zero the run length counter
ls_failed = 0; % no previous line search has failed
fX = [];
[f1 df1] = eval(argstr); % get function value and gradient
i = i + (length<0); % count epochs?!
s = -df1; % search direction is steepest
d1 = -s'*s; % this is the slope
z1 = red/(1-d1); % initial step is red/(|s|+1)
while i < abs(length) % while not finished
i = i + (length>0); % count iterations?!
X0 = X; f0 = f1; df0 = df1; % make a copy of current values
X = X + z1*s; % begin line search
[f2 df2] = eval(argstr);
i = i + (length<0); % count epochs?!
d2 = df2'*s;
f3 = f1; d3 = d1; z3 = -z1; % initialize point 3 equal to point 1
if length>0, M = MAX; else M = min(MAX, -length-i); end
success = 0; limit = -1; % initialize quantities
while 1
while ((f2 > f1+z1*RHO*d1) || (d2 > -SIG*d1)) && (M > 0)
limit = z1; % tighten the bracket
if f2 > f1
z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3); % quadratic fit
else
A = 6*(f2-f3)/z3+3*(d2+d3); % cubic fit
B = 3*(f3-f2)-z3*(d3+2*d2);
z2 = (sqrt(B*B-A*d2*z3*z3)-B)/A; % numerical error possible - ok!
end
if isnan(z2) || isinf(z2)
z2 = z3/2; % if we had a numerical problem then bisect
end
z2 = max(min(z2, INT*z3),(1-INT)*z3); % don't accept too close to limits
z1 = z1 + z2; % update the step
X = X + z2*s;
[f2 df2] = eval(argstr);
M = M - 1; i = i + (length<0); % count epochs?!
d2 = df2'*s;
z3 = z3-z2; % z3 is now relative to the location of z2
end
if f2 > f1+z1*RHO*d1 || d2 > -SIG*d1
break; % this is a failure
elseif d2 > SIG*d1
success = 1; break; % success
elseif M == 0
break; % failure
end
A = 6*(f2-f3)/z3+3*(d2+d3); % make cubic extrapolation
B = 3*(f3-f2)-z3*(d3+2*d2);
z2 = -d2*z3*z3/(B+sqrt(B*B-A*d2*z3*z3)); % num. error possible - ok!
if ~isreal(z2) || isnan(z2) || isinf(z2) || z2 < 0 % num prob or wrong sign?
if limit < -0.5 % if we have no upper limit
z2 = z1 * (EXT-1); % then extrapolate the maximum amount
else
z2 = (limit-z1)/2; % otherwise bisect
end
elseif (limit > -0.5) && (z2+z1 > limit) % extrapolation beyond max?
z2 = (limit-z1)/2; % bisect
elseif (limit < -0.5) && (z2+z1 > z1*EXT) % extrapolation beyond limit
z2 = z1*(EXT-1.0); % set to extrapolation limit
elseif z2 < -z3*INT
z2 = -z3*INT;
elseif (limit > -0.5) && (z2 < (limit-z1)*(1.0-INT)) % too close to limit?
z2 = (limit-z1)*(1.0-INT);
end
f3 = f2; d3 = d2; z3 = -z2; % set point 3 equal to point 2
z1 = z1 + z2; X = X + z2*s; % update current estimates
[f2 df2] = eval(argstr);
M = M - 1; i = i + (length<0); % count epochs?!
d2 = df2'*s;
end % end of line search
if success % if line search succeeded
f1 = f2; fX = [fX' f1]';
fprintf('%s %4i | Cost: %4.6e\r', S, i, f1);
s = (df2'*df2-df1'*df2)/(df1'*df1)*s - df2; % Polack-Ribiere direction
tmp = df1; df1 = df2; df2 = tmp; % swap derivatives
d2 = df1'*s;
if d2 > 0 % new slope must be negative
s = -df1; % otherwise use steepest direction
d2 = -s'*s;
end
z1 = z1 * min(RATIO, d1/(d2-realmin)); % slope ratio but max RATIO
d1 = d2;
ls_failed = 0; % this line search did not fail
else
X = X0; f1 = f0; df1 = df0; % restore point from before failed line search
if ls_failed || i > abs(length) % line search failed twice in a row
break; % or we ran out of time, so we give up
end
tmp = df1; df1 = df2; df2 = tmp; % swap derivatives
s = -df1; % try steepest
d1 = -s'*s;
z1 = 1/(1-d1);
ls_failed = 1; % this line search failed
end
if exist('OCTAVE_VERSION')
fflush(stdout);
end
end
fprintf('\n');
end
function [X_norm, mu, sigma] = featureNormalize(X)
%FEATURENORMALIZE Normalizes the features in X
% FEATURENORMALIZE(X) returns a normalized version of X where
% the mean value of each feature is 0 and the standard deviation
% is 1. This is often a good preprocessing step to do when
% working with learning algorithms.
mu = mean(X);
X_norm = bsxfun(@minus, X, mu);
sigma = std(X_norm);
X_norm = bsxfun(@rdivide, X_norm, sigma);
% ============================================================
end
function plotFit(min_x, max_x, mu, sigma, theta, p)
%PLOTFIT Plots a learned polynomial regression fit over an existing figure.
%Also works with linear regression.
% PLOTFIT(min_x, max_x, mu, sigma, theta, p) plots the learned polynomial
% fit with power p and feature normalization (mu, sigma).
% Hold on to the current figure
hold on;
% We plot a range slightly bigger than the min and max values to get
% an idea of how the fit will vary outside the range of the data points
x = (min_x - 15: 0.05 : max_x + 25)';
% Map the X values
X_poly = polyFeatures(x, p);
X_poly = bsxfun(@minus, X_poly, mu);
X_poly = bsxfun(@rdivide, X_poly, sigma);
% Add ones
X_poly = [ones(size(x, 1), 1) X_poly];
% Plot
plot(x, X_poly * theta, '--', 'LineWidth', 2)
% Release the hold on the current figure
hold off;
end