function [coeff, score, latent, tsquare] = princomp(x,econFlag)
%PRINCOMP Principal Components Analysis.
%   COEFF = PRINCOMP(X) performs principal components analysis on the N-by-P
%   data matrix X, and returns the principal component coefficients, also
%   known as loadings.  Rows of X correspond to observations, columns to
%   variables.  COEFF is a P-by-P matrix, each column containing coefficients
%   for one principal component.  The columns are in order of decreasing
%   component variance.
%
%   PRINCOMP centers X by subtracting off column means, but does not
%   rescale the columns of X.  To perform PCA with standardized variables,
%   i.e., based on correlations, use PRINCOMP(ZSCORE(X)).  To perform PCA
%   directly on a covariance or correlation matrix, use PCACOV.
%
%   [COEFF, SCORE] = PRINCOMP(X) returns the principal component scores,
%   i.e., the representation of X in the principal component space.  Rows
%   of SCORE correspond to observations, columns to components.
%
%   [COEFF, SCORE, LATENT] = PRINCOMP(X) returns the principal component
%   variances, i.e., the eigenvalues of the covariance matrix of X, in
%   LATENT.
%
%   [COEFF, SCORE, LATENT, TSQUARED] = PRINCOMP(X) returns Hotelling's
%   T-squared statistic for each observation in X.
%
%   When N <= P, SCORE(:,N:P) and LATENT(N:P) are necessarily zero, and the
%   columns of COEFF(:,N:P) define directions that are orthogonal to X.
%
%   [...] = PRINCOMP(X,'econ') returns only the elements of LATENT that are
%   not necessarily zero, i.e., when N <= P, only the first N-1, and the
%   corresponding columns of COEFF and SCORE.  This can be significantly
%   faster when P >> N.
%
%   See also BARTTEST, BIPLOT, CANONCORR, FACTORAN, PCACOV, PCARES,
%   ROTATEFACTORS.

%   References:
%     [1] Jackson, J.E., A User's Guide to Principal Components,
%         Wiley, 1988.
%     [2] Jolliffe, I.T. Principal Component Analysis, 2nd ed.,
%         Springer, 2002.
%     [3] Krzanowski, W.J., Principles of Multivariate Analysis,
%         Oxford University Press, 1988.
%     [4] Seber, G.A.F., Multivariate Observations, Wiley, 1984.

%   Copyright 1993-2009 The MathWorks, Inc.
%   $Revision: 2.9.2.11 $  $Date: 2009/01/30 14:50:55 $

% When X has more variables than observations, the default behavior is to
% return all the pc's, even those that have zero variance.  When econFlag
% is 'econ', those will not be returned.
if nargin < 2, econFlag = 0; end

[n,p] = size(x);
if isempty(x)
    % Degenerate empty input: return an identity-like COEFF (or empty when
    % 'econ'), and empty/zero outputs of the matching shapes.
    pOrZero = ~isequal(econFlag, 'econ') * p;
    coeff = zeros(p,pOrZero); coeff(1:p+1:end) = 1;
    score = zeros(n,pOrZero);
    latent = zeros(pOrZero,1);
    tsquare = zeros(n,1);
    return
end

% Center X by subtracting off column means
x0 = bsxfun(@minus,x,mean(x,1));

if nargout < 2
    % The principal component coefficients are the eigenvectors of
    % S = X0'*X0./(n-1), but computed using SVD.
    if n >= p && (isequal(econFlag,0) || isequal(econFlag,'econ'))
        % eig on the (small) p-by-p cross-product is cheaper here; fliplr
        % reorders eigenvectors into decreasing-variance order.
        [coeff,ignore] = eig(x0'*x0);
        coeff = fliplr(coeff);
    else
        [ignore,ignore,coeff] = svd(x0,econFlag);
    end

    % When econFlag is 'econ', only (n-1) components should be returned.
    % See comment below.
    if (n <= p) && isequal(econFlag, 'econ')
        coeff(:,n) = [];
    end

else
    r = min(n-1,p); % max possible rank of X0

    % The principal component coefficients are the eigenvectors of
    % S = X0'*X0./(n-1), but computed using SVD.
    [U,sigma,coeff] = svd(x0,econFlag); % put in 1/sqrt(n-1) later

    % Project X0 onto the principal component axes to get the scores.
    if n == 1 % sigma might have only 1 row
        sigma = sigma(1);
    else
        sigma = diag(sigma);
    end
    score = bsxfun(@times,U,sigma'); % == x0*coeff
    sigma = sigma ./ sqrt(n-1);

    % When X has at least as many variables as observations, eigenvalues
    % n:p of S are exactly zero.
    if n <= p
        % When econFlag is 'econ', nothing corresponding to the zero
        % eigenvalues should be returned.  svd(,'econ') won't have
        % returned anything corresponding to components (n+1):p, so we
        % just have to cut off the n-th component.
        if isequal(econFlag, 'econ')
            sigma(n,:) = []; % make sure this shrinks as a column
            coeff(:,n) = [];
            score(:,n) = [];

        % Otherwise, set those eigenvalues and the corresponding scores to
        % exactly zero.  svd(,0) won't have returned columns of U
        % corresponding to components (n+1):p, need to fill those out.
        else
            sigma(n:p,1) = 0; % make sure this extends as a column
            score(:,n:p) = 0;
        end
    end

    % The variances of the pc's are the eigenvalues of S = X0'*X0./(n-1).
    latent = sigma.^2;

    % Hotelling's T-squared statistic is the sum of squares of the
    % standardized scores, i.e., Mahalanobis distances.  When X appears to
    % have column rank < r, ignore components that are orthogonal to the
    % data.
    if nargout == 4
        if n > 1
            q = sum(sigma > max(n,p).*eps(sigma(1)));
            if q < r
                % Fixed: the message needs an escaped newline ('\n'), not a
                % literal 'n', since warning formats it sprintf-style.
                warning('stats:princomp:colRankDefX', ...
                        ['Columns of X are linearly dependent to within machine precision.\n' ...
                         'Using only the first %d components to compute TSQUARED.'],q);
            end
        else
            q = 0;
        end
        tsquare = (n-1) .* sum(U(:,1:q).^2,2); % == sum((score*diag(1./sigma)).^2,2)
    end
end
%% test for princomp (Principal Component Analysis)
% A small worked example explaining the use of princomp in MATLAB (by faruto).
% Assumes the reader has some basic familiarity with PCA.
% 2009.10.27
clear;
clc
%% load cities data
load cities
% whos
%   Name          Size      Bytes   Class
%   categories    9x14        252   char array
%   names       329x43      28294   char array
%   ratings     329x9       23688   double array
%% box plot for ratings data
% To get a quick impression of the ratings data, make a box plot
figure;
boxplot(ratings,'orientation','horizontal','labels',categories);
grid on;
print -djpeg 1;
%% pre-process
% Standardize each column to unit variance before running PCA,
% so that the analysis is effectively based on correlations.
stdr = std(ratings);
sr = ratings./repmat(stdr,329,1);
%% use princomp
[coef,score,latent,t2] = princomp(sr);
%% output arguments explained
% coef: 9x9
%   Principal component coefficients (loadings): the weights applied to each
%   original variable in the linear combinations that form the components.
%   Each column of coef holds the coefficients of one principal component.
%   For example, to extract the first three components: pca3 = coef(:,1:3);
% score: 329x9
%   Principal component scores: the coordinates of the original data in the
%   new principal component space.
% latent: 9x1
%   A column vector of the eigenvalues of the covariance matrix of sr,
%   i.e. latent = sort(eig(cov(sr)),'descend');
%   Check:
%   sort(eig(cov(sr)),'descend') =
%       3.4083
%       1.2140
%       1.1415
%       0.9209
%       0.7533
%       0.6306
%       0.4930
%       0.3180
%       0.1204
%   latent =
%       3.4083
%       1.2140
%       1.1415
%       0.9209
%       0.7533
%       0.6306
%       0.4930
%       0.3180
%       0.1204
% t2: 329x1
%   Hotelling's T-squared, a multivariate statistical distance recording how
%   far each observation lies from the center of the data.
%% how to select components for dimensionality reduction
% latent tells us how many leading components are worth keeping.
figure;
percent_explained = 100*latent/sum(latent);
pareto(percent_explained);
xlabel('Principal Component');
ylabel('Variance Explained (%)');
print -djpeg 2;
% The line in the figure shows the cumulative variance explained.
% The first seven components account for about 90% of the variance,
% so keeping seven components suffices at the 90% level.
%% Visualizing the Results
figure;
biplot(coef(:,1:2), 'scores',score(:,1:2),...
       'varlabels',categories);
axis([-.26 1 -.51 .51]);
print -djpeg 3;
% The x- and y-axes are the first and second principal components.
% Red points are the 329 observations; their coordinates are the scores.
% The direction and length of each blue vector show how much each original
% variable contributes to the two components; their coordinates are coef.
==============本帖子的pdf版本
[hide][/hide]
===================
引用:
回复 8# 5342245 的帖子 设原始数据为X,先不做任何预处理。
[coef,score,latent,t2] = princomp(X);
则那些参数的底层算法大体过程如下:
x0 = bsxfun(@minus,X,mean(X,1));
%x0为将X去均值后的数据。
[coef,ignore] = eig(x0'*x0);
这就是coef的由来。 【当然最终的还有排序什么乱七八糟的。。】
score = x0*coef %
这就是score的由来,就是一个简单的线性变换,将原来的X的坐标转换到主成分空间中的坐标。仅此而已
则模型为从原始数据出发:
score = bsxfun(@minus,X,mean(X,1))*coef;
逆变换:
X = bsxfun(@plus,score*inv(coef),mean(X,1))
以上这些你可以自己验证,看是否正确。
关于你的第三问。对于每一个主成分,就看coef的相应的列就能知道原始的变量那个对该主成分贡献大了啊。。
上面是没有预处理的。如果加了可逆的预处理。则原始数据亦可从预处理后的数据表示出。进而 bla bla....
===============这回够通俗易懂吧。。O(∩_∩)O
PS:pca算法流程,你熟悉吗?只要知道那个算法过程。这些都不难理解啊。。
建议您看看书把pca算法流程再过一遍。。否则别人再怎么说也没用。。。