function [idx, C, sumD, D] = kmeans(X, k, varargin)
% varargin:实际输入参量
if nargin 1
% 大于1刚至少有一种距离
error(sprintf('Ambiguous ''distance'' parameter value: %s.', distance));
elseif isempty(i)
% 如果是空的,则表明没有合适的距离
error(sprintf('Unknown ''distance'' parameter value: %s.', distance));
end
% 针对不同的距离,处理不同
distance = distNames{i};
switch distance
case 'cityblock'
% sort 列元素按升序排列,Xord中存的是元素在原始矩阵中的列中对应的大小位置
[Xsort,Xord] = sort(X,1);
case 'cosine'
% 余弦
% 计算每一行的和的平方根
Xnorm = sqrt(sum(X.^2, 2));
if any(min(Xnorm) <= eps * max(Xnorm))
error(['Some points have small relative magnitudes, making them ', ...
'effectively zero.\nEither remove those points, or choose a ', ...
'distance other than ''cosine''.'], []);
end
% 标量化 Xnorm(:,ones(1,p))得到n*p的矩阵
X = X ./ Xnorm(:,ones(1,p));
case 'correlation'
% 线性化
X = X - repmat(mean(X,2),1,p);
% 计算每一行的和的平方根
Xnorm = sqrt(sum(X.^2, 2));
if any(min(Xnorm) 1
error(sprintf('Ambiguous ''start'' parameter value: %s.', start));
elseif isempty(i)
error(sprintf('Unknown ''start'' parameter value: %s.', start));
elseif isempty(k)
error('You must specify the number of clusters, K.');
end
start = startNames{i};
% strcmp比较两个字符串 uniform是在X中随机选择K个点
if strcmp(start, 'uniform')
if strcmp(distance, 'hamming')
error('Hamming distance cannot be initialized with uniform random values.');
end
% 标量化后的X
Xmins = min(X,1);
Xmaxs = max(X,1);
end
elseif isnumeric(start) % 判断输入是否是一个数 这里的start是一个K*P的矩阵,表示初始聚类中心
CC = start; % CC表示初始聚类中心
start = 'numeric';
if isempty(k)
k = size(CC,1);
elseif k ~= size(CC,1);
error('The ''start'' matrix must have K rows.');
elseif size(CC,2) ~= p
error('The ''start'' matrix must have the same number of columns as X.');
end
if isempty(reps)
reps = size(CC,3);
elseif reps ~= size(CC,3);
error('The third dimension of the ''start'' array must match the ''replicates'' parameter value.');
end
% Need to center explicit starting points for 'correlation'.
% 线性距离需要指定具体的开始点
% (Re)normalization for 'cosine'/'correlation' is done at each
% iteration.每一次迭代进行“余弦和线性”距离正规化
% 判断是否相等
if isequal(distance, 'correlation')
% repmat复制矩阵1*P*1 线性化
CC = CC - repmat(mean(CC,2),[1,p,1]);
end
else
error('The ''start'' parameter value must be a string or a numeric matrix or array.');
end
% ------------------------------------------------------------------
% 如果一个聚类丢失了所有成员,这个进程将被调用
if ischar(emptyact)
emptyactNames = {'error','drop','singleton'};
i = strmatch(lower(emptyact), emptyactNames);
if length(i) > 1
error(sprintf('Ambiguous ''emptyaction'' parameter value: %s.', emptyact));
elseif isempty(i)
error(sprintf('Unknown ''emptyaction'' parameter value: %s.', emptyact));
end
emptyact = emptyactNames{i};
else
error('The ''emptyaction'' parameter value must be a string.');
end
% ------------------------------------------------------------------
% 控制输出的显示示信息
if ischar(display)
% strvcat 垂直连接字符串
i = strmatch(lower(display), strvcat('off','notify','final','iter'));
if length(i) > 1
error(sprintf('Ambiguous ''display'' parameter value: %s.', display));
elseif isempty(i)
error(sprintf('Unknown ''display'' parameter value: %s.', display));
end
display = i-1;
else
error('The ''display'' parameter value must be a string.');
end
% ------------------------------------------------------------------
% 输入参数K的控制
if k == 1
error('The number of clusters must be greater than 1.');
elseif n 2 % 'iter',\t 表示向后空出8个字符
disp(sprintf(' iter\t phase\t num\t sum'));
end
% ------------------------------------------------------------------
% Begin phase one: batch reassignments 第一队段:分批赋值
converged = false;
iter = 0;
while true
% Compute the distance from every point to each cluster centroid
% 计算每一个点到每一个聚类中心的距离,D中存放的是N*K个数值
D(:,changed) = distfun(X, C(changed,:), distance, iter);
% Compute the total sum of distances for the current configuration.
% Can't do it first time through, there's no configuration yet.
% 计算当前配置的总距离,但第一次不能执行,因为还没有配置
if iter > 0
totsumD = sum(D((idx-1)*n + (1:n)'));
% Test for a cycle: if objective is not decreased, back out
% the last step and move on to the single update phase
% 循环测试:如果目标没有减少,退出到最后一步,移动到第二阶段
% prevtotsumD表示前一次的总距离,如果前一次的距离比当前的小,则聚类为上一次的,即不发生变化
if prevtotsumD 2 % 'iter'
disp(sprintf(dispfmt,iter,1,length(moved),totsumD));
end
if iter >= maxit,
% break(2)
break; % 跳出while
end
end
% Determine closest cluster for each point and reassign points to clusters
% 决定每一个点的最近分簇,重新分配点到各个簇
previdx = idx; % 大小为n*1
% totsumD 被初始化为无穷大,这里表示总距离
prevtotsumD = totsumD;
% 返回每一行中最小的元素,d的大小为n*1,nidx为最小元素在行中的位置,其大小为n*1,D为n*p
[d, nidx] = min(D, [], 2);
if iter == 0
% iter==0,表示第一次迭代
% Every point moved, every cluster will need an update
% 每一个点需要移动,每一个簇更新
moved = 1:n;
idx = nidx;
changed = 1:k;
else
% Determine which points moved 决定哪一个点移动
% 找到上一次和当前最小元素不同的位置
moved = find(nidx ~= previdx);
if length(moved) > 0
% Resolve ties in favor of not moving
% 重新分配而不是移动 括号中是一个逻辑运算 确定需要移动点的位置
moved = moved(D((previdx(moved)-1)*n + moved) > d(moved));
end
% 如果没有不同的,即当前的是最小元素,跳出循环,得到的元素已经是各行的最小值
if length(moved) == 0
% break(3)
break;
end
idx(moved) = nidx(moved);
% Find clusters that gained or lost members 找到获得的或者丢失的成员的分簇
% 得到idx(moved)和previdx(moved)中不重复出现的所有元素,并按升序排列
changed = unique([idx(moved); previdx(moved)])';
end
% Calculate the new cluster centroids and counts. 计算新的分簇中心和计数
% C(changed,:)表示新的聚类中心,m(changed)表示聚类标号在idx中出现的次数
% sort 列元素按升序排列,Xsort存放对的元素,Xord中存的是元素在原始矩阵中的列中对应的大小位置
[C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance, Xsort, Xord);
iter = iter + 1;
% Deal with clusters that have just lost all their members
% 处理丢失所有成员的分簇,empties表示丢失元素的聚类标号 不用考虑
empties = changed(m(changed) == 0);
if ~isempty(empties)
switch emptyact
case 'error'
% 默认值,一般情况下不会出现
error(sprintf('Empty cluster created at iteration %d.',iter));
case 'drop'
% Remove the empty cluster from any further processing
% 移走空的聚类
D(:,empties) = NaN;
changed = changed(m(changed) > 0);
if display > 0
warning(sprintf('Empty cluster created at iteration %d.',iter));
end
case 'singleton'
if display > 0
warning(sprintf('Empty cluster created at iteration %d.',iter));
end
for i = empties
% Find the point furthest away from its current cluster.
% Take that point out of its cluster and use it to create
% a new singleton cluster to replace the empty one.
% 找到离聚类中心最远距离的点,把该点从它的聚类中移走,用它来创建一个新的聚类,来代替空的聚类
% 得到列的最大元素(dlarge),以及该元素在列中的标号(lonely)
[dlarge, lonely] = max(d);
from = idx(lonely); % taking from this cluster 从当前聚类移走
C(i,:) = X(lonely,:);
m(i) = 1;
idx(lonely) = i;
d(lonely) = 0;
% Update clusters from which points are taken
% 更新那些点被移走的聚类
[C(from,:), m(from)] = gcentroids(X, idx, from, distance, Xsort, Xord);
changed = unique([changed from]);
end
end
end
end % phase one
% ------------------------------------------------------------------
% Initialize some cluster information prior to phase two
% 为第二阶段初始化一些先验聚类信息 针对特定的距离,默认的是欧氏距离
switch distance
case 'cityblock'
Xmid = zeros([k,p,2]);
for i = 1:k
if m(i) > 0
% Separate out sorted coords for points in i'th cluster,
% and save values above and below median, component-wise
% 分解出第i个聚类中挑选的点的坐标,保存它的上,下中位数
% reshape把矩阵分解为要求的行列数m*p
% sort 列元素按升序排列,Xord中存的是元素在原始矩阵中的列中对应的大小位置
Xsorted = reshape(Xsort(idx(Xord)==i), m(i), p);
% floor取比值小或者等于的最近的值
nn = floor(.5*m(i));
if mod(m(i),2) == 0
Xmid(i,:,1:2) = Xsorted([nn, nn+1],:)';
elseif m(i) > 1
Xmid(i,:,1:2) = Xsorted([nn, nn+2],:)';
else
Xmid(i,:,1:2) = Xsorted([1, 1],:)';
end
end
end
case 'hamming'
Xsum = zeros(k,p);
for i = 1:k
if m(i) > 0
% Sum coords for points in i'th cluster, component-wise
% 对基于分量对第i个聚类的坐标点求和
Xsum(i,:) = sum(X(idx==i,:), 1);
end
end
end
% ------------------------------------------------------------------
% Begin phase two: single reassignments 第二阶段:单独赋值
% m中保存的是每一个聚类的个数,元素和为n
% find(m' > 0)得到m'中大于0的元素的位置(索引)
% 实际情况(默认情况下)changed=1:k
changed = find(m' > 0);
lastmoved = 0;
nummoved = 0;
iter1 = iter;
while iter < maxit
% Calculate distances to each cluster from each point, and the
% potential change in total sum of errors for adding or removing
% each point from each cluster. Clusters that have not changed
% membership need not be updated.
% 计算从每一个点到每一个聚类的距离,潜在由于总距离发生变化移除或添加引起的错误。
% 那些成员没有改变的聚类不需要更新。
%
% Singleton clusters are a special case for the sum of dists
% calculation. Removing their only point is never best, so the
% reassignment criterion had better guarantee that a singleton
% point will stay in its own cluster. Happily, we get
% Del(i,idx(i)) == 0 automatically for them.
% 单独聚类在计算距离时是一个特殊情况,仅仅移除点不是最好的方法,因此重新赋值准则能够保证一个
% 单独的点能够留在它的聚类中,可喜的是,自动有Del(i,idx(i)) == 0。
switch distance
case 'sqeuclidean'
for i = changed
% idx存放的距离矩阵D中每一行的最小元素所处的列,也即位置
mbrs = (idx == i);
sgn = 1 - 2*mbrs; % -1 for members, 1 for nonmembers
% 表示只有一个聚类
if m(i) == 1
% prevent divide-by-zero for singleton mbrs 防止单个成员做0处理
sgn(mbrs) = 0;
end
Del(:,i) = (m(i) ./ (m(i) + sgn)) .* sum((X - C(repmat(i,n,1),:)).^2, 2);
end
case 'cityblock'
for i = changed
if mod(m(i),2) == 0 % this will never catch singleton clusters
ldist = Xmid(repmat(i,n,1),:,1) - X;
rdist = X - Xmid(repmat(i,n,1),:,2);
mbrs = (idx == i);
sgn = repmat(1-2*mbrs, 1, p); % -1 for members, 1 for nonmembers
Del(:,i) = sum(max(0, max(sgn.*rdist, sgn.*ldist)), 2);
else
Del(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2);
end
end
case {'cosine','correlation'}
% The points are normalized, centroids are not, so normalize them
normC(changed) = sqrt(sum(C(changed,:).^2, 2));
if any(normC 0
% Resolve ties in favor of not moving
% 重新分配而不是移动 确定移动的位置
moved = moved(Del((previdx(moved)-1)*n + moved) > minDel(moved));
end
if length(moved) == 0
% Count an iteration if phase 2 did nothing at all, or if we're
% in the middle of a pass through all the points
if (iter - iter1) == 0 | nummoved > 0
iter = iter + 1;
if display > 2 % 'iter'
disp(sprintf(dispfmt,iter,2,nummoved,totsumD));
end
end
converged = true;
break;
end
% Pick the next move in cyclic order
moved = mod(min(mod(moved - lastmoved - 1, n) + lastmoved), n) + 1;
% If we've gone once through all the points, that's an iteration
if moved 2 % 'iter'
disp(sprintf(dispfmt,iter,2,nummoved,totsumD));
end
if iter >= maxit, break; end
nummoved = 0;
end
nummoved = nummoved + 1;
lastmoved = moved;
oidx = idx(moved);
nidx = nidx(moved);
totsumD = totsumD + Del(moved,nidx) - Del(moved,oidx);
% Update the cluster index vector, and rhe old and new cluster
% counts and centroids
idx(moved) = nidx;
m(nidx) = m(nidx) + 1;
m(oidx) = m(oidx) - 1;
switch distance
case 'sqeuclidean'
C(nidx,:) = C(nidx,:) + (X(moved,:) - C(nidx,:)) / m(nidx);
C(oidx,:) = C(oidx,:) - (X(moved,:) - C(oidx,:)) / m(oidx);
case 'cityblock'
for i = [oidx nidx]
% Separate out sorted coords for points in each cluster.
% New centroid is the coord median, save values above and
% below median. All done component-wise.
Xsorted = reshape(Xsort(idx(Xord)==i), m(i), p);
nn = floor(.5*m(i));
if mod(m(i),2) == 0
C(i,:) = .5 * (Xsorted(nn,:) + Xsorted(nn+1,:));
Xmid(i,:,1:2) = Xsorted([nn, nn+1],:)';
else
C(i,:) = Xsorted(nn+1,:);
if m(i) > 1
Xmid(i,:,1:2) = Xsorted([nn, nn+2],:)';
else
Xmid(i,:,1:2) = Xsorted([1, 1],:)';
end
end
end
case {'cosine','correlation'}
C(nidx,:) = C(nidx,:) + (X(moved,:) - C(nidx,:)) / m(nidx);
C(oidx,:) = C(oidx,:) - (X(moved,:) - C(oidx,:)) / m(oidx);
case 'hamming'
% Update summed coords for points in each cluster. New
% centroid is the coord median. All done component-wise.
Xsum(nidx,:) = Xsum(nidx,:) + X(moved,:);
Xsum(oidx,:) = Xsum(oidx,:) - X(moved,:);
C(nidx,:) = .5*sign(2*Xsum(nidx,:) - m(nidx)) + .5;
C(oidx,:) = .5*sign(2*Xsum(oidx,:) - m(oidx)) + .5;
end
changed = sort([oidx nidx]);
end % phase two
% ------------------------------------------------------------------
if (~converged) & (display > 0)
warning(sprintf('Failed to converge in %d iterations.', maxit));
end
% Calculate cluster-wise sums of distances
nonempties = find(m(:)'>0);
D(:,nonempties) = distfun(X, C(nonempties,:), distance, iter);
d = D((idx-1)*n + (1:n)');
sumD = zeros(k,1);
for i = 1:k
sumD(i) = sum(d(idx == i));
end
if display > 1 % 'final' or 'iter'
disp(sprintf('%d iterations, total sum of distances = %g',iter,totsumD));
end
% Save the best solution so far
if totsumD 3
Dbest = D;
end
end
end
% Return the best solution