以下代码以最简单的二维向量演示了k-means的处理流程
有一堆二维向量<x1,y1>,<x2,y2>..<xn,yn>,聚成两个类
聚类中心就是类中所有向量的均值<xm,ym>, xm = (x1+x2+..+xn)/2, ym = (y1+y2+..+yn)/2
每次聚类都重新计算各个向量和聚类中心的距离(用欧氏距离),将各个向量分配到离各自最近的类
然后比较本次的聚类中心是否与上次相等,如果相等,即表示所有类已经平衡,聚类结束.
![](https://i-blog.csdnimg.cn/blog_migrate/8f900a89c6347c561fdf2122f13be562.gif)
![ExpandedBlockStart.gif](https://i-blog.csdnimg.cn/blog_migrate/961ddebeb323a10fe0623af514929fc1.gif)
using
System;
using System.Collections.Generic;
using System.Text;
using System.Drawing;
namespace Clustering.Demo
{
/// <summary>
/// 利用点坐标(二维向量)来演示k-means算法
/// </summary>
class Kmeans
{
/// <summary>
/// 点的坐标数组(coordinates)
/// </summary>
private static readonly int [,] coords = new int [,]
{
{ 0 , 0 }, { 1 , 0 }, { 0 , 1 }, { 1 , 1 }, { 2 , 1 }, { 1 , 2 }, { 2 , 2 }, { 3 , 2 }, { 6 , 6 }, { 7 , 6 },
{ 8 , 6 }, { 6 , 7 }, { 7 , 7 }, { 8 , 7 }, { 9 , 7 }, { 7 , 8 }, { 8 , 8 }, { 9 , 8 }, { 8 , 9 }, { 9 , 9 },
// { 12, 12 }, { 12, 13 }, { 13, 14 }, { 14, 14 }, { 14, 15 }, { 15, 16 }, { 16, 14 }, { 17, 15 }, { 16, 18 }, { 16, 17 },
};
/// <summary>
/// 聚类的数量
/// </summary>
private static readonly int k = 2 ;
/// <summary>
///
/// </summary>
public static void Cluster()
{
Point[] points = GetPoints(); // 获取原始数据点
PointF[] means = new PointF[k]; // 当前的聚类中心
PointF[] prevMeans = new PointF[k]; // 之前的聚类中心
int [] pointAssigns = new int [points.Length]; // 每个点所属的类
InitMeans(points, k, means); // 随机选k个点作为聚类中心
PrintInit(points, k, means); // 打印初始数据
int iter = 0 ; // 迭代次数(iteration times)
while ( true )
{
Classify(points, k, means, pointAssigns); // 将每个点与各个聚类中心对比进行归类划分
int conv = 0 ; // 收敛类数(convergent clusters)
for ( int j = 0 ; j < k; j ++ )
{
means[j] = CalcMeans(points, pointAssigns, j); // 重新计算聚类中心
if (Compare(means[j], prevMeans[j]))
conv ++ ;
else
prevMeans[j] = means[j];
}
if (conv == k) // 新旧聚类中心一样表示该聚类已收敛至平衡
{
break ;
}
iter ++ ;
PrintIter(points, k, means, pointAssigns, iter); // 打印迭代数据
}
}
/// <summary>
/// 初始化点数组(根据自定义坐标)
/// </summary>
private static Point[] GetPoints()
{
int len = coords.GetLength( 0 );
Point[] points = new Point[len];
for ( int i = 0 ; i < len; i ++ )
{
points[i] = new Point(coords[i, 0 ], coords[i, 1 ]);
}
return points;
}
/// <summary>
/// 随机选k个点作为聚类中心
/// </summary>
private static void InitMeans(Point[] points, int k, PointF[] means)
{
Random random = new Random();
for ( int i = 0 ; i < k; i ++ )
{
means[i] = points[random.Next(points.Length)];
}
}
/// <summary>
/// 将每个点与各个聚类中心对比进行归类划分
/// </summary>
private static void Classify(Point[] points, int k, PointF[] means, int [] pointAssigns)
{
for ( int i = 0 ; i < points.Length; i ++ )
{
double minDist = double .MaxValue; // 最短距离
for ( int j = 0 ; j < k; j ++ )
{
double dist = Distance(points[i], means[j]);
if (dist < minDist)
{
minDist = dist;
pointAssigns[i] = j;
}
}
// Console.WriteLine("{0}归入类{1}", points[i], pointAssigns[i]);
}
}
#region 欧氏距离(Euclidean distance)
/// <summary>
/// 计算欧氏距离(勾股定理的多维空间扩展)
/// </summary>
/// <remarks>
/// 勾股定理: x^2 + y^2 = z^2
/// 欧氏距离: distance = sqrt(x1^2 + x2^2 + ... + xn^2)
/// 欧氏距离的变换式: x1^2 + x2^2 + ... + xn^2 = distance^2
///
/// 二维空间的两点p1(x1,y1)和p(x2,y2),两点间距离dist=sqrt((x1-x2)^2+(y1-y2)^2)
/// </remarks>
private static double Distance(PointF p1, PointF p2)
{
double pow2X = Math.Pow(p1.X - p2.X, 2 );
double pow2Y = Math.Pow(p1.Y - p2.Y, 2 );
return Math.Sqrt(pow2X + pow2Y);
}
#endregion
/// <summary>
/// 计算新的聚类中心(均值)
/// </summary>
/// <remarks>
/// meanX = (x1 + x2 + ... + xn) / n
/// meanY = (y1 + y2 + ... + yn) / n
/// </remarks>
private static PointF CalcMeans(Point[] points, int [] pointAssigns, int j)
{
PointF mean = new PointF();
int n = 0 ;
for ( int i = 0 ; i < points.Length; i ++ )
{
if (pointAssigns[i] == j)
{
mean.X += points[i].X;
mean.Y += points[i].Y;
n ++ ;
}
}
mean.X /= ( float )n;
mean.Y /= ( float )n;
return mean;
}
/// <summary>
/// 比较两个聚类中心是否相等
/// </summary>
private static bool Compare(PointF a, PointF b)
{
if ((( int )(a.X * 10 ) == ( int )(b.X * 10 )) && (( int )(a.Y * 10 ) == ( int )(b.Y * 10 )))
{
return true ;
}
else
{
return false ;
}
}
#region 打印数据(print datas)
/// <summary>
/// 打印初始数据
/// </summary>
private static void PrintInit(Point[] points, int k, PointF[] means)
{
Console.WriteLine( " 总共{0}个样本: " , points.Length);
for ( int i = 0 ; i < points.Length; i ++ )
{
Console.WriteLine( " {0}, {1} " , points[i].X, points[i].Y);
}
Console.WriteLine( " \n初始化时随机选取k个样本作为聚类中心: " );
for ( int i = 0 ; i < k; i ++ )
{
Console.WriteLine( " 聚类{0}的中心: {1}, {2} " , i, means[i].X, means[i].Y);
}
}
/// <summary>
/// 打印迭代数据
/// </summary>
private static void PrintIter(Point[] points, int k, PointF[] means, int [] pointAssigns, int iter)
{
Console.WriteLine( " \n\n--------第{0}次迭代的结果-------- " , iter);
for ( int j = 0 ; j < k; j ++ )
{
Console.WriteLine( " \n第{0}个类的成员: " , j);
for ( int i = 0 ; i < points.Length; i ++ )
{
if (pointAssigns[i] == j)
{
Console.WriteLine( " {0}, {1} " , points[i].X, points[i].Y);
}
}
Console.WriteLine( " 均值(聚类中心): {0}, {1} " , means[j].X, means[j].Y);
}
}
#endregion
}
}
using System.Collections.Generic;
using System.Text;
using System.Drawing;
namespace Clustering.Demo
{
/// <summary>
/// 利用点坐标(二维向量)来演示k-means算法
/// </summary>
class Kmeans
{
/// <summary>
/// 点的坐标数组(coordinates)
/// </summary>
private static readonly int [,] coords = new int [,]
{
{ 0 , 0 }, { 1 , 0 }, { 0 , 1 }, { 1 , 1 }, { 2 , 1 }, { 1 , 2 }, { 2 , 2 }, { 3 , 2 }, { 6 , 6 }, { 7 , 6 },
{ 8 , 6 }, { 6 , 7 }, { 7 , 7 }, { 8 , 7 }, { 9 , 7 }, { 7 , 8 }, { 8 , 8 }, { 9 , 8 }, { 8 , 9 }, { 9 , 9 },
// { 12, 12 }, { 12, 13 }, { 13, 14 }, { 14, 14 }, { 14, 15 }, { 15, 16 }, { 16, 14 }, { 17, 15 }, { 16, 18 }, { 16, 17 },
};
/// <summary>
/// 聚类的数量
/// </summary>
private static readonly int k = 2 ;
/// <summary>
///
/// </summary>
public static void Cluster()
{
Point[] points = GetPoints(); // 获取原始数据点
PointF[] means = new PointF[k]; // 当前的聚类中心
PointF[] prevMeans = new PointF[k]; // 之前的聚类中心
int [] pointAssigns = new int [points.Length]; // 每个点所属的类
InitMeans(points, k, means); // 随机选k个点作为聚类中心
PrintInit(points, k, means); // 打印初始数据
int iter = 0 ; // 迭代次数(iteration times)
while ( true )
{
Classify(points, k, means, pointAssigns); // 将每个点与各个聚类中心对比进行归类划分
int conv = 0 ; // 收敛类数(convergent clusters)
for ( int j = 0 ; j < k; j ++ )
{
means[j] = CalcMeans(points, pointAssigns, j); // 重新计算聚类中心
if (Compare(means[j], prevMeans[j]))
conv ++ ;
else
prevMeans[j] = means[j];
}
if (conv == k) // 新旧聚类中心一样表示该聚类已收敛至平衡
{
break ;
}
iter ++ ;
PrintIter(points, k, means, pointAssigns, iter); // 打印迭代数据
}
}
/// <summary>
/// 初始化点数组(根据自定义坐标)
/// </summary>
private static Point[] GetPoints()
{
int len = coords.GetLength( 0 );
Point[] points = new Point[len];
for ( int i = 0 ; i < len; i ++ )
{
points[i] = new Point(coords[i, 0 ], coords[i, 1 ]);
}
return points;
}
/// <summary>
/// 随机选k个点作为聚类中心
/// </summary>
private static void InitMeans(Point[] points, int k, PointF[] means)
{
Random random = new Random();
for ( int i = 0 ; i < k; i ++ )
{
means[i] = points[random.Next(points.Length)];
}
}
/// <summary>
/// 将每个点与各个聚类中心对比进行归类划分
/// </summary>
private static void Classify(Point[] points, int k, PointF[] means, int [] pointAssigns)
{
for ( int i = 0 ; i < points.Length; i ++ )
{
double minDist = double .MaxValue; // 最短距离
for ( int j = 0 ; j < k; j ++ )
{
double dist = Distance(points[i], means[j]);
if (dist < minDist)
{
minDist = dist;
pointAssigns[i] = j;
}
}
// Console.WriteLine("{0}归入类{1}", points[i], pointAssigns[i]);
}
}
#region 欧氏距离(Euclidean distance)
/// <summary>
/// 计算欧氏距离(勾股定理的多维空间扩展)
/// </summary>
/// <remarks>
/// 勾股定理: x^2 + y^2 = z^2
/// 欧氏距离: distance = sqrt(x1^2 + x2^2 + ... + xn^2)
/// 欧氏距离的变换式: x1^2 + x2^2 + ... + xn^2 = distance^2
///
/// 二维空间的两点p1(x1,y1)和p(x2,y2),两点间距离dist=sqrt((x1-x2)^2+(y1-y2)^2)
/// </remarks>
private static double Distance(PointF p1, PointF p2)
{
double pow2X = Math.Pow(p1.X - p2.X, 2 );
double pow2Y = Math.Pow(p1.Y - p2.Y, 2 );
return Math.Sqrt(pow2X + pow2Y);
}
#endregion
/// <summary>
/// 计算新的聚类中心(均值)
/// </summary>
/// <remarks>
/// meanX = (x1 + x2 + ... + xn) / n
/// meanY = (y1 + y2 + ... + yn) / n
/// </remarks>
private static PointF CalcMeans(Point[] points, int [] pointAssigns, int j)
{
PointF mean = new PointF();
int n = 0 ;
for ( int i = 0 ; i < points.Length; i ++ )
{
if (pointAssigns[i] == j)
{
mean.X += points[i].X;
mean.Y += points[i].Y;
n ++ ;
}
}
mean.X /= ( float )n;
mean.Y /= ( float )n;
return mean;
}
/// <summary>
/// 比较两个聚类中心是否相等
/// </summary>
private static bool Compare(PointF a, PointF b)
{
if ((( int )(a.X * 10 ) == ( int )(b.X * 10 )) && (( int )(a.Y * 10 ) == ( int )(b.Y * 10 )))
{
return true ;
}
else
{
return false ;
}
}
#region 打印数据(print datas)
/// <summary>
/// 打印初始数据
/// </summary>
private static void PrintInit(Point[] points, int k, PointF[] means)
{
Console.WriteLine( " 总共{0}个样本: " , points.Length);
for ( int i = 0 ; i < points.Length; i ++ )
{
Console.WriteLine( " {0}, {1} " , points[i].X, points[i].Y);
}
Console.WriteLine( " \n初始化时随机选取k个样本作为聚类中心: " );
for ( int i = 0 ; i < k; i ++ )
{
Console.WriteLine( " 聚类{0}的中心: {1}, {2} " , i, means[i].X, means[i].Y);
}
}
/// <summary>
/// 打印迭代数据
/// </summary>
private static void PrintIter(Point[] points, int k, PointF[] means, int [] pointAssigns, int iter)
{
Console.WriteLine( " \n\n--------第{0}次迭代的结果-------- " , iter);
for ( int j = 0 ; j < k; j ++ )
{
Console.WriteLine( " \n第{0}个类的成员: " , j);
for ( int i = 0 ; i < points.Length; i ++ )
{
if (pointAssigns[i] == j)
{
Console.WriteLine( " {0}, {1} " , points[i].X, points[i].Y);
}
}
Console.WriteLine( " 均值(聚类中心): {0}, {1} " , means[j].X, means[j].Y);
}
}
#endregion
}
}