K-means 是一种常用的聚类算法,旨在将数据集划分为 K 个互不重叠的子集(簇),使得每个数据点都归属于距其最近的簇中心所对应的簇。
算法具体步骤如下:
1. 初始化:随机选择 K 个数据点作为初始的簇中心(centroid)。
2. 分配数据点到最近的簇:对于每个数据点,计算其到 K 个簇中心的距离,将该数据点分配给距离最近的簇。
3. 更新簇中心:对于每个簇,计算该簇内所有数据点的平均值,并将该平均值作为新的簇中心。
4. 迭代:重复步骤 2 和 3,直到簇中心不再变化或变化很小,或者达到预先设定的迭代次数。
5. 结束:簇中心稳定或者达到最大迭代次数时,算法结束,得到最终的簇分配结果。
Python代码(Iris数据集)
import pandas as pd
import numpy as np
import random
# Load the data set. `.values` already yields a NumPy array, so no extra
# np.array() copy is needed.
# Assumes each row is laid out as (id, feature columns..., class label), which
# is the layout the C++ reader further down in this file parses - TODO confirm
# against the actual Iris.csv.
data = pd.read_csv('./Iris.csv').values
# Split the columns: sample ids (column 0), numeric features, class labels.
numbers = data[:, 0]
# The mixed-type CSV gives an object-dtype array; convert the features to float
# so that the distance and mean computations run on real numeric arrays.
characters = data[:, 1:-1].astype(float)
labels = data[:, -1]
# len_y = number of samples, len_x = number of features per sample.
len_y, len_x = characters.shape
def Distance(characters, len_x, len_y, centers, k):
    """Return the Euclidean distance from every sample to every centre.

    Parameters
    ----------
    characters : array-like, shape (n_samples, n_features)
        Feature vectors. Object-dtype input is converted to float.
    len_x, len_y : int
        Unused; kept so that existing callers keep working.
    centers : array-like, shape (k, n_features)
        Current cluster centres (an ndarray or a list of lists).
    k : int
        Unused; the number of centres is taken from ``centers`` itself.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_centres)
        ``result[i, j]`` is the distance between sample ``i`` and centre ``j``.
    """
    points = np.asarray(characters, dtype=float)
    centroids = np.atleast_2d(np.asarray(centers, dtype=float))
    if points.size == 0:
        # No samples: return an empty table that still has one column per centre.
        return np.empty((0, centroids.shape[0]))
    # Broadcasting (n, 1, d) - (1, k, d) yields all pairwise differences at once,
    # replacing the per-sample np.tile loop.
    diff = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.sqrt(np.sum(diff ** 2, axis=2))
def Center(characters, centers, len_x, len_y, k):
    """Perform one centre-update step and return the new centres.

    Every sample is assigned to its nearest current centre; each centre is then
    replaced by the mean of the samples assigned to it.

    Parameters
    ----------
    characters : array-like, shape (n_samples, n_features)
        Feature vectors.
    centers : array-like, shape (k, n_features)
        Current cluster centres.
    len_x, len_y, k : int
        Forwarded unchanged to ``Distance``.

    Returns
    -------
    numpy.ndarray, shape (k, n_features)
        Updated centres, in the same row order as ``centers``.
    """
    distance = Distance(characters, len_x, len_y, centers, k)
    min_mark = np.argmin(np.asarray(distance, dtype=float), axis=1)
    points = np.asarray(characters, dtype=float)
    # Start from a copy of the old centres: a cluster that received no sample
    # keeps its previous centre instead of disappearing from the result (a
    # group-by mean would return fewer than k rows in that case).
    new_centers = np.array(centers, dtype=float)
    for cluster in range(new_centers.shape[0]):
        members = points[min_mark == cluster]
        if len(members) > 0:
            new_centers[cluster] = members.mean(axis=0)
    return new_centers
def K_means(numbers, characters, labels, k, len_x, len_y, epochs):
    """Cluster ``characters`` into ``k`` groups with the K-means algorithm.

    Parameters
    ----------
    numbers, labels : array-like
        Unused; kept so that existing callers keep working.
    characters : array-like, shape (n_samples, n_features)
        Feature vectors to cluster.
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    len_x : int
        Number of features per sample.
    len_y : int
        Number of samples (forwarded to the helpers).
    epochs : int
        Maximum number of update iterations.

    Returns
    -------
    centers : list of list of float
        Final centres sorted lexicographically (a ``numpy.ndarray`` holding the
        unsorted initial centres when ``epochs`` is 0).
    min_mark : numpy.ndarray
        Index into ``centers`` of the nearest centre for every sample.
    distance : numpy.ndarray, shape (n_samples, k)
        Distance from every sample to every final centre.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of samples.
    """
    n_samples = len(characters)
    if k < 1 or k > n_samples:
        raise ValueError('k must be between 1 and the number of samples')

    # Seed one centre from each of k equally sized, contiguous slices of the
    # data. For k == 3 and 150 rows these are exactly the ranges 0-49, 50-99
    # and 100-149 that used to be hard-coded.
    centers = np.zeros([k, len_x])
    for i in range(k):
        low = i * n_samples // k
        high = (i + 1) * n_samples // k - 1  # random.randint is inclusive
        centers[i] = characters[random.randint(low, high)]

    for _ in range(epochs):
        updated = sorted(Center(characters, centers, len_x, len_y, k).tolist())
        # Once an update leaves the centres exactly unchanged, every further
        # iteration would reproduce them, so stopping early changes nothing.
        converged = np.array_equal(np.asarray(centers), np.asarray(updated))
        centers = updated
        if converged:
            break

    distance = Distance(characters, len_x, len_y, centers, k)
    min_mark = np.argmin(distance, axis=1)
    return centers, min_mark, distance
# Clustering configuration: number of clusters and the iteration budget.
k = 3
epochs = 500

# Run the clustering on the features loaded above.
centers, min_mark, distance = K_means(numbers, characters, labels, k, len_x, len_y, epochs)

# Print each cluster centre.
for i in range(k):
    print(f'第{i+1}个族群的中心为{centers[i]}')

# Print each sample with its 1-based cluster number and the distance to that
# cluster's centre.
for i in range(len_y):
    print(f'第{i+1}组数据为{data[i]},属于族群{min_mark[i]+1},距离为{distance[i,min_mark[i]]}\n')
C++代码(Iris数据集)
#include <math.h>
#include <stdlib.h>

#include <algorithm>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
// One row of the data set: an id, four numeric feature values and the class
// name. Every member has a default so that a default-constructed Flower never
// holds indeterminate values.
struct Flower {
    int id = 0;
    double character1 = 0.0;
    double character2 = 0.0;
    double character3 = 0.0;
    double character4 = 0.0;
    std::string labels;  // class name as read from the file
    int label = 0;       // numeric class id; 0 until one is assigned
};
struct Distance{
struct Flower information;
int labels;
};
void GenerateCenters(int k, vector<struct Flower> FlowerVector, vector<struct Flower> &random_centers)
{
vector<struct Flower> tmp(FlowerVector);
int random_index;
int n = FlowerVector.size();
random_index = rand() % (50-0) + 0;
//cout << random_index << endl;
random_centers.push_back(FlowerVector.at(random_index));
random_index = rand() % (100-50) + 50;
random_centers.push_back(FlowerVector.at(random_index));
random_index = rand() % (150-100) + 100;
random_centers.push_back(FlowerVector.at(random_index));
}
double GetDistance(Flower p0, Flower p1)
{
return sqrt((p0.character1 - p1.character1)*(p0.character1 - p1.character1) + (p0.character2 - p1.character2)*(p0.character2 - p1.character2) + (p0.character3 - p1.character3)*(p0.character3 - p1.character3) + (p0.character4 - p1.character4)*(p0.character4 - p1.character4));
}
void DoKmeansCluster(vector<struct Flower> FlowerVector, vector<struct Flower> &random_centers, vector<struct Distance> &result)
{
vector<struct Flower> tmp(random_centers);
int point_num = FlowerVector.size();
int k = random_centers.size();
for (int p = 0; p < point_num; p++)
{
float distance = 9999;
Distance res;
for (int q = 0; q < k; q++)
{
float tmp_distance = GetDistance(FlowerVector[p], random_centers[q]);
if (tmp_distance < distance)
{
distance = tmp_distance;
res.labels = q;
res.information = FlowerVector[p];
}
}
result.push_back(res);
}
for (int i = 0; i < k; i++)
{
int count = 0;
double sum_1 = 0;
double sum_2 = 0;
double sum_3 = 0;
double sum_4 = 0;
for (int j = 0; j < point_num; j++)
{
if (result[j].labels == i)
{
count++;
sum_1 += result[j].information.character1;
sum_2 += result[j].information.character2;
sum_3 += result[j].information.character3;
sum_4 += result[j].information.character4;
}
}
random_centers[i].character1 = sum_1 / count;
random_centers[i].character2 = sum_2 / count;
random_centers[i].character3 = sum_3 / count;
random_centers[i].character4 = sum_4 / count;
//cout << "(" << random_centers[i].x << "," << random_centers[i].y << ")" << endl;
}
}
int main()
{
ifstream infile("Iris.csv", ios::in);
string line;
vector<struct Flower> FlowerVector;
getline(infile, line);
while (getline(infile, line))
{
stringstream ss(line);
string str;
Flower flower;
getline(ss, str, ',');
flower.id = stoi(str);
getline(ss, str, ',');
flower.character1 = stod(str);
getline(ss, str, ',');
flower.character2 = stod(str);
getline(ss, str, ',');
flower.character3 = stod(str);
getline(ss, str, ',');
flower.character4 = stod(str);
getline(ss, str, ',');
flower.labels = str;
FlowerVector.push_back(flower);
}
int x = FlowerVector.size();
for (int i = 0; i < x; i++)
{
if (FlowerVector[i].labels == "Iris-setosa")
FlowerVector[i].label = 1;
if (FlowerVector[i].labels == "Iris-versicolor")
FlowerVector[i].label = 2;
if (FlowerVector[i].labels == "Iris-virginica")
FlowerVector[i].label = 3;
}
int k = 3;
int epochs = 500;
vector<Flower> random_centers;
GenerateCenters(k, FlowerVector, random_centers);
vector<struct Distance> result;
for(int i=0;i<epochs;i++)
{
DoKmeansCluster(FlowerVector, random_centers, result);
}
for(int i=0;i<k;i++)
cout << "(" << random_centers[i].character1 << "," << random_centers[i].character2 << "," << random_centers[i].character3 << "," << random_centers[i].character4 << ")" << endl;
for(int i=0;i<150;i++)
cout<<i+1<<','<<result[i].labels+1<<endl;
}