kmeans的算法细节就不再细讲了,比较简单。
第一个应用是对波形数据进行分类:总共有三种波,每条数据有21个特征用于分类,总共有5000条数据。
import numpy as np
import matplotlib.pyplot as plt
# Load the waveform data set. Each non-empty line is a comma-separated record
# whose last field is the class label and whose other fields are features.
# The context manager closes the file even if reading fails.
with open('data/waveform.csv') as f:
    context = f.readlines()

# Parse every non-blank line; a trailing empty line at the end of the file is
# skipped instead of breaking the numeric conversion.
rows = [line.strip().split(',') for line in context if line.strip()]
table = np.array(rows, dtype=float)

# Separate the feature columns from the label column. The array sizes follow
# the file contents rather than a hard-coded (5000, 21) shape.
x = table[:, :-1]
y = table[:, -1]

# Quick visual check: feature column 1 against feature column 3.
plt.scatter(x[:, 1], x[:, 3])
plt.show()
print(x, '\n')
print(y)
这段代码将数据列与标签列进行分离
def kmeans(data, k=3):
    """Cluster the rows of ``data`` into ``k`` groups with k-means.

    Centroids are initialised at random inside the bounding box of the data
    and refined until they stop changing between two iterations.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        k: Number of clusters; must be at least 1.

    Returns:
        A tuple ``(centroids, label, sse)``: ``centroids`` is a
        ``(k, n_features)`` float array, ``label`` is an integer array of
        length ``n_samples`` with the index of the nearest centroid of each
        row, and ``sse`` is the sum of squared Euclidean distances from each
        row to its assigned centroid.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    def _squared_distances(points, centers):
        """Return an (n_points, n_centers) array of squared Euclidean distances."""
        result = np.empty((points.shape[0], centers.shape[0]))
        for j, center in enumerate(centers):
            result[:, j] = np.sum((points - center) ** 2, axis=1)
        return result

    def _rand_center(data, k):
        """Generate k centers within the range of the data set."""
        n = data.shape[1]  # number of features
        centroids = np.zeros((k, n))
        for i in range(n):
            dmin, dmax = np.min(data[:, i]), np.max(data[:, i])
            centroids[:, i] = dmin + (dmax - dmin) * np.random.rand(k)
        return centroids

    if k < 1:
        raise ValueError("k must be at least 1")

    data = np.asarray(data, dtype=float)
    n = data.shape[0]  # number of entries
    centroids = _rand_center(data, k)
    # Plain ``int`` replaces the ``np.int`` alias that newer NumPy removed.
    label = np.zeros(n, dtype=int)  # index of the nearest centroid per row
    assement = np.zeros(n)          # squared distance to that centroid
    converged = False
    while not converged:
        old_centroids = np.copy(centroids)

        # Assignment step: argmin returns the first minimum, i.e. the lowest
        # centroid index wins a tie, like a strict ``<`` scan over j.
        sq_dist = _squared_distances(data, centroids)
        label = np.argmin(sq_dist, axis=1)
        assement = sq_dist[np.arange(n), label]

        # Update step: move every centroid to the mean of its members.
        for m in range(k):
            members = data[label == m]
            # A cluster without members keeps its previous centroid; taking
            # the mean of an empty slice would produce NaN, and NaN centroids
            # never compare equal, so the loop could not terminate.
            if len(members):
                centroids[m] = np.mean(members, axis=0)

        # Converged when no centroid moved during this iteration.
        converged = np.array_equal(old_centroids, centroids)
    return centroids, label, np.sum(assement)
# Placeholders for the best clustering found: the score starts at +infinity so
# that any finite score compares lower, and no centroids or labels are stored
# yet.
# NOTE(review): the code that is expected to fill these in (presumably a loop
# of repeated kmeans runs) is not visible in this file - confirm.
best_assement, best_centroids, best_label = np.inf, None, None
这段代码是别人写好的kmeans算法,我直接拿过来用了
# Split the samples by the cluster index stored in best_label.
# NOTE(review): `data` is not assigned anywhere in the visible part of this
# file - presumably it is the feature matrix passed to kmeans; confirm.
data0 = data[best_label == 0]
data1 = data[best_label == 1]
data2 = data[best_label == 2]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: all samples in one colour, using the first two columns.
ax1.scatter(data[:, 0], data[:, 1], c='c', s=30, marker='o')

# Right panel: the same two columns, one colour per cluster.
ax2.scatter(data0[:, 0], data0[:, 1], c='r')
ax2.scatter(data1[:, 0], data1[:, 1], c='c')
ax2.scatter(data2[:, 0], data2[:, 1], c='b')

# Draw the centroids that belong to best_label (best_centroids) so the markers
# match the colouring above, rather than a separate `centroids` variable.
ax2.scatter(best_centroids[:, 0], best_centroids[:, 1], c='b', s=120, marker='o')
plt.show()
这段代码对数据可视化
# Feature rows that fell into each cluster.
x0 = x[best_label == 0]
x1 = x[best_label == 1]
x2 = x[best_label == 2]

# Number of samples of every true class in the whole data set. Counting on the
# array removes the hard-coded 5000-row loop.
n0 = int(np.count_nonzero(y == 0))
n1 = int(np.count_nonzero(y == 1))
n2 = int(np.count_nonzero(y == 2))

# True labels of the samples inside each cluster.
y0 = y[best_label == 0]
y1 = y[best_label == 1]
y2 = y[best_label == 2]

# Samples whose true class number equals the index of their cluster.
m0 = int(np.count_nonzero(y0 == 0))
m1 = int(np.count_nonzero(y1 == 1))
m2 = int(np.count_nonzero(y2 == 2))

# Fraction of each true class that landed in the cluster with the same index.
# NOTE(review): k-means numbers its clusters arbitrarily, so cluster i need not
# correspond to class i; these ratios are only an accuracy under that
# assumption.
print(m0 / n0)
print(m1 / n1)
print(m2 / n2)
因为给定了标签,所以可以统计正确率
由于kmeans的结果和初始点的选择有关系,所以我们运行10次,然后选取最好的实验结果保存。
实验结果显示,三类的正确率分别为50%,20%,30%。