文件目录
data5.txt:K-means 使用的数据集
data2.txt:谱聚类使用的数据集
K-means 实现
算法步骤:
step1:初始化聚类中心
step2:计算每个样本点与聚类中心的距离,将其归为最近的一类
step3:计算聚类之后的每个类别的均值作为新的类别中心
step4:迭代进行 step2,step3,直到类别中心不再发生变化为止
step5:计算准确率
代码
import numpy as np
import matplotlib.pyplot as plt
def draw(data, c):
    """Scatter-plot the first two columns of ``data``, one colour per class.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array. Columns 0 and 1 are used as the x/y coordinates and the
        last column is read as the class label (labels ``1..c`` are drawn).
    c : int
        Number of classes to draw.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    color = ['r', 'b', 'k', 'g', 'm']
    # Read labels from the last column, the same convention kmeans() and
    # dis() use, instead of a hard-coded column index.
    labels = data[:, -1]
    for i in range(c):
        mask = labels == i + 1
        # Cycle through the palette so that more than five classes do not
        # raise IndexError.
        plt.scatter(data[mask, 0], data[mask, 1], c=color[i % len(color)])
    plt.show()
def distance(x1, x2):
    """Return the Euclidean distance between ``x1`` and ``x2``.

    Both arguments may be NumPy arrays or plain sequences (lists, tuples) of
    numbers with broadcast-compatible shapes. Every element of the difference
    contributes to the sum of squares, so the result is always a scalar.
    """
    # np.asarray lets callers pass lists/tuples, for which ``x2 - x1`` would
    # raise TypeError.
    delta = np.asarray(x2) - np.asarray(x1)
    return np.sqrt(np.sum(np.power(delta, 2)))
def kmeans(data, c):
    """Cluster the rows of ``data`` into ``c`` groups with Lloyd's k-means.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array of shape ``(row, clm)``. The first ``clm - 1`` columns are
        the features; the last column is a label slot that is ignored on
        input and overwritten in the returned copy.
    c : int
        Number of clusters (must be at least 1).

    Returns
    -------
    u : numpy.ndarray
        ``(c, clm - 1)`` float array of cluster centres.
    Y : numpy.ndarray
        Copy of ``data`` whose last column holds the assigned cluster,
        numbered ``1..c``. ``data`` itself is not modified.

    Raises
    ------
    ValueError
        If ``c`` is smaller than 1.
    """
    if c < 1:
        raise ValueError("c must be a positive integer")

    data = np.asarray(data)
    row, clm = data.shape
    # Work in float so that centres of integer-valued data are not truncated.
    features = data[:, :clm - 1].astype(float)
    Y = data.copy()

    # Deterministic initialisation: take c evenly spaced rows as the centres.
    u = features[[i * (row // c) for i in range(c)]]

    while True:
        # Recompute every point-to-centre distance on each pass. Squared
        # distances are enough for picking the nearest centre; argmin keeps
        # the lowest-numbered centre on ties.
        diff = features[:, np.newaxis, :] - u[np.newaxis, :, :]
        labels = np.argmin(np.sum(diff ** 2, axis=2), axis=1)
        Y[:, -1] = labels + 1

        u_ = u.copy()
        for j in range(c):
            members = features[labels == j]
            # An empty cluster keeps its previous centre; taking the mean of
            # zero rows would produce NaN and the loop would never converge.
            if members.shape[0] > 0:
                u[j] = members.mean(axis=0)

        # Converged once an update leaves every centre unchanged.
        if np.array_equal(u, u_):
            break
    return u, Y
def acc(data, Y, c):
    """Score a clustering against a block-ordered ground truth.

    The rows are assumed to be grouped by true class in ``c`` consecutive
    blocks of ``int(rows / c)`` rows each; only the row count of ``data`` is
    used. For every block the most frequent cluster label (``1..c``) found in
    the last column of ``Y`` is taken as that block's cluster, and the sizes
    of those majorities are summed and divided by the total number of rows.

    Several blocks may pick the same cluster, and rows beyond
    ``c * int(rows / c)`` count in the denominator only.
    """
    total = data.shape[0]
    block = int(total / c)
    assigned = Y[:, -1]

    # votes[t, p] = rows of true block t that were put into cluster p + 1.
    votes = np.zeros([c, c])
    for cls in range(c):
        segment = assigned[block * cls:block * (cls + 1)]
        votes[cls] = [np.sum(segment == label) for label in range(1, c + 1)]

    return votes.max(axis=1).sum() / total
def dis(data, u, c):
    """Distance from each estimated centre to the nearest true class mean.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the true class label (``1..c``)
        and whose remaining columns are the features.
    u : numpy.ndarray
        ``(c, n_features)`` array of estimated cluster centres.
    c : int
        Number of classes.

    Returns
    -------
    numpy.ndarray
        Length-``c`` array; entry ``i`` is the Euclidean distance from
        ``u[i]`` to the closest per-class feature mean. A class with no rows
        has a NaN mean (NumPy's mean of an empty selection).
    """
    u = np.asarray(u)
    # Use every column except the label, matching kmeans(), instead of
    # assuming exactly two feature columns.
    features = data[:, :-1]
    labels = data[:, -1]

    u_ = np.zeros(u.shape)
    for j in range(c):
        u_[j] = np.mean(features[labels == j + 1], axis=0)

    # Pairwise distances: num[i, j] = ||u[i] - u_[j]||.
    diff = u[:c, np.newaxis, :] - u_[np.newaxis, :c, :]
    num = np.sqrt(np.sum(diff ** 2, axis=2))
    return np.min(num, axis=1)
if __name__ == '__main__':
    # Demo: cluster the samples in data5.txt into five groups, plot the
    # labelled input and the clustering result, then report the centres,
    # their distance to the per-class means, and the accuracy score.
    n_clusters = 5
    samples = np.loadtxt("data5.txt")
    draw(samples, n_clusters)

    centers, clustered = kmeans(samples, n_clusters)
    draw(clustered, n_clusters)

    accuracy = acc(samples, clustered, n_clusters)
    print("类别中心:", centers)
    print('均值距离:', dis(samples, centers, n_clusters))
    print("精度:", accuracy)
测试
谱聚类
原理
代码
import numpy as np
import matplotlib.pyplot as plt
import Kmeans
def draw1(data, Y, c):
    """Scatter-plot ``data`` coloured by the cluster labels stored in ``Y``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; columns 0 and 1 are used as the x/y coordinates.
    Y : numpy.ndarray
        2-D array with one row per row of ``data``; its last column holds
        the cluster label (labels ``1..c`` are drawn).
    c : int
        Number of clusters to draw.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    color = ['r', 'b', 'k', 'g', 'm']
    labels = Y[:, -1]
    for i in range(c):
        mask = labels == i + 1
        # Cycle through the palette so that more than five clusters do not
        # raise IndexError.
        plt.scatter(data[mask, 0], data[mask, 1], c=color[i % len(color)])
    plt.show()
def graph(data, k, sigma, c):
    """Spectral clustering on a k-nearest-neighbour Gaussian similarity graph.

    Parameters
    ----------
    data : numpy.ndarray
        One sample per row. Every column takes part in the similarity
        computation, so pass features only.
    k : int
        Number of most-similar neighbours kept per sample.
    sigma : float
        Bandwidth of the Gaussian kernel ``exp(-||xi - xj||^2 / (2 sigma^2))``.
    c : int
        Number of eigenvectors kept and number of clusters requested from
        ``Kmeans.kmeans``.

    Returns
    -------
    u, Y
        Whatever ``Kmeans.kmeans`` returns for the spectral embedding: the
        embedding (first ``c`` eigenvector columns) with one extra zero
        column appended as the label slot.
    """
    points = np.asarray(data, dtype=float)
    if points.ndim == 1:
        points = points[:, np.newaxis]
    n = points.shape[0]

    # Dense Gaussian similarity matrix with self-similarity removed.
    diff = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    g = np.exp(-np.sum(diff ** 2, axis=2) / (2 * (sigma ** 2)))
    np.fill_diagonal(g, 0)

    # Keep each sample's k strongest edges, mirrored so the graph is
    # symmetric. Capping k at n - 1 and writing weights straight from g
    # means an exhausted row can no longer overwrite a real edge with 0.
    # The stable sort keeps the lowest index first among equal weights.
    k_eff = max(0, min(k, n - 1))
    order = np.argsort(-g, axis=1, kind='stable')[:, :k_eff]
    rows = np.repeat(np.arange(n), k_eff)
    cols = order.ravel()
    g_k = np.zeros(g.shape)
    g_k[rows, cols] = g[rows, cols]
    g_k[cols, rows] = g[rows, cols]

    # Symmetric normalised Laplacian D^-1/2 (D - W) D^-1/2. Isolated nodes
    # (degree 0) get a zero scaling factor instead of a division by zero.
    d_k = np.sum(g_k, axis=1)
    L = np.diag(d_k) - g_k
    inv_sqrt = np.zeros(n)
    connected = d_k > 0
    inv_sqrt[connected] = 1.0 / np.sqrt(d_k[connected])
    L = inv_sqrt[:, np.newaxis] * L * inv_sqrt[np.newaxis, :]

    # L is symmetric, so eigh applies: it returns real eigenvalues in
    # ascending order, whereas eig may return complex, unordered output.
    A, B = np.linalg.eigh(L)
    feature = B[:, :c].copy()

    # kmeans() treats the last column as a label slot, so append a dummy one.
    data_f = np.c_[feature, np.zeros(n)]
    # Ask for c clusters rather than a hard-coded 2.
    u, Y = Kmeans.kmeans(data_f, c)
    return u, Y
if __name__ == '__main__':
    # Experiment: spectral clustering of data2.txt, plotting accuracy first
    # against the neighbour count K and then against the kernel width sigma.
    points = np.loadtxt("data2.txt")

    # Ground truth: rows 0-99 are class 1 and rows 100-199 are class 2.
    c = np.zeros(points.shape[0])
    c[0:100] = 1
    c[100:200] = 2
    data = np.c_[points, c]
    Kmeans.draw(data, 2)

    # Accuracy as a function of K with sigma fixed at 2.
    ac = []
    for k in range(1, 10):
        # Build the graph from the raw coordinates only; passing the labelled
        # array would let the ground-truth column influence the similarities.
        u, Y = graph(points, k, 2, 2)
        ac.append(Kmeans.acc(data, Y, 2))
    plt.xlabel("K")  # x-axis label
    plt.ylabel("ACC")  # y-axis label
    plt.title('sigma=2')
    plt.plot(range(1, 10), ac)
    plt.show()

    # Accuracy as a function of sigma with K fixed at 4.
    ac1 = []
    sigmas = np.arange(0.5, 4, 0.5)
    for sigma in sigmas:
        print(sigma)
        u, Y = graph(points, 4, sigma, 2)
        ac1.append(Kmeans.acc(data, Y, 2))
    plt.xlabel("sigma")  # x-axis label
    plt.ylabel("ACC")  # y-axis label
    plt.title('K=4')
    plt.plot(sigmas, ac1)
    plt.show()