# K-means高维聚类

Automatically determine the best cluster number K in k-means.
• Generate 1000 random N-Dimensional points;
• Try different K number;
• Compute SSE;
• Plot K-SSE figure;
• Choose the best K number (how to choose?).
• Try different N numbers: 2, 3, 5, 10.
Write the code in a Jupyter Notebook and provide screenshots of the code and results.

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
N=2                                                #维度

def distance_fun(p1, p2, N):
    """Return the Euclidean distance between the first N components of p1 and p2.

    Parameters:
        p1, p2 -- array-like points with at least N numeric components.
        N      -- number of leading dimensions to compare.
    """
    # Vectorized norm instead of a manual per-component accumulation loop.
    a = np.asarray(p1, dtype=float)[:N]
    b = np.asarray(p2, dtype=float)[:N]
    return np.linalg.norm(a - b)

def mean_fun(a):
    """Column-wise mean of a 2-D collection of points (the cluster centroid)."""
    return np.asarray(a).mean(axis=0)

def farthest(center_arr, arr):
    """Return the point of arr with the largest summed (sqrt-compressed)
    distance to every current center — used to seed well-spread initial
    centers for k-means.

    Parameters:
        center_arr -- sequence of already-chosen center points.
        arr        -- candidate points.
    """
    # Fix: the original fallback was the literal [0, 0], which has the wrong
    # dimensionality whenever N != 2; default to the first candidate instead.
    f = arr[0]
    max_d = 0
    for e in arr:
        d = 0
        for c in center_arr:
            # NOTE(review): sqrt of an already-Euclidean distance compresses
            # large distances; kept as-is to preserve the original heuristic.
            d = d + np.sqrt(distance_fun(c, e, N))
        if d > max_d:
            max_d = d
            f = e
    return f

def closest(a, arr):
    """Return the element of arr nearest to point a (Euclidean distance over
    the first N dimensions, using the module-level N)."""
    # Fixes: the original started from arr[1] (skipping the first point) and
    # called distance_fun without its required N argument (a TypeError).
    c = arr[0]
    min_d = distance_fun(a, arr[0], N)
    for e in arr[1:]:
        d = distance_fun(a, e, N)
        if d < min_d:
            min_d = d
            c = e
    return c

if __name__=="__main__":

arr = np.random.randint(0,10000, size=(1000, 1, N))[:, 0, :]   #1000个0-10000随机数
'''
block1=  np.random.randint(0,2000, size=(100, 1, N))[:, 0, :]     #分区间生成随机数
block2 = np.random.randint(2000,4000, size=(100, 1, N))[:, 0, :]
block3 = np.random.randint(4000,6000, size=(100, 1, N))[:, 0, :]
block4 = np.random.randint(6000,8000, size=(100, 1, N))[:, 0, :]
block5 = np.random.randint(8000,10000, size=(100, 1, N))[:, 0, :]
arr=np.vstack((block1,block2,block3,block4,block5))
'''

## 初始化聚类中心和聚类容器
K = 5
r = np.random.randint(arr.__len__() - 1)
center_arr = np.array([arr[r]])
cla_arr = [[]]
for i in range(K-1):
k = farthest(center_arr, arr)
center_arr = np.concatenate([center_arr, np.array([k])])
cla_arr.append([])

## 迭代聚类
n = 20
cla_temp = cla_arr
for i in range(n):
for e in arr:
ki = 0
min_d = distance_fun(e, center_arr[ki],N)
for j in range(1, center_arr.__len__()):
if distance_fun(e, center_arr[j],N) < min_d:
min_d = distance_fun(e, center_arr[j],N)
ki = j
cla_temp[ki].append(e)

for k in range(center_arr.__len__()):
if n - 1 == i:
break
center_arr[k] = mean_fun(cla_temp[k])
cla_temp[k] = []

if N>=2:
print(N,'维数据前两维投影')
col = ['gold', 'blue', 'violet', 'cyan', 'red','black','lime','brown','silver']
plt.figure(figsize=(10, 10))
for i in range(K):
plt.scatter(center_arr[i][0], center_arr[i][1], color=col[i])
plt.scatter([e[0] for e in cla_temp[i]], [e[1] for e in cla_temp[i]], color=col[i])
plt.show()

if N>=3:
print(N,'维数据前三维投影')
fig = plt.figure(figsize=(8, 8))
ax = Axes3D(fig)
for i in range(K):
ax.scatter(center_arr[i][0], center_arr[i][1], center_arr[i][2], color=col[i])
ax.scatter([e[0] for e in cla_temp[i]], [e[1] for e in cla_temp[i]],[e[2] for e in cla_temp[i]], color=col[i])
plt.show()

print(N,'维')
for i in range(K):
print('第',i+1,'个聚类中心坐标：')
for j in range(0,N):
print(center_arr[i][j])



K=5时三维的完全随机数结果，可以看到初步分类正确

5个聚类中心都在理想区间内，结果同样正确。

sse[i]=sse[i]+distance_fun(e, center_arr[ki],N)


SSE图像大致如下：

# PCA

• Generate 500,000 random points with 200-D
• Dimension reduction to keep 90% energy using PCA
• Report how many dimensions are kept
• Compute k-means (k=100)
• Compare brute force NN and kd-tree, and report their running time
• Python, Jupyter Notebook

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
N=200                                                #维度

def distance_fun(p1, p2, N):
    """Return the Euclidean distance between the first N components of p1 and p2.

    Parameters:
        p1, p2 -- array-like points with at least N numeric components.
        N      -- number of leading dimensions to compare.
    """
    # Vectorized norm instead of a manual per-component accumulation loop.
    a = np.asarray(p1, dtype=float)[:N]
    b = np.asarray(p2, dtype=float)[:N]
    return np.linalg.norm(a - b)

def mean_fun(a):
    """Column-wise mean of a 2-D collection of points (the centroid)."""
    return np.asarray(a).mean(axis=0)

def farthest(center_arr, arr):
    """Return the point of arr with the largest summed (sqrt-compressed)
    distance to every current center — used to seed well-spread initial
    centers for k-means.

    Parameters:
        center_arr -- sequence of already-chosen center points.
        arr        -- candidate points.
    """
    # Fix: the original fallback was the literal [0, 0], which has the wrong
    # dimensionality whenever N != 2; default to the first candidate instead.
    f = arr[0]
    max_d = 0
    for e in arr:
        d = 0
        for c in center_arr:
            # NOTE(review): sqrt of an already-Euclidean distance compresses
            # large distances; kept as-is to preserve the original heuristic.
            d = d + np.sqrt(distance_fun(c, e, N))
        if d > max_d:
            max_d = d
            f = e
    return f

def closest(a, arr):
    """Return the element of arr nearest to point a (Euclidean distance over
    the first N dimensions, using the module-level N)."""
    # Fixes: the original started from arr[1] (skipping the first point) and
    # called distance_fun without its required N argument (a TypeError).
    c = arr[0]
    min_d = distance_fun(a, arr[0], N)
    for e in arr[1:]:
        d = distance_fun(a, e, N)
        if d < min_d:
            min_d = d
            c = e
    return c

def pca(XMat):
    """Reduce XMat with PCA, keeping enough components for >= 90% of the
    total variance ("energy").

    Parameters:
        XMat -- (m, n) array-like, one sample per row.

    Returns:
        (finalData, reconData, k) where
        finalData -- (m, k) projection onto the top-k principal components,
        reconData -- (m, n) reconstruction of XMat from those components,
        k         -- number of dimensions kept.
    """
    XMat = np.asarray(XMat, dtype=float)
    average = np.mean(XMat, axis=0)
    centered = XMat - average               # broadcasting; np.tile not needed

    # Fix: the original referenced covX without ever computing it.
    covX = np.cov(centered, rowvar=False)
    # eigh: the covariance matrix is symmetric, so eigenpairs are real.
    featValue, featVec = np.linalg.eigh(covX)
    order = np.argsort(-featValue)          # indices, descending eigenvalue

    # Fix: the original summed the sort *indices*; the retained "energy" must
    # be the running sum of eigenvalues against the eigenvalue total.
    total_energy = featValue.sum()
    kept = 0.0
    k = 0
    while kept < 0.9 * total_energy:
        kept += featValue[order[k]]
        k += 1

    # Fix: keep only the top-k eigenvectors, and actually compute the
    # projection (finalData was left as an empty list in the original).
    selectVec = featVec[:, order[:k]]       # (n, k), columns = eigenvectors
    finalData = centered @ selectVec        # (m, k)
    reconData = finalData @ selectVec.T + average   # (m, n)

    return finalData, reconData, k

def plotBestFit(data1, data2):
    """Scatter-plot the first two coordinates of data2 (the reconstruction);
    data1 (the projected data) is accepted for a matching row count and an
    optional overlay, kept commented out as in the original.

    Parameters:
        data1 -- (m, >=2) array-like, projected points.
        data2 -- (m, >=2) array-like, reconstructed points.
    """
    dataArr1 = np.array(data1)
    dataArr2 = np.array(data2)
    m = np.shape(dataArr1)[0]

    fig = plt.figure(figsize=(10, 10))
    # Fix: the original called ax.scatter but never created an axes object.
    ax = fig.add_subplot(111)
    # ax.scatter(dataArr1[:m, 0], dataArr1[:m, 1], s=50, c='red', marker='s')
    ax.scatter(dataArr2[:m, 0], dataArr2[:m, 1], s=1, c='blue')
    plt.show()

if __name__ == "__main__":
'''
arr = np.random.randint(0,10000, size=(1000, 1, N))[:, 0, :]
XMat=arr

'''
block1=  np.random.randint(0,2000, size=(100000, 1, N))[:, 0, :]     #分区间生成随机数
block2 = np.random.randint(2000,4000, size=(100000, 1, N))[:, 0, :]
block3 = np.random.randint(4000,6000, size=(100000, 1, N))[:, 0, :]
block4 = np.random.randint(6000,8000, size=(100000, 1, N))[:, 0, :]
block5 = np.random.randint(8000,10000, size=(100000, 1, N))[:, 0, :]
XMat=np.vstack((block1,block2,block3,block4,block5))

finalData, reconMat,pcaN = pca(XMat)
plotBestFit(finalData, reconMat)   #输出前两维切片检查效果
print('降维到：',pcaN)



02-28 11万+

08-06 6870
07-03 5451
04-16 3万+
03-08
06-09
01-20 6126
11-15 2万+
07-03 1万+
08-14 1万+