K-means is computationally cheap and fast to run, and the standard improvement is equally simple: k-means++.
Before the usual k-means loop, pick one center at random, then choose each subsequent center with probability weighted toward points far from the already-chosen centers (a minimal sketch follows below).
You can also constrain the extent of the clusters by specifying the initial cluster points yourself.
You can even apply a convex-hull algorithm to select and control boundary points, improving cluster separation.
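A minimal sketch of the k-means++ seeding just described, assuming the data is a 2-D NumPy array; kmeans_pp_init and its rng parameter are illustrative helpers, not part of the code below:
import numpy as np

def kmeans_pp_init(points, k, rng=np.random.default_rng()):
    """k-means++ seeding: first center uniform at random; each later center
    drawn with probability proportional to the squared distance to its
    nearest already-chosen center."""
    centers = [points[rng.integers(len(points))]]
    for _ in range(k - 1):
        diffs = points[:, None, :] - np.array(centers)[None, :, :]
        d2 = np.min((diffs**2).sum(-1), axis=1)  # squared distance to nearest center
        centers.append(points[rng.choice(len(points), p=d2 / d2.sum())])
    return np.array(centers)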
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# keep only rows where both longitude (经度) and latitude (纬度) are present
df_for_kmeans = df[df.经度.notna() & df.纬度.notna()]
kmeans_data = df_for_kmeans[["纬度","经度"]].to_numpy()
train_x = kmeans_data[:10000]
train_y = df_for_kmeans.职位名称.to_numpy()[:10000]
test_x = kmeans_data[10000:]
test_y = df_for_kmeans.职位名称.to_numpy()[10000:]
def norm(vector, p=6):
    '''
    p-norm of a vector; abs() keeps odd p well-defined for negative components
    '''
    return (np.abs(vector)**p).dot(np.ones_like(vector))**(1/p)
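A quick sanity check (illustrative, not in the original) that the dot-with-ones formulation agrees with np.linalg.norm:
v = np.array([3.0, -4.0])
assert np.isclose(norm(v, p=2), np.linalg.norm(v, ord=2))  # both 5.0
assert np.isclose(norm(v, p=1), np.linalg.norm(v, ord=1))  # both 7.0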
def difference_in_norms(vector1, vector2, p=3):
    """
    Minkowski distance:
    p=1 gives the Manhattan distance,
    p=2 gives the Euclidean distance,
    p→∞ approaches the Chebyshev distance.
    :param vector1:
    :param vector2:
    :param p:
    :return:
    """
    return norm(vector1 - vector2, p)
    # equivalent: np.linalg.norm(vector1 - vector2, ord=p)
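Checking the docstring's special cases on a concrete pair of points (illustrative values):
a, b = np.array([0.0, 0.0]), np.array([3.0, 4.0])
print(difference_in_norms(a, b, p=1))    # 7.0, Manhattan
print(difference_in_norms(a, b, p=2))    # 5.0, Euclidean
print(difference_in_norms(a, b, p=100))  # ~4.0, approaching Chebyshev max|Δ|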
def centroids_init(k=8):
    """
    Initialize centroids: sample k distinct training points as the initial centers
    """
    return train_x[np.random.choice(a=np.arange(train_x.shape[0]), size=k, replace=False, p=None)]
def closest_centroid(centroid, x, p=2):
    '''
    Find the nearest centroid; return the index of the closest one
    '''
    m, n = centroid.shape
    closest_dist = np.inf
    index_ = 0
    for _ in range(m):
        distance = difference_in_norms(centroid[_], x, p)
        if distance < closest_dist:
            closest_dist = distance
            index_ = _
    return index_
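The per-point Python loop above is the hot path. An equivalent vectorized form (a sketch; closest_centroids_vec is illustrative, not part of the original code) computes the whole distance matrix at once and takes an argmin over the centroid axis:
def closest_centroids_vec(centroids, X, p=2):
    # (m, 1, n) - (1, k, n) broadcasts to (m, k, n); reduce to an (m, k) distance matrix
    dists = (np.abs(X[:, None, :] - centroids[None, :, :])**p).sum(-1)**(1/p)
    return dists.argmin(axis=1)  # nearest-centroid index per point
For every i, closest_centroids_vec(centroids, train_x, p)[i] should match closest_centroid(centroids, train_x[i], p).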
def clustering(centroid, train_x, p=2):
    """
    One assignment step: map each cluster index to the row indices of its points
    """
    m, n = train_x.shape
    classification = dict()
    for x_i in range(m):
        k_i = closest_centroid(centroid, train_x[x_i], p)  # pass p through
        if k_i in classification:
            classification[k_i].append(x_i)
        else:
            classification[k_i] = [x_i]
    return classification
def rearrange_centroids(train_x, clusters, p=2):
    '''
    Recompute each centroid as the mean of its assigned points
    '''
    m, n = train_x.shape
    n_clusters = len(clusters)
    init_new_centroids = np.zeros((n_clusters, n))
    for kNo, i in zip(clusters, range(n_clusters)):
        clusters_index = clusters[kNo]
        old_clusters = train_x[clusters_index]
        n_of_this_clusters = old_clusters.shape[0]
        # X.T @ 1 / n is the column-wise mean, i.e. old_clusters.mean(axis=0)
        new_clusters = old_clusters.T.dot(np.ones(n_of_this_clusters)) / n_of_this_clusters
        init_new_centroids[i] = new_clusters
    return init_new_centroids
def Kmeans(train_x, y_lab, k, p=2, max_iter=10):
    centroids = centroids_init(k)
    rl = []
    for _ in range(max_iter):
        clusters = clustering(centroids, train_x, p)
        previous = centroids
        centroids = rearrange_centroids(train_x, clusters, p)
        rl.append(centroids.sum() - previous.sum())
        # stop once the centroids no longer move (guard against dropped clusters)
        if centroids.shape == previous.shape and np.allclose(centroids, previous):
            break
    print("iterations:", _)
    return {"index": [clusters[i] for i in clusters], 'loss': rl, "time": _,
            "centroids": centroids, "centroids_size": centroids.shape[0]}
%%time
# Test: run Kmeans several times and group runs by surviving cluster count
index_of_kmeans = dict()
for i in range(10):
    temp = Kmeans(train_x, train_y, k=12, p=3)
    index_s, loss, time, centroids, centroids_size = (
        temp["index"], temp["loss"], temp["time"], temp["centroids"], temp["centroids_size"])
    # group runs by how many clusters survived (some may collapse to empty)
    if centroids_size in index_of_kmeans:
        index_of_kmeans[centroids_size].append(index_s)
    else:
        index_of_kmeans[centroids_size] = [index_s]
plt.figure(figsize=(16, 9))
for index_ in index_s:
    class_x = train_x[index_]
    x, y = class_x[:, 0], class_x[:, -1]  # column 0 = latitude, column 1 = longitude
    plt.scatter(x, y)
plt.show()
A quick visual check of the clustering quality.
import cupy as cp
import numpy as np
import matplotlib.pyplot as plt
def norm_cuda(vector, p=2):
    '''
    p-norm of a vector (CuPy); abs() keeps odd p well-defined for negative components
    '''
    return (cp.abs(vector)**p).dot(cp.ones_like(vector))**(1/p)
def difference_in_norms_cuda(vector1, vector2, p=3):
    """
    Minkowski distance (CuPy):
    p=1 gives the Manhattan distance,
    p=2 gives the Euclidean distance,
    p→∞ approaches the Chebyshev distance.
    :param vector1:
    :param vector2:
    :param p:
    :return:
    """
    return norm_cuda(vector1 - vector2, p=p)  # was hard-coded to p=3
def centroids_init_cuda(train_x, k=8):
    """
    Initialize centroids: sample k distinct training points as the initial centers.
    (An earlier, commented-out variant drew random integers between each feature's
    min and max; sampling real data points is less likely to leave clusters empty.)
    """
    return train_x[cp.random.choice(a=cp.arange(train_x.shape[0]), size=k, replace=False, p=None)]
def closest_centroid_cuda(centroid, x, p=2):
    '''
    Find the nearest centroid; return the index of the closest one
    '''
    m, n = centroid.shape
    closest_dist = float("inf")
    index_ = 0
    for _ in range(m):
        vector = cp.abs(centroid[_] - x)
        # Minkowski distance inlined to avoid a Python call per comparison
        distance = (vector**p).dot(cp.ones_like(vector))**(1/p)
        if distance < closest_dist:
            closest_dist = distance
            index_ = _
    return index_
def clustering_cuda(centroid, train_x, p=2):
    """
    One assignment step: map each cluster index to the row indices of its points
    """
    m, n = train_x.shape
    classification = dict()
    for x_i in range(m):
        k_i = closest_centroid_cuda(centroid, train_x[x_i], p)  # pass p through
        if k_i in classification:
            classification[k_i].append(x_i)
        else:
            classification[k_i] = [x_i]
    return classification
def rearrange_centroids_cuda(train_x, clusters, p=2):
    '''
    Recompute each centroid as the mean of its assigned points
    '''
    m, n = train_x.shape
    n_clusters = len(clusters)
    init_new_centroids_cuda = cp.zeros((n_clusters, n))
    for kNo, i in zip(clusters, range(n_clusters)):
        clusters_index = clusters[kNo]
        old_clusters = train_x[clusters_index]
        n_of_this_clusters = old_clusters.shape[0]
        # X.T @ 1 / n is the column-wise mean, i.e. old_clusters.mean(axis=0)
        new_clusters = old_clusters.T.dot(cp.ones(n_of_this_clusters)) / n_of_this_clusters
        init_new_centroids_cuda[i] = new_clusters
    return init_new_centroids_cuda
def Kmeans_cuda(train_x, k, p=2, max_iter=10):
    centroids = centroids_init_cuda(train_x, k)
    print("initial centroids:", centroids.shape)
    rl = []
    for _ in range(max_iter):
        clusters = clustering_cuda(centroids, train_x, p)
        print("surviving clusters:", len(clusters), "iteration:", _)
        previous = centroids
        centroids = rearrange_centroids_cuda(train_x, clusters, p)
        rl.append(centroids.sum() - previous.sum())
        # stop once the centroids no longer move (guard against dropped clusters)
        if centroids.shape == previous.shape and cp.allclose(centroids, previous):
            break
    return {"index": [clusters[i] for i in clusters], 'loss': rl, "time": _,
            "centroids": centroids, "centroids_size": centroids.shape[0]}
# When the data's dimensionality is modest, GPU clustering does not look like an efficient choice; at very high dimensionality and scale, though, the advantage becomes obvious.
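One reason the GPU looks slow here is that closest_centroid_cuda launches many tiny kernels, one per point-centroid comparison. A vectorized assignment step (a sketch with synthetic, illustrative shapes; clustering_cuda_vec is not part of the code above) does the whole assignment in a handful of launches, which is where GPU throughput actually shows:
def clustering_cuda_vec(centroids, X, p=2):
    # one (m, k, n) broadcast instead of m * k tiny kernel launches;
    # the (m, k, n) intermediate array dominates memory use
    dists = (cp.abs(X[:, None, :] - centroids[None, :, :])**p).sum(-1)**(1/p)
    return dists.argmin(axis=1)  # cluster label per point

X = cp.random.rand(100_000, 64, dtype=cp.float32)  # synthetic data, illustrative shapes
C = X[cp.random.choice(a=cp.arange(len(X)), size=12, replace=False)]
labels = clustering_cuda_vec(C, X)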