用 NumPy 实现 k-means 聚类算法
实现 1(递归迭代,随机生成二维数据)
import numpy as np
import matplotlib.pyplot as plt
# Global counter: number of k-means iterations performed so far.
flag = 0
def ecludDist(x, y):
    """Euclidean (L2) distance between two points given as lists or arrays."""
    diff = np.array(x) - np.array(y)
    return np.sqrt(np.sum(diff ** 2))
def manhattanDist(x, y):
    """Manhattan (L1) distance between two points.

    Fix: the original applied ``x - y`` directly, which raises TypeError
    when callers pass plain Python lists (as the dataset in this file is
    built from lists).  Converting to arrays first matches ecludDist.
    """
    return np.sum(np.abs(np.array(x) - np.array(y)))
def cos(x, y):
    """Cosine of the angle between vectors x and y (cosine similarity)."""
    numerator = np.dot(x, y)
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    return numerator / denominator
def clusterMean(dataset):
    """Element-wise mean of the points in a cluster (the new centroid)."""
    points = np.array(dataset)
    return points.sum(axis=0) / len(dataset)
def randCenter(dataset, k):
    """Pick k distinct random points of the dataset as initial centroids.

    Fix: the original drew indices with ``np.random.randint(0, len(dataset)-1)``,
    whose upper bound is exclusive, so the LAST point could never be selected
    (and requesting k == len(dataset) looped forever).  ``np.random.choice``
    with ``replace=False`` samples k distinct indices over the full range.
    """
    indices = np.random.choice(len(dataset), size=k, replace=False)
    return np.array([dataset[i] for i in indices])
def orderCenter(dataset, k):
    """Use the first k points of the dataset as the initial centroids."""
    return np.array(dataset[:k])
def kMeans(dataset, dist, center, k):
    """Cluster `dataset` into k groups, printing each iteration and plotting
    the final result.

    Parameters:
        dataset: iterable of 2-D points (lists), as built by main().
        dist:    distance function taking two points (e.g. ecludDist).
        center:  initial (k, 2) array of centroids.
        k:       number of clusters.

    Fixes over the original:
      * The original recursed once per iteration, so slow convergence could
        hit Python's recursion limit — rewritten as an iterative loop with
        identical per-iteration output (the global `flag` still counts
        iterations across calls, as before).
      * An empty cluster made clusterMean divide by zero — an empty group
        now simply keeps its previous centroid.
    """
    global flag
    while True:
        # Assign every point to its nearest centroid.
        all_kinds = [[] for _ in range(k)]
        for point in dataset:
            distances = [dist(point, c) for c in center]
            all_kinds[distances.index(min(distances))].append(point)
        # Print the intermediate grouping for this iteration.
        for i in range(k):
            print('第'+str(i)+'组:', all_kinds[i], end='\n')
        flag += 1
        print('************************迭代'+str(flag)+'次***************************')
        # Update centroids; keep the old centroid for an empty group.
        center_ = np.array([clusterMean(group) if group else center[idx]
                            for idx, group in enumerate(all_kinds)])
        if (center_ == center).all():
            print('结束')
            for i in range(k):
                print('第'+str(i)+'组均值点:', center_[i], end='\n')
                plt.scatter([j[0] for j in all_kinds[i]], [j[1] for j in all_kinds[i]], marker='*')
            plt.grid()
            plt.show()
            return
        center = center_
def main(k):
    """Generate 50 random 2-D integer points, preview them, then run k-means."""
    xs = [np.random.randint(0, 50) for _ in range(50)]
    ys = [np.random.randint(0, 50) for _ in range(50)]
    points = [list(pair) for pair in zip(xs, ys)]
    plt.plot(xs, ys, 'b.')
    plt.show()
    initial_center = randCenter(dataset=points, k=k)
    kMeans(dataset=points, dist=ecludDist, center=initial_center, k=k)


if __name__ == '__main__':
    main(3)
实现2
import numpy as np
import matplotlib.pyplot as plt
# Return the index of the centroid nearest to `sample` (by squared L2 distance).
def group_one(sample, centers):
    squared_dists = ((sample - centers) ** 2).sum(axis=1)
    return np.argmin(squared_dists)
# Assign every sample in `data` to its nearest centroid.
# Returns a list of k lists: groups[i] holds the samples (as plain Python
# lists) belonging to centroid i.  Nested lists rather than an ndarray are
# used because group sizes differ; a ragged array of list objects would be
# slower and more error-prone.
def group_all(data, k, centers):
    groups = [[] for _ in range(k)]
    for sample in data:
        nearest = group_one(sample, centers)
        groups[nearest].append(sample.tolist())
    return groups
# Recompute each centroid as the mean of the samples currently in its group.
def update_centers(data, k, groups):
    centers = np.zeros((k, data.shape[1]))
    for idx in range(k):
        centers[idx] = np.mean(np.array(groups[idx]), axis=0)
    return centers
# Total absolute movement of all centroids between two consecutive iterations.
def iter_diff(old_centers, new_centers):
    movement = np.abs(old_centers - new_centers)
    return movement.sum()
# Generate k random centroids that lie inside the data's bounding box.
def rand_center(data, k):
    """Return a (k, data.shape[1]) array of random initial centroids.

    Fix: the original scaled uniform [0, 1) samples only by the per-dimension
    maximum, so for data whose values all sit above some positive minimum
    (or are negative) centroids could land far outside the data range,
    which easily produces empty groups.  Scaling into [min, max] per
    dimension keeps every centroid inside the data's bounding box while
    preserving the interface.
    """
    low = np.min(data, axis=0)
    high = np.max(data, axis=0)
    centers = np.random.rand(k, data.shape[1])
    return low + centers * (high - low)
# Main iteration loop: alternate assignment and centroid update until the
# total centroid movement drops to `threshold`, or `max_iter` iterations
# have run (max_iter == 0 means no iteration cap).
def classify(data, k, threshold, max_iter=0):
    centers = rand_center(data, k)
    loss = float("inf")
    iter_count = 0
    while loss > threshold:
        if max_iter != 0 and iter_count >= max_iter:
            break
        groups = group_all(data, k, centers)
        previous = centers
        centers = update_centers(data, k, groups)
        loss = iter_diff(previous, centers)
        iter_count += 1
        print("iter_%d : loss=%f" % (iter_count, loss))
    return centers, groups
# Scatter-plot every sample coloured by its group, plus the centroids in red.
# With debug=True all samples share one colour so mis-grouping stands out.
def paint_result(data, centers, k, groups, debug=False):
    colors = []
    samples = []
    for group_idx in range(k):
        for sample in groups[group_idx]:
            colors.append(group_idx)
            samples.append(sample)
    samples = np.array(samples)
    if debug:
        plt.scatter(samples[:, 0], samples[:, 1])
    else:
        plt.scatter(samples[:, 0], samples[:, 1], c=colors)
    plt.scatter(centers[:, 0], centers[:, 1], color="red")
    plt.show()
def main():
    """Load ./data.csv, run k-means with k = 3, and plot the result."""
    samples = np.loadtxt("./data.csv", delimiter=",")
    # NOTE(review): hard-coded shape assumes the CSV holds exactly 1000
    # values (500 rows x 2 columns) — confirm against the data file.
    samples.resize((500, 2))
    final_centers, final_groups = classify(samples, 3, 0, 0)
    paint_result(samples, final_centers, 3, final_groups)


if __name__ == '__main__':
    main()
问题1:欧氏距离和夹角余弦值的区别?
欧氏距离公式:$d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$
夹角余弦公式:$\cos\theta = \dfrac{x \cdot y}{\|x\| \, \|y\|}$
例如:某 T 恤从 100 块降到了 50 块(A(100, 50)),某西装从 1000 块降到了 500 块(B(1000, 500))。两者都降价了 50%,价格变动趋势一致,夹角余弦取得最大值,即两者的变化趋势高度相似;但就价格本身而言,两者相差了好几百块,欧氏距离较大,即两者的价格相似度较低。