import numpy as np
def parseVector(line: str) -> np.ndarray:
    """Parse one line of whitespace-separated numbers into a float array.

    Args:
        line: Text containing numeric tokens separated by any whitespace.

    Returns:
        A 1-D NumPy array holding the parsed values in their original order.
        A blank line yields an empty array.

    Raises:
        ValueError: If any token cannot be converted by ``float``.
    """
    # str.split() with no arguments collapses runs of whitespace and drops
    # leading/trailing whitespace, including the trailing newline.
    return np.array([float(x) for x in line.split()])
def closestPoint(p, centers):
    """Return the index of the center nearest to ``p``.

    Candidates are compared by squared Euclidean distance, which ranks them
    the same way as the Euclidean distance without needing a square root.

    Args:
        p: The point, as a NumPy array (anything supporting ``p - center``).
        centers: Sequence of center vectors compatible in shape with ``p``.

    Returns:
        The position in ``centers`` of the nearest center. Ties resolve to
        the lowest index, and an empty ``centers`` yields 0.
    """
    bestIndex = 0
    closest = float("inf")
    for i, center in enumerate(centers):
        tempDist = np.sum((p - center) ** 2)
        # Strict "<" keeps the earliest center when distances tie.
        if tempDist < closest:
            closest = tempDist
            bestIndex = i
    return bestIndex
# Load the input points and set the k-means parameters.
# NOTE(review): `sc` is not defined in the visible part of this file; it is
# presumably the SparkContext supplied by the PySpark shell/notebook. Confirm
# before running this as a standalone script.
# NOTE(review): the path below is an absolute path on one machine; adjust it
# to wherever the data file was saved.
# The data file can be downloaded at http://www.cse.ust.hk/msbd5003/data/kmeans_data.txt
lines = sc.textFile('/Users/huangluyu/data/kmeans_data.txt', 5)
# Alternative, larger input. The data file can be downloaded at
# http://www.cse.ust.hk/msbd5003/data/kmeans_bigdata.txt
# lines = sc.textFile('../data/kmeans_bigdata.txt', 5)
# lines is an RDD of strings
# K is presumably the number of clusters to fit; confirm against the rest of
# the script.
K = 3
# convergeDist: terminate the algorithm when the total distance from the old
# centers to the new centers is less than this value.
convergeDist = 0.01
# Each text line is parsed into a NumPy array by parseVector; the result is
# marked for caching so that repeated actions can reuse it.
data = lines.map(parseVector).cache() # data is an RDD of arrays
kCenters = data.t
RDD用法与实例(十):spark中rdd实现k-means
最新推荐文章于 2023-04-09 14:35:38 发布