首先是参数
k : 聚类数,默认2
initMode : 初始化算法的参数,可以是RANDOM或K_MEANS_PARALLEL。RANDOM是随机选择初始聚类中心;K_MEANS_PARALLEL是使用k-means||算法(k-means++的并行版本)选择初始聚类中心,也是默认值
initSteps : K_MEANS_PARALLEL方法迭代步数,默认5
接下来是一些重要的方法
/**
 * Predicts which cluster a feature vector belongs to.
 *
 * Delegates directly to the wrapped `parentModel`.
 *
 * @param features the feature vector to classify
 * @return the cluster id reported by `parentModel.predict`
 */
private[clustering] def predict(features: Vector): Int = {
  parentModel.predict(features)
}
/**
 * Lists the final cluster centers.
 *
 * @return the centers exposed by the wrapped `parentModel`
 */
def clusterCenters: Array[Vector] = {
  parentModel.clusterCenters
}
/**
 * Computes the clustering cost (the sum of squared distances) for `dataset`.
 *
 * The features column is first validated to be of vector type, then its
 * vectors are extracted and handed to the wrapped model's `computeCost`.
 *
 * @param dataset input data containing the configured features column
 * @return the cost reported by `parentModel.computeCost`
 */
def computeCost(dataset: DataFrame): Double = {
  val featuresColumn = $(featuresCol)
  // Fail early if the features column does not hold vectors.
  SchemaUtils.checkColumnType(dataset.schema, featuresColumn, new VectorUDT)
  // Unwrap each row into its feature vector.
  val points = dataset.select(col(featuresColumn)).map { case Row(v: Vector) => v }
  // The actual computation is done by the wrapped model.
  parentModel.computeCost(points)
}
下面是聚类算法默认的参数
// Default parameter values: 2 clusters, 20 iterations, K_MEANS_PARALLEL as the
// initialization mode with 5 initialization steps, and a convergence tolerance of 1e-4.
setDefault(
k -> 2,
maxIter -> 20,
initMode -> MLlibKMeans.K_MEANS_PARALLEL,
initSteps -> 5,
tol -> 1e-4)
调用fit,填充参数
/**
 * Fits a k-means model to the features column of `dataset`.
 *
 * The feature vectors are extracted, an `MLlibKMeans` instance is configured
 * from this estimator's params, and its `run` method performs the clustering.
 * The result is wrapped in a [[KMeansModel]] and passed through `copyValues`.
 *
 * @param dataset input data containing the configured features column
 * @return the fitted model
 */
override def fit(dataset: DataFrame): KMeansModel = {
  // Unwrap each row into its feature vector.
  val points = dataset.select(col($(featuresCol))).map { case Row(v: Vector) => v }

  // Forward every param of this estimator to the MLlib implementation.
  val trainer = new MLlibKMeans()
    .setK($(k))
    .setInitializationMode($(initMode))
    .setInitializationSteps($(initSteps))
    .setMaxIterations($(maxIter))
    .setSeed($(seed))
    .setEpsilon($(tol))

  // The key step: `run` does the clustering on the extracted vectors.
  // Its result is wrapped in a KMeansModel and passed through `copyValues`.
  copyValues(new KMeansModel(uid, trainer.run(points)))
}
run方法
def run(data: RDD[Vector]): KMeansModel = {
if (data.getStorageLevel == StorageLevel.NONE) {
logWarning("The input data is not directly cached, which may hurt performance if its"
+ " parent RDDs are also uncached.")
}
// Compute squared norms and cache them.
val norms = data.map(Vectors.norm(_, 2.0))//计算每个向量的2范数,即平方和开方
norms.persist()//缓存,因为要多次用这个
val zippedData = data.zip(norms).map { case (v, norm) =>//把向量和2范数放在一起,VectorWithNorm是新的结构
new VectorWithNorm(v, norm)
}
val model = runAlgorithm(zippedData)//把VectorWithNorm放里面运行,返回模型
norms.unpersist()//释放缓存的2范数