部分数据:
1.658985 4.285136
-3.453687 3.424321
4.838138 -1.151539
-5.379713 -3.362104
0.972564 2.924086
-3.567919 1.531611
0.450614 -3.302219
-3.487105 -1.724432
2.668759 1.594842
-3.156485 3.191137
3.165506 -3.999838
-2.786837 -3.099354
4.208187 2.984927
代码:
package workStudy.MLlib
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Created by Administrator on 2016/10/11.
 */
/**
 * K-means clustering demo.
 *
 * Reads a text file of whitespace-separated numeric feature rows, trains a
 * KMeans model with MLlib, and prints the resulting cluster centers.
 *
 * Usage: an optional first CLI argument overrides the default input path.
 */
object day2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("day2")
    val sc = new SparkContext(conf)
    try {
      // Input path: first CLI argument if given, otherwise the original default.
      val inputPath = if (args.nonEmpty) args(0) else "E://machinedata/kMeans_demo/testSet.txt"
      // val data = MLUtils.loadLibSVMFile(sc, "E://machinedata/KNN/Sparkkmean.txt")
      val data = sc.textFile(inputPath) // input dataset
      // Tolerate both tab- and space-separated rows, and skip blank lines,
      // so a trailing newline or space-delimited file no longer crashes toDouble.
      val parsedData = data
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(line => Vectors.dense(line.split("\\s+").map(_.toDouble)))
        .cache() // training data, reused across KMeans iterations
      val numClusters = 2    // number of cluster centers
      val numIterations = 20 // maximum number of iterations
      val model = KMeans.train(parsedData, numClusters, numIterations) // train the model
      model.clusterCenters.foreach(println) // coordinates of the cluster centers
    } finally {
      // Always release the SparkContext, even if loading or training fails.
      sc.stop()
    }
  }
}
运行结果:
[-3.0953906153846154,-2.262190846153846]
[1.3368722037037037,1.1696492222222223]