/** Sample input dataset (contents of /ruson/kmean.txt):
* 1.0 1.0
* 1.0 2.0
* 2.0 1.0
* 2.0 2.0
* 3.0 3.0
* 3.0 4.0
* 4.0 3.0
* 4.0 4.0
*/
package com.spark.test
import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
object ObKMeans {

  /** Entry point: forwards optional CLI args (input path, output path) to [[run]]. */
  def main(args: Array[String]): Unit = {
    run(args)
  }

  /**
   * Loads space-separated numeric vectors from a text file, clusters them with
   * k-means (k = 4, 20 iterations), prints the Within Set Sum of Squared Errors,
   * and saves each point's cluster index as a text file.
   *
   * @param args optional overrides: args(0) = input path, args(1) = output path;
   *             defaults to the original hard-coded HDFS paths when absent
   */
  def run(args: Array[String] = Array.empty): Unit = {
    val inputFile = if (args.length > 0) args(0) else "/ruson/kmean.txt"
    val resultFile = if (args.length > 1) args(1) else "/ruson/KMeansResult"

    val conf = new SparkConf().setAppName("ObKMeansTest")
    val sc = new SparkContext(conf)
    try {
      // Load and parse the data: one space-separated vector per line.
      val data = sc.textFile(inputFile)
      val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
      // Cache: this RDD feeds three separate actions (train, computeCost,
      // saveAsTextFile); without caching it would be re-read and re-parsed each time.
      parsedData.cache()

      // Cluster the data into four classes using KMeans.
      val numIterations = 20
      val numClusters = 4
      val clusters = KMeans.train(parsedData, numClusters, numIterations)

      // Evaluate clustering by computing Within Set Sum of Squared Errors.
      val WSSSE = clusters.computeCost(parsedData)
      println(s"Within Set Sum of Squared Errors = $WSSSE")

      // Assign every input point to its cluster and persist the assignments.
      val result = parsedData.map(point => clusters.predict(point))
      result.saveAsTextFile(resultFile)
      println(s"Result file : $resultFile")
    } finally {
      sc.stop() // always release the SparkContext, even on failure
    }
  }
}
// The header comment above shows the dataset; after running, the points are split into four clusters.