// Data format (comma-separated; the program clusters on fields 1 and 2, counting from 0):
// *****,114.766907,35.218128,14,*****,***
// ****,114.969452,35.323708,30,0***,***
// *****,114.879410,35.267296,80,***,***
// *****,114.766907,35.218128,14,*****,***
package com.agm.kmeans
import java.io.File
import java.io.PrintWriter

import org.apache.log4j.{ Level, Logger }
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
/**
 * Spark MLlib k-means driver for comma-separated records.
 *
 * Reads a text file, takes fields 1 and 2 (zero-based) of every non-blank line
 * as a two-dimensional point, clusters the points with k-means, and writes one
 * line per cluster centre -- "x y memberCount" -- to an output file.
 */
object start {

  /**
   * Program entry point.
   *
   * @param args optional overrides: `args(0)` is the input file and `args(1)` is
   *             the output file. When an argument is absent the original
   *             hard-coded path is used, so running without arguments behaves
   *             as before.
   */
  def main(args: Array[String]): Unit = {
    // Only ERROR-level messages from loggers under "org" are emitted.
    Logger.getLogger("org").setLevel(Level.ERROR)

    val inputPath = args.lift(0).getOrElse("F:\\testData\\spark\\addressAllInfoUTF.txt")
    val outputPath = args.lift(1).getOrElse("F:\\testData\\spark\\learningScala.txt")

    // Coordinates are multiplied by this factor before clustering and divided
    // by it again when the centres are written out.
    val scale = 1000.0
    // Number of clusters to fit.
    val numClusters = 200
    // Upper bound on the number of k-means iterations.
    val numIterations = 50
    // "k-means||" is MLlib's parallel variant of k-means++ seeding.
    val initMode = "k-means||"

    val conf = new SparkConf().setAppName("Simple Application").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Blank lines are skipped; any other line lacking fields 1 and 2, or with
      // non-numeric values there, still fails loudly.
      val parsedData = sc.textFile(inputPath)
        .filter(_.trim.nonEmpty)
        .map { line =>
          val fields = line.split(',')
          Vectors.dense(fields(1).trim.toDouble * scale, fields(2).trim.toDouble * scale)
        }
        .cache()

      val clusters = new KMeans()
        .setInitializationMode(initMode)
        .setK(numClusters)
        .setMaxIterations(numIterations)
        .run(parsedData)

      // Evaluate the clustering by its Within Set Sum of Squared Errors.
      val WSSSE = clusters.computeCost(parsedData)
      println()
      println("WithinSet Sum of Squared Errors = " + WSSSE)

      // Member count keyed by cluster id. Keeping the key matters: the counts
      // come back in no particular order, and a cluster with no members has no
      // entry at all, so they cannot be matched to centres by position.
      val sizeByCluster = clusters.predict(parsedData).countByValue()

      // NOTE(review): these two sample points are not multiplied by `scale`,
      // unlike the training data -- confirm that is intended.
      val a21 = clusters.predict(Vectors.dense(1.2, 1.3))
      val a22 = clusters.predict(Vectors.dense(4.1, 4.2))

      // Write the cluster centres (scaled back down) with their member counts.
      println("Clustercenters:")
      val writer = new PrintWriter(new File(outputPath))
      try {
        for ((center, clusterId) <- clusters.clusterCenters.zipWithIndex) {
          val members = sizeByCluster.getOrElse(clusterId, 0L)
          writer.println(s"${center(0) / scale} ${center(1) / scale} $members")
        }
      } finally {
        // Close the file even if writing a line fails.
        writer.close()
      }

      println("Prediction of (1.2,1.3)-->" + a21)
      println("Prediction of (4.1,4.2)-->" + a22)
    } finally {
      // Release the Spark context whether or not the job succeeded.
      sc.stop()
    }
  }
}