在《程序员的自我修养 – SelfUp.cn》上看到了 Spark MLlib 之 K-Means 聚类算法的介绍,
但示例是用 Java 语言写的,于是我按照例程用 Scala 重写了一个,分享在此。
学习 Spark MLlib 时如此详细的资料很难找到,希望对大家有帮助。
测试数据
package com.spark.firstApp
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
object HelloSpark {

  /** Entry point: reads whitespace-separated numeric vectors from HDFS,
    * clusters them with K-Means, prints the Within Set Sum of Squared
    * Errors (WSSSE) and the cluster assignment of every input point.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleSVM Application")
    val sc = new SparkContext(conf)
    try {
      // Each input line is a space-separated list of doubles -> dense vector.
      // cache() because this RDD is reused by train, computeCost and predict.
      val parsedData = sc
        .textFile("hdfs://192.168.0.10:9000/user/root/home/data1.txt")
        .map(line => Vectors.dense(line.split(' ').map(_.toDouble)))
        .cache()

      // Cluster the data into two classes using KMeans.
      val numClusters = 2
      val numIterations = 20
      val clusters = KMeans.train(parsedData, numClusters, numIterations)

      // Evaluate clustering by computing Within Set Sum of Squared Errors.
      val wssse = clusters.computeCost(parsedData)
      println(s"Within Set Sum of Squared Errors = $wssse")

      // Print each point together with the cluster it was assigned to.
      parsedData
        .map(v => s"${v.toString} belong to cluster :${clusters.predict(v)}")
        .collect()
        .foreach(println)
    } finally {
      // Release executors and driver resources even if the job fails;
      // the original code leaked the SparkContext.
      sc.stop()
    }
  }
}
root@Master:/# spark-submit --master spark://192.168.0.10:7077 --class com.spark.firstApp.HelloSpark --executor-memory 100m /root/IdeaProjects/FirstSparkApp/out/artifacts/FirstSparkAppjar/FirstSparkApp.jar
运行结果如下:
[0.0,0.0,0.0] belong to cluster :1
[0.1,0.1,0.1] belong to cluster :1
[0.2,0.2,0.2] belong to cluster :1
[9.0,9.0,9.0] belong to cluster :0
[9.1,9.1,9.1] belong to cluster :0
[9.2,9.2,9.2] belong to cluster :0
[15.1,16.1,17.0] belong to cluster :0
[18.0,17.0,19.0] belong to cluster :0
[20.0,21.0,22.0] belong to cluster :0
Prediction of (1.1, 2.1, 3.1): 1