Spark K-Means Algorithm

spark-submit --class com.ones.soc.cf.KMeansClustering --master yarn --num-executors 3 --driver-memory 5g --executor-memory 4g /root/bigData.jar /ones/mldata/test1 /ones/mldata/test2 8 30 3 /ones/result/12345
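
The six arguments after the jar path correspond to the usage string in the code below: training data path (/ones/mldata/test1), test data path (/ones/mldata/test2), number of clusters (8), number of iterations (30), number of runs (3), and the HDFS output directory (/ones/result/12345).

The job expects tab-separated numeric data, and any line containing the token "Channel" is skipped as a header (see isColumnNameLine in the code). A hypothetical sample input file is shown below; the column names and values are made up, and only the "Channel" token in the header matters to the parser:

Channel	Region	Fresh	Milk	Grocery	Frozen
2	3	12669	9656	7561	214
1	3	7057	9810	9568	1762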

##############################################

package com.ones.soc.cf

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, FileSystem}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

/**
* Created by tom
*/
object KMeansClustering {

  def main(args: Array[String]) {
    if (args.length < 6) {
      println("Usage: KMeansClustering trainingDataFilePath testDataFilePath numClusters numIterations runTimes outputPath")
      sys.exit(1)
    }

    // Delete the output directory if it already exists, so the job can be rerun.
    val confighdfs = new Configuration()
    val fs = FileSystem.get(confighdfs)
    if (args(5) != null && args(5).trim().length > 1) {
      val output = new Path(args(5))
      if (fs.exists(output)) {
        fs.delete(output, true)
      }
    }

    val conf = new SparkConf().setAppName("K-Means")
    val sc = new SparkContext(conf)

    // Parse the training data: skip header lines, split each row on tabs,
    // and build a dense vector of doubles per row.
    val rawTrainingData = sc.textFile(args(0))
    val parsedTrainingData = rawTrainingData.filter(!isColumnNameLine(_)).map { line =>
      Vectors.dense(line.split("\t").map(_.trim).filter(_.nonEmpty).map(_.toDouble))
    }.cache()

    // Cluster the training data with K-means. The fourth argument is the number
    // of parallel runs; the best of the runs is kept (this parameter is
    // deprecated in newer Spark versions).
    val numClusters = args(2).toInt
    val numIterations = args(3).toInt
    val runTimes = args(4).toInt
    val clusters: KMeansModel = KMeans.train(parsedTrainingData, numClusters, numIterations, runTimes)

    println("Cluster Number:" + clusters.clusterCenters.length)
    println("Cluster Centers Information Overview:")
    clusters.clusterCenters.zipWithIndex.foreach { case (center, clusterIndex) =>
      println("Center Point of Cluster " + clusterIndex + ":")
      println(center)
    }

    // Check which cluster each test point belongs to, based on the trained model.
    val rawTestData = sc.textFile(args(1))
    val parsedTestData = rawTestData.map { line =>
      Vectors.dense(line.split("\t").map(_.trim).filter(_.nonEmpty).map(_.toDouble))
    }

    val sb = new StringBuilder()
    parsedTestData.collect().foreach { testDataLine =>
      val predictedClusterIndex: Int = clusters.predict(testDataLine)
      println("The data " + testDataLine.toString + " belongs to cluster " + predictedClusterIndex)
      sb.append(testDataLine.toString).append("\t")
        .append("belongs to cluster ").append(predictedClusterIndex).append("\r\n")
    }

    outputHdfs(fs, sb.toString(), args(5))
    println("Spark MLlib K-means clustering test finished.")
  }

  // Header lines of the data files contain the column name "Channel".
  private def isColumnNameLine(line: String): Boolean =
    line != null && line.contains("Channel")

  // Write the prediction results to <textdir>/result.txt on HDFS.
  def outputHdfs(fs: FileSystem, text: String, textdir: String): Unit = {
    try {
      val fsDataOutputStream = fs.create(new Path(textdir + "/result.txt"), true)
      val s = text.getBytes("UTF-8")
      fsDataOutputStream.write(s, 0, s.length)
      fsDataOutputStream.hflush()
      fsDataOutputStream.close()
    } catch {
      case e: Exception => e.printStackTrace() // don't silently swallow write failures
    }
  }

}
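
The number of clusters is passed in on the command line rather than derived from the data. A common way to choose it is to train models for several values of k and compare the within-set sum of squared errors (WSSSE) reported by KMeansModel.computeCost. A minimal sketch, reusing parsedTrainingData, numIterations, and runTimes from the code above:

// Compare WSSSE for several candidate values of k; smaller is tighter,
// but look for the "elbow" in the curve rather than the minimum.
val candidateKs = Seq(3, 5, 8, 10, 12)
candidateKs.foreach { k =>
  val model = KMeans.train(parsedTrainingData, k, numIterations, runTimes)
  println(s"k = $k -> WSSSE = ${model.computeCost(parsedTrainingData)}")
}

Note also that collecting the test set to the driver with collect() only works for small files. KMeansModel.predict also accepts an RDD[Vector], so for larger data the predictions could be computed in a distributed way and written out with saveAsTextFile instead of the StringBuilder approach used here.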
