KMeans 的几种实现:手写 LIBSVM 数据加载(loadLibSVMFile)与 Spark 官方 K-means 示例


一、
/**
 * Loads data in LIBSVM format ("label index1:value1 index2:value2 ...")
 * into an RDD of LabeledPoint with sparse feature vectors.
 *
 * @param sc             active SparkContext
 * @param path           input file or directory in LIBSVM format
 * @param numFeatures    feature dimension; if <= 0 it is inferred as 1 + max index seen
 * @param mainPartitions minimum number of partitions for the input RDD
 * @return RDD of LabeledPoint parsed from `path`
 */
def loadLibSVMFile(sc: SparkContext, path: String, numFeatures: Int, mainPartitions: Int): RDD[LabeledPoint] = {
  // NOTE: the original hard-coded "/home/sc/Desktop/data.txt" and ignored
  // both `path` and `mainPartitions`; use the parameters instead.
  val parsed = sc.textFile(path, mainPartitions)
    .map(_.trim)
    // Drop blank lines and '#' comment lines.
    .filter(line => !(line.isEmpty || line.startsWith("#")))
    .map { line =>
      val items = line.split(' ')  // original had typo `spilt`
      val label = items.head.toDouble
      // Each remaining token is "index:value"; LIBSVM indices are 1-based.
      val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
        val indexAndValue = item.split(':')
        val index = indexAndValue(0).toInt - 1  // convert to 0-based
        val value = indexAndValue(1).toDouble
        (index, value)
      }.unzip
      (label, indices.toArray, values.toArray)
    }
  // Feature dimension: use the caller's value, or scan the data for the max index.
  val d = if (numFeatures > 0) {
    numFeatures
  } else {
    parsed.persist(StorageLevel.MEMORY_ONLY)  // parsed is scanned twice below
    parsed.map { case (_, indices, _) =>
      indices.lastOption.getOrElse(0)  // indices are sorted in LIBSVM files
    }.reduce(math.max) + 1
  }
  parsed.map { case (label, indices, values) =>
    LabeledPoint(label, Vectors.sparse(d, indices, values))
  }
}
二、
/**
 * Loads data in LIBSVM format ("label index1:value1 index2:value2 ...")
 * into an RDD of LabeledPoint with sparse feature vectors.
 *
 * @param sc             active SparkContext
 * @param path           input file or directory in LIBSVM format
 * @param numFeatures    feature dimension; if <= 0 it is inferred as 1 + max index seen
 * @param mainPartitions minimum number of partitions for the input RDD
 * @return RDD of LabeledPoint parsed from `path`
 */
 def loadLibSVMFile(sc: SparkContext, path: String, numFeatures: Int, mainPartitions: Int): RDD[LabeledPoint] = {
     // NOTE: the original hard-coded "/home/sc/Desktop/data.txt" and ignored
     // both `path` and `mainPartitions`; use the parameters instead.
     val parsed = sc.textFile(path, mainPartitions)
      .map(_.trim)
      // Drop blank lines and '#' comment lines.
      .filter(line => !(line.isEmpty || line.startsWith("#")))
      .map { line =>
       val items = line.split(' ')  // original had typo `spilt`
       val label = items.head.toDouble
       // Each remaining token is "index:value"; LIBSVM indices are 1-based.
       val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
         val indexAndValue = item.split(':')
         val index = indexAndValue(0).toInt - 1  // convert to 0-based
         val value = indexAndValue(1).toDouble
         (index, value)
       }.unzip
       // Original had `(label, indices,toArray, value.toArray)` — comma typo
       // plus undefined `value`; the tuple is (label, index array, value array).
       (label, indices.toArray, values.toArray)
      }
     // Feature dimension: use the caller's value, or scan the data for the max index.
     val d = if (numFeatures > 0) {
       numFeatures
     } else {
       // `StorageLevel1` in the original is a typo for StorageLevel.
       parsed.persist(StorageLevel.MEMORY_ONLY)  // parsed is scanned twice below
       parsed.map { case (label, indices, values) =>
         indices.lastOption.getOrElse(0)  // indices are sorted in LIBSVM files
       }.reduce(math.max) + 1
     }
     parsed.map { case (label, indices, values) =>
       // Original: `LabledPoint(labels, ...)` — misspelled class and undefined `labels`.
       LabeledPoint(label, Vectors.sparse(d, indices, values))
     }
 }
三、spark examples
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


// scalastyle:off println
package org.apache.spark.examples


import breeze.linalg.{Vector, DenseVector, squaredDistance}


import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


/**
 * K-means clustering.
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to org.apache.spark.mllib.clustering.KMeans
 */
object SparkKMeans {

  /** Parses a line of space-separated numbers into a dense breeze vector. */
  def parseVector(line: String): Vector[Double] = {
    DenseVector(line.split(' ').map(_.toDouble))
  }

  /**
   * Returns the index of the center in `centers` that is closest to `p`
   * under squared Euclidean distance. `centers` must be non-empty.
   */
  def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- centers.indices) {
      val tempDist = squaredDistance(p, centers(i))
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  /** Prints a warning steering users toward the production MLlib KMeans. */
  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")
      System.exit(1)
    }

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkKMeans")
    val sc = new SparkContext(sparkConf)
    val lines = sc.textFile(args(0))
    val data = lines.map(parseVector _).cache()
    val K = args(1).toInt
    val convergeDist = args(2).toDouble

    // Seed the K centers with a fixed-seed sample so runs are reproducible.
    val kPoints = data.takeSample(withReplacement = false, K, 42).toArray
    var tempDist = 1.0

    // Lloyd's algorithm: iterate until the total center movement is small enough.
    while (tempDist > convergeDist) {
      // Assign each point to its nearest center, carrying (point, 1) for averaging.
      val closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))

      // Per-center sum of points and count of members.
      val pointStats = closest.reduceByKey { case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2) }

      // New center = mean of its members; collected to the driver as a Map.
      val newPoints = pointStats.map { pair =>
        (pair._1, pair._2._1 * (1.0 / pair._2._2))
      }.collectAsMap()

      tempDist = 0.0
      for (i <- 0 until K) {
        // A center that attracted no points is absent from `newPoints`; keep its
        // old position instead of crashing (the original `newPoints(i)` lookup
        // throws NoSuchElementException in that case).
        tempDist += squaredDistance(kPoints(i), newPoints.getOrElse(i, kPoints(i)))
      }

      for ((idx, center) <- newPoints) {
        kPoints(idx) = center
      }
      println("Finished iteration (delta = " + tempDist + ")")
    }

    println("Final centers:")
    kPoints.foreach(println)
    sc.stop()
  }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

星之擎

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值