Assorted kmeans notes

Original post: 2016-08-28 23:20:46

1. Parsing LibSVM input in the spark-shell
scala> val parsed = {
     |   sc.textFile("/home/sc/Desktop/data.txt", 2).map(_.trim)
     |     .filter(line => !(line.isEmpty || line.startsWith("#")))
     |     .map { line =>
     |       // each line is "label index1:value1 index2:value2 ..."
     |       val items = line.split(' ')
     |       val label = items.head.toDouble
     |       val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
     |         val indexAndValue = item.split(':')
     |         val index = indexAndValue(0).toInt - 1 // LibSVM indices are 1-based
     |         val value = indexAndValue(1).toDouble
     |         (index, value)
     |       }.unzip
     |       (label, indices.toArray, values.toArray)
     |     }
     | }
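
For reference, each non-comment line of data.txt is expected to be in LibSVM format: a label followed by 1-based index:value pairs. An illustrative file:

# data.txt (illustrative contents)
1.0 1:0.5 3:1.2 7:0.9
0.0 2:1.0 4:0.3

Parsing the first line yields (1.0, Array(0, 2, 6), Array(0.5, 1.2, 0.9)); the indices shift down by one because the code converts them from 1-based to 0-based.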
2. The complete loadLibSVMFile function
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.storage.StorageLevel

def loadLibSVMFile(
    sc: SparkContext,
    path: String,
    numFeatures: Int,
    minPartitions: Int): RDD[LabeledPoint] = {
  // Parse each line into (label, indices, values), as in section 1
  val parsed = sc.textFile(path, minPartitions).map(_.trim)
    .filter(line => !(line.isEmpty || line.startsWith("#")))
    .map { line =>
      val items = line.split(' ')
      val label = items.head.toDouble
      val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
        val indexAndValue = item.split(':')
        val index = indexAndValue(0).toInt - 1 // convert 1-based index to 0-based
        val value = indexAndValue(1).toDouble
        (index, value)
      }.unzip
      (label, indices.toArray, values.toArray)
    }

  // Feature dimension: take the caller's numFeatures when positive, otherwise
  // infer it as (largest index seen) + 1; LibSVM indices are ascending, so the
  // last index of each record is its largest
  val d = if (numFeatures > 0) {
    numFeatures
  } else {
    parsed.persist(StorageLevel.MEMORY_ONLY)
    parsed.map { case (label, indices, values) =>
      indices.lastOption.getOrElse(0)
    }.reduce(math.max) + 1
  }

  parsed.map { case (label, indices, values) =>
    LabeledPoint(label, Vectors.sparse(d, indices, values))
  }
}
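
A quick way to exercise this from the spark-shell, reusing the sample file from section 1 (passing numFeatures = -1 lets the dimension be inferred from the data):

scala> val points = loadLibSVMFile(sc, "/home/sc/Desktop/data.txt", -1, 2)
scala> points.take(2).foreach(println)

Each element is a LabeledPoint wrapping a sparse vector of dimension d. For anything beyond a learning exercise, prefer the library version, MLUtils.loadLibSVMFile in org.apache.spark.mllib.util.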
3. Spark example: SparkKMeans
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


// scalastyle:off println
package org.apache.spark.examples


import breeze.linalg.{Vector, DenseVector, squaredDistance}


import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._


/**
 * K-means clustering.
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to org.apache.spark.mllib.clustering.KMeans
 */
object SparkKMeans {


  def parseVector(line: String): Vector[Double] = {
    DenseVector(line.split(' ').map(_.toDouble))
  }


  def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity


    for (i <- 0 until centers.length) {
      val tempDist = squaredDistance(p, centers(i))
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }


    bestIndex
  }
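
  // Example: with centers = Array(DenseVector(0.0, 0.0), DenseVector(5.0, 5.0)),
  // closestPoint(DenseVector(1.0, 1.0), centers) returns 0, since the squared
  // distance 2.0 to the first center beats 32.0 to the second.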


  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }


  def main(args: Array[String]) {


    if (args.length < 3) {
      System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")
      System.exit(1)
    }


    showWarning()


    val sparkConf = new SparkConf().setAppName("SparkKMeans")
    val sc = new SparkContext(sparkConf)
    val lines = sc.textFile(args(0))
    val data = lines.map(parseVector _).cache()
    val K = args(1).toInt
    val convergeDist = args(2).toDouble


    val kPoints = data.takeSample(withReplacement = false, K, 42).toArray
    var tempDist = 1.0


    while (tempDist > convergeDist) {
      // Map each point to (index of its closest center, (point, count = 1))
      val closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))


      // Per cluster: sum the assigned points and count them
      val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)}


      // New center = cluster sum / cluster count, i.e. the mean point
      val newPoints = pointStats.map {pair =>
        (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap()


      // Accumulate how far the centers moved in this iteration
      tempDist = 0.0
      for (i <- 0 until K) {
        tempDist += squaredDistance(kPoints(i), newPoints(i))
      }


      // Install the new centers for the next iteration
      for (newP <- newPoints) {
        kPoints(newP._1) = newP._2
      }
      println("Finished iteration (delta = " + tempDist + ")")
    }


    println("Final centers:")
    kPoints.foreach(println)
    sc.stop()
  }
}
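
The example expects a text file with one point per line and coordinates separated by spaces, which is what parseVector consumes. A sketch of a run; the examples jar path below is a placeholder that depends on the Spark version:

# points.txt (illustrative): four points forming two obvious clusters
0.0 0.0
1.0 1.0
8.0 9.0
9.0 8.0

bin/spark-submit --class org.apache.spark.examples.SparkKMeans \
  path/to/spark-examples.jar points.txt 2 0.001

The three arguments correspond to <file>, <k> and <convergeDist> in the usage message; iteration stops once the total squared movement of the centers drops below convergeDist.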
Copyright notice: this is the author's original work; do not repost without permission.
