[Data Algorithms: Recipes for Scaling Up with Hadoop and Spark] Chapter 13: k-Nearest Neighbors

Scala implementation of the algorithm:

package com.bbw5.dataalgorithms.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import breeze.linalg.DenseVector
import breeze.linalg.norm
/**
 * This class solves the k-nearest-neighbor join operation using the Spark API.
 *
 * knn-query.txt:
 * <unique-record-id><;><a-1><,><a-2><,>...<,><a-d>
 * knn-train.txt:
 * <unique-record-id><;><classification-id><;><b-1><,><b-2><,>...<,><b-d>
 *
 * author: baibaiw5
 */
object SparkKNN {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkKNN")
    val sc = new SparkContext(sparkConf)

    val K = sc.broadcast(3) // number of nearest neighbors used in the vote
    val textQuery = sc.textFile("G:/temp/data/knn-query.txt")
    val textTrain = sc.textFile("G:/temp/data/knn-train.txt")

    // query records: (record-id, feature vector)
    val knnQuery = textQuery.map { _.split(";") }.map { array =>
      array(0) -> DenseVector(array(1).split(",").map { _.toDouble })
    }

    // training records: ((record-id, classification-id), feature vector)
    val knnTrain = textTrain.map { _.split(";") }.map { array =>
      array(0) -> array(1).toInt -> DenseVector(array(2).split(",").map { _.toDouble })
    }

    // debug: print the parsed inputs
    knnQuery.collect().foreach(println)
    knnTrain.collect().foreach(println)

    val data = knnQuery.cartesian(knnTrain).map {
      case ((queryId, queryVec), ((_, label), trainVec)) =>
        // (query-id, (euclidean-distance, label))
        queryId -> (norm(queryVec - trainVec) -> label)
    }.groupByKey().mapValues { items =>
      // keep the k nearest points, then vote for the majority label
      items.toArray.sortBy(_._1).take(K.value).map(_._2).groupBy(identity).mapValues {
        _.size
      }.toArray.sortBy(-_._2).apply(0)._1
    }

    data.collect().foreach(println)
  }
}
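
To make the join and voting step concrete, here is a minimal sample run. The record values below are invented for illustration; only the field layout follows the formats described in the header comment.

knn-query.txt:

q1;1.0,1.0
q2;9.0,9.0

knn-train.txt:

t1;100;1.1,0.9
t2;100;0.8,1.2
t3;200;8.9,9.1
t4;200;9.2,8.8
t5;100;2.0,2.0

With K = 3, cartesian pairs every query point with every training point, the three smallest Euclidean distances are kept per query id, and the most frequent label among them wins: the nearest neighbors of q1 are t1, t2, t5 (all label 100), and those of q2 are t3, t4 (label 200) plus t5 (label 100), so data.collect() prints (q1,100) and (q2,200).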

