Spark Similarity Algorithm (Cosine Similarity)
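This example hand-rolls user-based similarity scoring on Spark: five users each rate five films, and the similarity between two users is the cosine of the angle between their rating vectors:

    cos(u, v) = (Σᵢ uᵢ·vᵢ) / (√(Σᵢ uᵢ²) · √(Σᵢ vᵢ²))

The ratings hard-coded in getSource() below are:

           smzdm  ylxb  znh  nhsc  fcwr
    aaa      2     3     1    0     1
    bbb      1     2     2    1     4
    ccc      2     1     0    1     4
    ddd      3     2     0    5     3
    eee      5     3     1    1     2

As a hand check, for aaa = (2, 3, 1, 0, 1) and bbb = (1, 2, 2, 1, 4) the numerator is 2·1 + 3·2 + 1·2 + 0·1 + 1·4 = 14 and the denominator is √15 · √26 ≈ 19.7484, so cos(aaa, bbb) ≈ 14 / 19.7484 ≈ 0.70892, matching the 0.7089175569585667 printed in the console output at the end.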

package com.sdcet

import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by Administrator on 2017/2/16.
  */
object TestColl {
  // Point Spark at a local winutils install so the job can run on Windows
  System.setProperty("hadoop.home.dir", "E:\\winutils-hadoop-2.6.4\\hadoop-2.6.4")
  // Local-mode configuration; the app name is the class name with the trailing '$' stripped
  val conf = new SparkConf().setMaster("local").setAppName(this.getClass.getSimpleName.filter(_ != '$'))
  println(this.getClass.getSimpleName.filter(_ != '$'))
  // Instantiate the Spark context
  val sc = new SparkContext(conf)
  // The users
  val users = sc.parallelize(Array("aaa", "bbb", "ccc", "ddd", "eee"))
  // The film titles
  val films = sc.parallelize(Array("smzdm", "ylxb", "znh", "nhsc", "fcwr"))

  // Nested map from user name to a map of film title -> rating
  var source = Map[String, Map[String, Int]]()

  // Build the rating data
  def getSource(): Map[String, Map[String, Int]] = {
    // Each user's film ratings
    val user1FilmSource = Map("smzdm" -> 2, "ylxb" -> 3, "znh" -> 1, "nhsc" -> 0, "fcwr" -> 1)
    val user2FilmSource = Map("smzdm" -> 1, "ylxb" -> 2, "znh" -> 2, "nhsc" -> 1, "fcwr" -> 4)
    val user3FilmSource = Map("smzdm" -> 2, "ylxb" -> 1, "znh" -> 0, "nhsc" -> 1, "fcwr" -> 4)
    val user4FilmSource = Map("smzdm" -> 3, "ylxb" -> 2, "znh" -> 0, "nhsc" -> 5, "fcwr" -> 3)
    val user5FilmSource = Map("smzdm" -> 5, "ylxb" -> 3, "znh" -> 1, "nhsc" -> 1, "fcwr" -> 2)
    // Index each rating map by user name
    source += ("aaa" -> user1FilmSource)
    source += ("bbb" -> user2FilmSource)
    source += ("ccc" -> user3FilmSource)
    source += ("ddd" -> user4FilmSource)
    source += ("eee" -> user5FilmSource)
    source // return the nested map
  }

  // Score a pair of users with cosine similarity
  def getCollaborateSource(user1: String, user2: String): Double = {
    // All rating maps share the same key set, so .values yields the scores in the same film order for both users
    val user1FilmSource = source(user1).values.toVector // the first user's ratings
    val user2FilmSource = source(user2).values.toVector // the second user's ratings
    // Numerator: the dot product of the two rating vectors
    val member = user1FilmSource.zip(user2FilmSource).map(d => d._1 * d._2).reduce(_ + _).toDouble
    // First factor of the denominator: the Euclidean norm of user1's vector
    val temp1 = math.sqrt(user1FilmSource.map(num => math.pow(num, 2)).reduce(_ + _))
    println("temp1:" + temp1)
    // Second factor of the denominator: the Euclidean norm of user2's vector
    val temp2 = math.sqrt(user2FilmSource.map(num => math.pow(num, 2)).reduce(_ + _))
    val denominator = temp1 * temp2 // the full denominator
    member / denominator // the cosine similarity
  }
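
  // Note (illustrative, not in the original post): a user's similarity with
  // itself should be exactly 1, but floating-point rounding can leave a tiny
  // residue; the run below prints 1.0000000000000002 for bbb vs bbb.
  // A tolerant self-check could read:
  //   assert(math.abs(getCollaborateSource("bbb", "bbb") - 1.0) < 1e-9)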

  def main(args: Array[String]): Unit = {
    getSource() // populate the ratings
    var name = "bbb" // the target user
    users.foreach(user => {
      // score the target user against every user
      println(name + " vs " + user + " similarity: " + getCollaborateSource(name, user))
    })
    println()
    name = "aaa"
    users.foreach(user => {
      println(name + " vs " + user + " similarity: " + getCollaborateSource(name, user))
    })
  }

}
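
The two foreach blocks in main differ only in the target user. As an aside (an illustrative sketch, not part of the original post), the same scores for every pair could be produced in a single job with RDD.cartesian, e.g. via an extra method inside TestColl:

def allPairs(): Unit = {
  getSource() // make sure the ratings are populated
  // Build an RDD of every (user1, user2) combination, including self-pairs,
  // and score each one with the cosine routine above
  users.cartesian(users).foreach { case (u1, u2) =>
    println(u1 + " vs " + u2 + " similarity: " + getCollaborateSource(u1, u2))
  }
}

On a real cluster the source map travels to the executors inside the task closure; for anything larger than this toy dataset it would normally be shared via sc.broadcast instead.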

Console output:

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
17/02/16 17:17:31 INFO SparkContext: Running Spark version 1.6.3
17/02/16 17:17:33 INFO SecurityManager: Changing view acls to: Administrator
17/02/16 17:17:33 INFO SecurityManager: Changing modify acls to: Administrator
17/02/16 17:17:33 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(Administrator); users with modify permissions: Set(Administrator)
17/02/16 17:17:34 INFO Utils: Successfully started service 'sparkDriver' on port 51564.
17/02/16 17:17:34 INFO Slf4jLogger: Slf4jLogger started
17/02/16 17:17:34 INFO Remoting: Starting remoting
17/02/16 17:17:35 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.164.1:51578]
17/02/16 17:17:35 INFO Utils: Successfully started service 'sparkDriverActorSystem' on port 51578.
17/02/16 17:17:35 INFO SparkEnv: Registering MapOutputTracker
17/02/16 17:17:35 INFO SparkEnv: Registering BlockManagerMaster
17/02/16 17:17:35 INFO DiskBlockManager: Created local directory at C:\Users\Administrator.WIN-20160809ARI\AppData\Local\Temp\blockmgr-97dae514-429b-4b5e-8850-3526d89da8b5
17/02/16 17:17:35 INFO MemoryStore: MemoryStore started with capacity 1807.0 MB
17/02/16 17:17:35 INFO SparkEnv: Registering OutputCommitCoordinator
17/02/16 17:17:36 INFO Utils: Successfully started service 'SparkUI' on port 4040.
17/02/16 17:17:36 INFO SparkUI: Started SparkUI at http://192.168.164.1:4040
17/02/16 17:17:36 INFO Executor: Starting executor ID driver on host localhost
17/02/16 17:17:36 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 51587.
17/02/16 17:17:36 INFO NettyBlockTransferService: Server created on 51587
17/02/16 17:17:36 INFO BlockManagerMaster: Trying to register BlockManager
17/02/16 17:17:36 INFO BlockManagerMasterEndpoint: Registering block manager localhost:51587 with 1807.0 MB RAM, BlockManagerId(driver, localhost, 51587)
17/02/16 17:17:36 INFO BlockManagerMaster: Registered BlockManager
17/02/16 17:17:37 INFO SparkContext: Starting job: foreach at TestColl.scala:60
17/02/16 17:17:37 INFO DAGScheduler: Got job 0 (foreach at TestColl.scala:60) with 1 output partitions
17/02/16 17:17:37 INFO DAGScheduler: Final stage: ResultStage 0 (foreach at TestColl.scala:60)
17/02/16 17:17:37 INFO DAGScheduler: Parents of final stage: List()
17/02/16 17:17:37 INFO DAGScheduler: Missing parents: List()
17/02/16 17:17:37 INFO DAGScheduler: Submitting ResultStage 0 (ParallelCollectionRDD[0] at parallelize at TestColl.scala:14), which has no missing parents
17/02/16 17:17:37 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 1248.0 B, free 1807.0 MB)
17/02/16 17:17:37 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 877.0 B, free 1807.0 MB)
17/02/16 17:17:37 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:51587 (size: 877.0 B, free: 1807.0 MB)
17/02/16 17:17:37 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006
17/02/16 17:17:37 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (ParallelCollectionRDD[0] at parallelize at TestColl.scala:14)
17/02/16 17:17:37 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
17/02/16 17:17:37 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2110 bytes)
17/02/16 17:17:37 INFO Executor: Running task 0.0 in stage 0.0 (TID 0)
temp1:5.0990195135927845
bbb vs aaa similarity: 0.7089175569585667
temp1:5.0990195135927845
bbb vs bbb similarity: 1.0000000000000002
temp1:5.0990195135927845
bbb vs ccc similarity: 0.8780541105074453
temp1:5.0990195135927845
bbb vs ddd similarity: 0.6865554812287477
temp1:5.0990195135927845
bbb vs eee similarity: 0.6821910402406466
17/02/16 17:17:37 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver
17/02/16 17:17:37 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 151 ms on localhost (1/1)
17/02/16 17:17:37 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
17/02/16 17:17:37 INFO DAGScheduler: ResultStage 0 (foreach at TestColl.scala:60) finished in 0.181 s
17/02/16 17:17:37 INFO DAGScheduler: Job 0 finished: foreach at TestColl.scala:60, took 0.514896 s
17/02/16 17:17:37 INFO SparkContext: Starting job: foreach at TestColl.scala:66
17/02/16 17:17:37 INFO DAGScheduler: Got job 1 (foreach at TestColl.scala:66) with 1 output partitions
17/02/16 17:17:37 INFO DAGScheduler: Final stage: ResultStage 1 (foreach at TestColl.scala:66)
17/02/16 17:17:37 INFO DAGScheduler: Parents of final stage: List()
17/02/16 17:17:37 INFO DAGScheduler: Missing parents: List()
17/02/16 17:17:37 INFO DAGScheduler: Submitting ResultStage 1 (ParallelCollectionRDD[0] at parallelize at TestColl.scala:14), which has no missing parents
17/02/16 17:17:37 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 1248.0 B, free 1807.0 MB)
17/02/16 17:17:37 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 876.0 B, free 1807.0 MB)
17/02/16 17:17:37 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on localhost:51587 (size: 876.0 B, free: 1807.0 MB)
17/02/16 17:17:37 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1006
17/02/16 17:17:37 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (ParallelCollectionRDD[0] at parallelize at TestColl.scala:14)
17/02/16 17:17:37 INFO TaskSchedulerImpl: Adding task set 1.0 with 1 tasks
17/02/16 17:17:37 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, localhost, partition 0,PROCESS_LOCAL, 2110 bytes)
17/02/16 17:17:37 INFO Executor: Running task 0.0 in stage 1.0 (TID 1)
temp1:3.872983346207417
aaa vs aaa similarity: 0.9999999999999999
temp1:3.872983346207417
aaa vs bbb similarity: 0.7089175569585667
temp1:3.872983346207417
aaa vs ccc similarity: 0.6055300708194983
temp1:3.872983346207417
aaa vs ddd similarity: 0.564932682866032
temp1:3.872983346207417
aaa vs eee similarity: 0.8981462390204985
17/02/16 17:17:37 INFO Executor: Finished task 0.0 in stage 1.0 (TID 1). 915 bytes result sent to driver
17/02/16 17:17:37 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 18 ms on localhost (1/1)
17/02/16 17:17:37 INFO DAGScheduler: ResultStage 1 (foreach at TestColl.scala:66) finished in 0.018 s
17/02/16 17:17:37 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
17/02/16 17:17:37 INFO DAGScheduler: Job 1 finished: foreach at TestColl.scala:66, took 0.033592 s
17/02/16 17:17:37 INFO SparkContext: Invoking stop() from shutdown hook
17/02/16 17:17:37 INFO SparkUI: Stopped Spark web UI at http://192.168.164.1:4040
17/02/16 17:17:37 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
17/02/16 17:17:37 INFO MemoryStore: MemoryStore cleared
17/02/16 17:17:37 INFO BlockManager: BlockManager stopped
17/02/16 17:17:37 INFO BlockManagerMaster: BlockManagerMaster stopped
17/02/16 17:17:37 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
17/02/16 17:17:37 INFO SparkContext: Successfully stopped SparkContext
17/02/16 17:17:37 INFO ShutdownHookManager: Shutdown hook called
17/02/16 17:17:37 INFO ShutdownHookManager: Deleting directory C:\Users\Administrator.WIN-20160809ARI\AppData\Local\Temp\spark-2027dd67-acbd-4a33-a140-6e978a86a839
17/02/16 17:17:37 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.
17/02/16 17:17:37 INFO RemoteActorRefProvider$RemotingTerminator: Remote daemon shut down; proceeding with flushing remote transports.
Process finished with exit code 0
