package com.sdcet

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Collaborative-filtering demo: scores pairwise user similarity over film
 * ratings using cosine similarity, driven by a local-mode Spark context.
 *
 * Created by Administrator on 2017/2/16.
 */
object TestColl {
  // On Windows, Spark/Hadoop needs hadoop.home.dir pointing at a winutils
  // install before the SparkContext is created.
  System.setProperty("hadoop.home.dir", "E:\\winutils-hadoop-2.6.4\\hadoop-2.6.4")

  // Simple class name with the '$' suffix Scala appends to object classes removed.
  private val appName = this.getClass.getSimpleName.filter(_ != '$')

  val conf = new SparkConf().setMaster("local").setAppName(appName)
  println(appName)
  val sc = new SparkContext(conf) // local Spark environment

  val users = sc.parallelize(Array("aaa", "bbb", "ccc", "ddd", "eee")) // user names
  val films = sc.parallelize(Array("smzdm", "ylxb", "znh", "nhsc", "fcwr")) // film names

  // user name -> (film name -> rating); populated by getSource().
  var source = Map[String, Map[String, Int]]()
  // NOTE(review): kept for source compatibility; not used by the computation.
  val filmSource = Map[String, Int]()

  /**
   * Populates `source` with each user's film ratings and returns it.
   *
   * @return the nested user -> (film -> rating) map
   */
  def getSource(): Map[String, Map[String, Int]] = {
    source = Map(
      "aaa" -> Map("smzdm" -> 2, "ylxb" -> 3, "znh" -> 1, "nhsc" -> 0, "fcwr" -> 1),
      "bbb" -> Map("smzdm" -> 1, "ylxb" -> 2, "znh" -> 2, "nhsc" -> 1, "fcwr" -> 4),
      "ccc" -> Map("smzdm" -> 2, "ylxb" -> 1, "znh" -> 0, "nhsc" -> 1, "fcwr" -> 4),
      "ddd" -> Map("smzdm" -> 3, "ylxb" -> 2, "znh" -> 0, "nhsc" -> 5, "fcwr" -> 3),
      "eee" -> Map("smzdm" -> 5, "ylxb" -> 3, "znh" -> 1, "nhsc" -> 1, "fcwr" -> 2)
    )
    source
  }

  /**
   * Cosine similarity between the rating vectors of two users.
   *
   * The original implementation zipped the two maps' `values` positionally,
   * which only lines up when both maps happen to iterate in the same key
   * order; here the dot product is keyed by film name, so alignment is
   * guaranteed regardless of map iteration order. Result is identical for
   * users rating the same film set.
   *
   * @param user1 first user name (must be present in `source`)
   * @param user2 second user name (must be present in `source`)
   * @return cosine similarity; NaN if either rating vector is all zeros
   */
  def getCollaborateSource(user1: String, user2: String): Double = {
    // Direct apply fails fast with NoSuchElementException on an unknown user,
    // instead of the opaque Option.get of the original.
    val ratings1 = source(user1)
    val ratings2 = source(user2)
    // Numerator: dot product keyed by film name (missing films count as 0).
    val member = ratings1.map { case (film, r) => r * ratings2.getOrElse(film, 0) }.sum.toDouble
    // Denominator: product of the two Euclidean norms.
    val temp1 = math.sqrt(ratings1.values.map(r => math.pow(r, 2)).sum)
    println("temp1:" + temp1)
    val temp2 = math.sqrt(ratings2.values.map(r => math.pow(r, 2)).sum)
    member / (temp1 * temp2)
  }

  /**
   * Entry point: initializes the ratings, then prints how similar "bbb" and
   * "aaa" are to every user. (The duplicate hadoop.home.dir setting that was
   * here is redundant — object initialization already runs before main.)
   */
  def main(args: Array[String]): Unit = {
    getSource() // initialize the ratings

    // Prints the similarity of `name` against every user in the RDD.
    def printSimilarities(name: String): Unit =
      users.foreach(user =>
        println(name + " 相对于 " + user + "的相似性分数是:" + getCollaborateSource(name, user)))

    printSimilarities("bbb")
    println()
    printSimilarities("aaa")

    sc.stop() // release Spark resources explicitly instead of via shutdown hook
  }
}
Console执行过程:
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 17/02/16 17:17:31 INFO SparkContext: Running Spark version 1.6.3 17/02/16 17:17:33 INFO SecurityManager: Changing view acls to: Administrator 17/02/16 17:17:33 INFO SecurityManager: Changing modify acls to: Administrator 17/02/16 17:17:33 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(Administrator); users with modify permissions: Set(Administrator) 17/02/16 17:17:34 INFO Utils: Successfully started service 'sparkDriver' on port 51564. 17/02/16 17:17:34 INFO Slf4jLogger: Slf4jLogger started 17/02/16 17:17:34 INFO Remoting: Starting remoting 17/02/16 17:17:35 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.164.1:51578] 17/02/16 17:17:35 INFO Utils: Successfully started service 'sparkDriverActorSystem' on port 51578. 17/02/16 17:17:35 INFO SparkEnv: Registering MapOutputTracker 17/02/16 17:17:35 INFO SparkEnv: Registering BlockManagerMaster 17/02/16 17:17:35 INFO DiskBlockManager: Created local directory at C:\Users\Administrator.WIN-20160809ARI\AppData\Local\Temp\blockmgr-97dae514-429b-4b5e-8850-3526d89da8b5 17/02/16 17:17:35 INFO MemoryStore: MemoryStore started with capacity 1807.0 MB 17/02/16 17:17:35 INFO SparkEnv: Registering OutputCommitCoordinator 17/02/16 17:17:36 INFO Utils: Successfully started service 'SparkUI' on port 4040. 17/02/16 17:17:36 INFO SparkUI: Started SparkUI at http://192.168.164.1:4040 17/02/16 17:17:36 INFO Executor: Starting executor ID driver on host localhost 17/02/16 17:17:36 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 51587. 
17/02/16 17:17:36 INFO NettyBlockTransferService: Server created on 51587 17/02/16 17:17:36 INFO BlockManagerMaster: Trying to register BlockManager 17/02/16 17:17:36 INFO BlockManagerMasterEndpoint: Registering block manager localhost:51587 with 1807.0 MB RAM, BlockManagerId(driver, localhost, 51587) 17/02/16 17:17:36 INFO BlockManagerMaster: Registered BlockManager 17/02/16 17:17:37 INFO SparkContext: Starting job: foreach at TestColl.scala:60 17/02/16 17:17:37 INFO DAGScheduler: Got job 0 (foreach at TestColl.scala:60) with 1 output partitions 17/02/16 17:17:37 INFO DAGScheduler: Final stage: ResultStage 0 (foreach at TestColl.scala:60) 17/02/16 17:17:37 INFO DAGScheduler: Parents of final stage: List() 17/02/16 17:17:37 INFO DAGScheduler: Missing parents: List() 17/02/16 17:17:37 INFO DAGScheduler: Submitting ResultStage 0 (ParallelCollectionRDD[0] at parallelize at TestColl.scala:14), which has no missing parents 17/02/16 17:17:37 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 1248.0 B, free 1807.0 MB) 17/02/16 17:17:37 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 877.0 B, free 1807.0 MB) 17/02/16 17:17:37 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:51587 (size: 877.0 B, free: 1807.0 MB) 17/02/16 17:17:37 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:1006 17/02/16 17:17:37 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (ParallelCollectionRDD[0] at parallelize at TestColl.scala:14) 17/02/16 17:17:37 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks 17/02/16 17:17:37 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, localhost, partition 0,PROCESS_LOCAL, 2110 bytes) 17/02/16 17:17:37 INFO Executor: Running task 0.0 in stage 0.0 (TID 0) temp1:5.0990195135927845 bbb 相对于 aaa的相似性分数是:0.7089175569585667 temp1:5.0990195135927845 bbb 相对于 bbb的相似性分数是:1.0000000000000002 temp1:5.0990195135927845 bbb 相对于 
ccc的相似性分数是:0.8780541105074453 temp1:5.0990195135927845 bbb 相对于 ddd的相似性分数是:0.6865554812287477 temp1:5.0990195135927845 bbb 相对于 eee的相似性分数是:0.6821910402406466 17/02/16 17:17:37 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 915 bytes result sent to driver 17/02/16 17:17:37 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 151 ms on localhost (1/1) 17/02/16 17:17:37 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 17/02/16 17:17:37 INFO DAGScheduler: ResultStage 0 (foreach at TestColl.scala:60) finished in 0.181 s 17/02/16 17:17:37 INFO DAGScheduler: Job 0 finished: foreach at TestColl.scala:60, took 0.514896 s 17/02/16 17:17:37 INFO SparkContext: Starting job: foreach at TestColl.scala:66 17/02/16 17:17:37 INFO DAGScheduler: Got job 1 (foreach at TestColl.scala:66) with 1 output partitions 17/02/16 17:17:37 INFO DAGScheduler: Final stage: ResultStage 1 (foreach at TestColl.scala:66) 17/02/16 17:17:37 INFO DAGScheduler: Parents of final stage: List() 17/02/16 17:17:37 INFO DAGScheduler: Missing parents: List() 17/02/16 17:17:37 INFO DAGScheduler: Submitting ResultStage 1 (ParallelCollectionRDD[0] at parallelize at TestColl.scala:14), which has no missing parents 17/02/16 17:17:37 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 1248.0 B, free 1807.0 MB) 17/02/16 17:17:37 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 876.0 B, free 1807.0 MB) 17/02/16 17:17:37 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on localhost:51587 (size: 876.0 B, free: 1807.0 MB) 17/02/16 17:17:37 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1006 17/02/16 17:17:37 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (ParallelCollectionRDD[0] at parallelize at TestColl.scala:14) 17/02/16 17:17:37 INFO TaskSchedulerImpl: Adding task set 1.0 with 1 tasks 17/02/16 17:17:37 INFO TaskSetManager: Starting task 0.0 in 
stage 1.0 (TID 1, localhost, partition 0,PROCESS_LOCAL, 2110 bytes) 17/02/16 17:17:37 INFO Executor: Running task 0.0 in stage 1.0 (TID 1) temp1:3.872983346207417 aaa 相对于 aaa的相似性分数是:0.9999999999999999 temp1:3.872983346207417 aaa 相对于 bbb的相似性分数是:0.7089175569585667 temp1:3.872983346207417 aaa 相对于 ccc的相似性分数是:0.6055300708194983 temp1:3.872983346207417 aaa 相对于 ddd的相似性分数是:0.564932682866032 temp1:3.872983346207417 aaa 相对于 eee的相似性分数是:0.8981462390204985 17/02/16 17:17:37 INFO Executor: Finished task 0.0 in stage 1.0 (TID 1). 915 bytes result sent to driver 17/02/16 17:17:37 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 18 ms on localhost (1/1) 17/02/16 17:17:37 INFO DAGScheduler: ResultStage 1 (foreach at TestColl.scala:66) finished in 0.018 s 17/02/16 17:17:37 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool 17/02/16 17:17:37 INFO DAGScheduler: Job 1 finished: foreach at TestColl.scala:66, took 0.033592 s 17/02/16 17:17:37 INFO SparkContext: Invoking stop() from shutdown hook 17/02/16 17:17:37 INFO SparkUI: Stopped Spark web UI at http://192.168.164.1:4040 17/02/16 17:17:37 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped! 17/02/16 17:17:37 INFO MemoryStore: MemoryStore cleared 17/02/16 17:17:37 INFO BlockManager: BlockManager stopped 17/02/16 17:17:37 INFO BlockManagerMaster: BlockManagerMaster stopped 17/02/16 17:17:37 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped! 17/02/16 17:17:37 INFO SparkContext: Successfully stopped SparkContext 17/02/16 17:17:37 INFO ShutdownHookManager: Shutdown hook called 17/02/16 17:17:37 INFO ShutdownHookManager: Deleting directory C:\Users\Administrator.WIN-20160809ARI\AppData\Local\Temp\spark-2027dd67-acbd-4a33-a140-6e978a86a839 17/02/16 17:17:37 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon. 
17/02/16 17:17:37 INFO RemoteActorRefProvider$RemotingTerminator: Remote daemon shut down; proceeding with flushing remote transports. Process finished with exit code 0
Spark 相似度算法
最新推荐文章于 2023-02-20 18:33:39 发布