// Scala implementation of the algorithms
package com.bbw5.dataalgorithms.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.HashMap
import scala.collection.mutable.ArrayBuffer
/**
* SparkRecommendItems is a Spark program that implements two basic item
* recommendation techniques: "customers who bought this item also bought"
* (CWBTIAB) and "frequently bought together" (FBT).
* CWBTIAB.txt:
* user1,item1
* user1,item2
* user1,item3
* user1,item2
* user1,item3
* user2,item2
* user2,item4
* user2,item5
* user3,item1
* user3,item2
* user4,item4
* user4,item5
*
* FBT.txt
* T1:P1,P2,P3
* T2:P2,P3,P4
* T3:P4,P3,P8
* T4:P2,P3,P9,P10
* T5:P4,P3,P8,P12
* T6:P1,P3,P8
* T7:P2,P3,P8
*
* @author baibaiw5
*
*/
object SparkRecommendItems {

  /** Default input for `customersWhoBoughtThisItemAlsoBought`: one `user,item` pair per line. */
  private val DefaultCwbtiabPath = "G:/temp/data/CWBTIAB.txt"

  /** Default input for `frequentlyBoughtTogether`: one `transaction:item,item,...` per line. */
  private val DefaultFbtPath = "G:/temp/data/FBT.txt"

  /**
   * Entry point: runs both recommendation jobs and always stops the Spark context.
   *
   * @param args optional overrides; `args(0)` is the CWBTIAB input path and
   *             `args(1)` is the FBT input path. Missing entries fall back to
   *             the built-in default paths.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkRecommendItems")
    val sc = new SparkContext(sparkConf)
    try {
      customersWhoBoughtThisItemAlsoBought(sc, args.lift(0).getOrElse(DefaultCwbtiabPath))
      frequentlyBoughtTogether(sc, args.lift(1).getOrElse(DefaultFbtPath))
    } finally {
      // Release the context even when one of the jobs fails.
      sc.stop()
    }
  }

  /**
   * "Customers who bought this item also bought": for every item, counts how
   * often each other item appears in the purchase list of the same user, then
   * prints the `topN` most frequent companions per item.
   *
   * Three stages are printed to stdout, in this order: the per-user stripes,
   * the merged co-occurrence counts per item, and the top-N lists.
   *
   * @param sc   Spark context used to read the input
   * @param path text file with one `user,item` pair per line; lines with fewer
   *             than two comma-separated fields are skipped
   * @param topN number of companion items kept per item
   */
  def customersWhoBoughtThisItemAlsoBought(
      sc: SparkContext,
      path: String = DefaultCwbtiabPath,
      topN: Int = 2): Unit = {
    // Split each line once and drop lines that cannot yield a (user, item) pair.
    val purchases = sc.textFile(path)
      .map(_.split(","))
      .filter(_.length >= 2)
      .map(fields => (fields(0), fields(1)))

    // One "stripe" (other item -> count) per purchased item per user.
    // NOTE(review): a user's repeated purchases are kept as-is, so every repeat
    // emits another stripe and is counted again inside the other stripes.
    // Confirm whether purchases should be de-duplicated per user first.
    val perUserStripes = purchases.groupByKey().flatMap { case (_, bought) =>
      val items = bought.toList
      items.map { item =>
        val stripe = items
          .filter(_ != item)
          .groupBy(identity)
          .map { case (other, occurrences) => (other, occurrences.size) }
        (item, stripe)
      }
    }.cache() // reused by the debug print and by the merge below
    perUserStripes.collect().foreach(println)

    // Merge the stripes of each item; reduceByKey combines on the map side
    // instead of shipping every stripe through the shuffle.
    val cooccurrence = perUserStripes.reduceByKey { (left, right) =>
      right.foldLeft(left) { case (acc, (other, n)) =>
        acc.updated(other, acc.getOrElse(other, 0) + n)
      }
    }.cache() // reused by the debug print and by the top-N step
    cooccurrence.collect().foreach(println)

    // Highest count first; ties are broken by item id so the output is stable.
    val recommendations = cooccurrence.mapValues { stripe =>
      stripe.toList.sortBy { case (other, n) => (-n, other) }.take(topN)
    }
    recommendations.collect().foreach(println)

    cooccurrence.unpersist()
    perUserStripes.unpersist()
  }

  /**
   * "Frequently bought together": counts, over all transactions, how often each
   * item combination occurs and prints the combinations whose count reaches
   * `minSupport`.
   *
   * Two stages are printed to stdout: every candidate combination with a count
   * of 1, then the combinations that meet the support threshold.
   *
   * @param sc         Spark context used to read the input
   * @param path       text file with one `transaction:item,item,...` per line;
   *                   lines without an item list after the colon are skipped
   * @param minSupport minimum number of occurrences for a combination to be kept
   */
  def frequentlyBoughtTogether(
      sc: SparkContext,
      path: String = DefaultFbtPath,
      minSupport: Int = 2): Unit = {
    // Items are sorted once per transaction so that the same combination always
    // produces the same key, whatever order the items were listed in.
    val transactions = sc.textFile(path)
      .map(_.split("[:]"))
      .filter(_.length >= 2)
      .map(parts => parts(1).split(",").sorted)

    // Every non-empty combination of every size; the number of candidates grows
    // exponentially with the length of a transaction.
    val candidates = transactions.flatMap { items =>
      (1 to items.length).flatMap { size =>
        items.combinations(size).map(combo => (combo.toList, 1))
      }
    }.cache() // reused by the debug print and by the aggregation below
    candidates.collect().foreach(println)

    val frequent = candidates
      .reduceByKey(_ + _)
      .filter { case (_, support) => support >= minSupport }
    frequent.collect().foreach(println)

    candidates.unpersist()
  }
}