最近项目中用到ItemBased Collaborative Filtering,实践过spark mllib中的ALS,但是因为其中涉及到降维操作,大数据量的计算实在不能恭维。
所以自己动手实现了基于 Spark 的分布式 Item-Based CF,并已做了部分优化,目测运行效率还不错。以下为代码实现:
package model
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame}
import org.apache.spark.sql.hive.HiveContext
/**
* Created by dengxing on 2017/7/18.
*/
object CF {
/** Loads the raw (uid, aid, cnt) rating rows for a single partition day
 * from the given Hive table.
 *
 * @param sc    the active SparkContext, used to construct a HiveContext
 * @param table name of the Hive table to read from
 * @param day   partition value matched against the `dt` column
 * @return DataFrame with columns (uid, aid, cnt) for the requested day
 */
def getResource(sc: SparkContext, table: String, day: String): DataFrame = {
  val hiveContext = new HiveContext(sc)
  // NOTE(review): `table` and `day` are spliced directly into the SQL text;
  // HiveContext.sql offers no bind parameters, so this is only safe with
  // trusted, internally-generated arguments — never user-supplied input.
  hiveContext.sql(
    s"select uid, aid, cnt from $table where dt = '$day'")
}
/**
* 分布式计算余弦相似度
* --------------------------------
* user1 user2
* item1 score11 score21 (X)
* item2 score12 score22 (Y)
* --------------------------------
* sim(item1,item2) = XY / (math.sqrt(XX) * math.sqrt(YY))
* XY= score11 * score12 + score21 * score22
* XX = score11 * score11 + score21 * score21
* YY = score12 * score12 + score22 * score22
*
* @param resource
* @return RDD[(item1,item2,sim)]
*/
def getCosineSimilarity(resource: DataFrame): RDD[(String, (String, Double))] = {
val rating = resource.map {
row => {
val uid = row.getString(0)
val aid = row.getString(1)
val score = row.getString(2).toDouble
(uid, aid, s