package com.vic.crm.ifib.recom
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD
import org.jblas.DoubleMatrix
/**
* 用户理财产品推荐
*/
object Ifib {
//修改spark日志级别
Logger.getLogger("org").setLevel(Level.ERROR)
val logger = Logger.getLogger("Ifib")
/** Root-mean-square error (RMSE) between the ratings predicted by `model`
 * and the observed ratings in `data`. */
def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating]): Double = {
  // Ask the model for a rating on every (user, product) pair in the data.
  val predicted = model.predict(data.map(r => (r.user, r.product)))
  // Pair each prediction with its observed rating via a keyed join.
  val predVsActual = predicted
    .map(r => ((r.user, r.product), r.rating))
    .join(data.map(r => ((r.user, r.product), r.rating)))
    .values
  val squaredErrorSum = predVsActual.map { case (p, a) => (p - a) * (p - a) }.reduce(_ + _)
  math.sqrt(squaredErrorSum / data.count)
}
/** Entry point: trains an ALS collaborative-filtering model on implicit
 * "balance bucket" ratings of wealth-management products, then evaluates
 * the recommendations with mean average precision at K (MAPK) over all
 * users. All inputs are local pipe-delimited text files (Windows paths). */
def main(args: Array[String]) {
// Windows-only workaround so Spark/Hadoop can locate winutils locally.
System.setProperty("hadoop.home.dir", "D:\\hadoop-2.2.0");
val conf = new SparkConf().setAppName("vic-ifib").setMaster("local")
val sc = new SparkContext(conf)
// fp_cod_pid.txt rows: userid|pid|productcod|bal.
// Bucket the held balance into an implicit rating 0.0-5.0
// (larger balance => stronger preference signal).
val rawData = sc.textFile("D:\\fp_cod_pid.txt").map(_.split("\\|")).map { case Array(userid, pid, productcod, bal) =>
val bal1 = bal match{
case bal if bal.toDouble>=10000 && bal.toDouble<50000 => 1.0
case bal if bal.toDouble>=50000 && bal.toDouble<100000 => 2.0
case bal if bal.toDouble>=100000 && bal.toDouble<1000000 => 3.0
case bal if bal.toDouble>=1000000 && bal.toDouble<5000000 => 4.0
case bal if bal.toDouble>=5000000 =>5.0
case _ => 0.0
}
// Keep rows with a non-empty product code whose pid has '0' at index 12
// (presumably a status/validity flag — TODO confirm with the data owner).
(productcod, (pid, bal1)) }.filter(x => x._1 != "" && x._2._1.charAt(12) == '0')
// fp_cod_id.txt rows: productid|productcod, keyed by product code.
val productData = sc.textFile("D:\\fp_cod_id.txt").map(_.split("\\|")).map { case Array(productid, productcod) => (productcod, productid) }
// Join on product code to obtain (pid, (numeric productid, rating)).
val pid_productid_bal = rawData.join(productData).map { case (productcod, ((pid, bal), productid)) => (pid, (productid, bal)) }
// userid_pid.txt rows: userid|pid, keyed by pid; drop negative user ids.
val pid_userid = sc.textFile("D:\\userid_pid.txt").map(_.split("\\|")).map { case Array(userid, pid) => (pid, userid) }.filter(x => x._1 != "" && x._2.toInt >= 0)
// Final training triples: keep only strictly positive user id, product id
// and rating, then wrap as MLlib Rating records.
val ratings = pid_productid_bal.join(pid_userid).map { case (pid, ((productid, bal), userid)) => (userid.toInt, productid.toInt, bal) }.filter(x => x._1 > 0 && x._2 > 0 && x._3 > 0).map { case (userid, productid, bal) =>Rating(userid, productid, bal) }
// Split the ratings 80/20 into training and test sets (fixed seed).
val splits = ratings.randomSplit(Array( 0.8 , 0.2 ), seed = 111l)
val training = splits(0).repartition(4)
val test = splits(1).repartition(4)
// ALS: rank = 50 latent factors, 10 iterations, lambda = 0.01.
val model = ALS.train(training, 50, 10, 0.01)
// RMSE on the test set (disabled).
//logger.info("RMSE between predictions and actuals on the test set: "+computeRmse(model,test))
val fp_cod_id = sc.textFile("D:\\fp_cod_id.txt").map(_.split("\\|")).map { case Array(id, code) => (code, id) }
// fp_cod_name.txt uses "|!" as its field separator.
val fp_cod_name = sc.textFile("D:\\fp_cod_name.txt").map(_.split("\\|\\!")).map { case Array(code, name) => (code, name) }
// Product id -> name lookup, collected to the driver.
val fp_id_name = fp_cod_id.join(fp_cod_name).map { case (code, (id, name)) => (id, name) }.collectAsMap()
// Spot-check data for a single user (id 242).
val ifibForUser = ratings.keyBy(_.user).lookup(242)
/*logger.info("User 242 actually purchased " + ifibForUser.size + " wealth-management products:")
ifibForUser.sortBy(-_.rating).take(10).map(rating => (fp_id_name.get(rating.product.toString).get, rating.rating)).foreach(x => logger.info(x._1 + "," + x._2))
logger.info("Top 5 products recommended to user 242:")
val topKRecs = model.recommendProducts(242, 5).map(rating => ((fp_id_name).get(rating.product.toString).get, rating.rating));
topKRecs.foreach(x => logger.info(x._1 + "," + x._2))
logger.info("Top 5 products recommended to user 242 (excluding already-purchased products):")
val myRatedProducts = ifibForUser.map(_.product ).toSet
val product_id_cod = productData.map{ case (productcod, productid) => (productid.toInt, productcod)}.collectAsMap
val candidates = sc.parallelize(product_id_cod.keys.filter(!myRatedProducts.contains(_)).toSeq)
model.predict(candidates.map((242, _)))
.collect()
.sortBy(-_.rating)
.take(5).foreach(r => logger.info(fp_id_name.get(r.product.toString).get +"," + r.rating ))*/
// Compute APK (average precision at K) for user 242 (disabled).
/*val actualIfib = ifibForUser.map(_.product )
println(actualIfib)
val predictedIfib = model.recommendProducts(242, 5).map(_.product)
println(predictedIfib)
val apk5 = avgPrecisionK(actualIfib, predictedIfib, 5)
println(apk5)*/
// Collect every product's latent-factor vector to the driver.
val itemFactors = model.productFeatures.map{case (id,factor) => factor }.collect()
// NOTE(review): foreach returns Unit, so this prints "itemFactors:()" after
// the inner printlns run — probably not the intended output.
println("itemFactors:"+itemFactors.take(5).foreach(x => println(x.mkString(","))))
val itemMatrix = new DoubleMatrix(itemFactors)
logger.info("productFeatures " + itemMatrix.rows + ","+itemMatrix.columns )
// Broadcast the item-factor matrix so each task can score products locally.
val imBoradcase = sc.broadcast(itemMatrix)
// Score every product for every user (itemMatrix * userVector), then rank
// products by descending score.
val allRecs = model.userFeatures.map{ case (userId,array) =>
val userVector = new DoubleMatrix(array)
val scores = imBoradcase.value.mmul(userVector)
val sortedWithId = scores.data.zipWithIndex.sortBy(-_._1)
// NOTE(review): "+ 1" assumes product ids are contiguous 1..N AND that
// productFeatures rows were collected in id order — TODO confirm; if either
// assumption fails, these recommended ids will not match real product ids.
val recommendedIds = sortedWithId.map(_._2 + 1).toSeq
(userId,recommendedIds)
}
// Ground truth: the set of products each user actually holds.
val userProducts = ratings.map{ case Rating(user,product,rating) => (user,product)}.groupBy(_._1 )
// Mean of APK@5 across all users.
val MAPK = allRecs.join(userProducts).map{ case (userId,(predicted,actualWithIds)) =>
val actual = actualWithIds.map(_._2).toSeq
avgPrecisionK(actual, predicted, 5)
}.reduce(_+_) / allRecs.count
println("MAPK = "+MAPK)
}
/** Average precision at K: for each rank in the top-k predictions where a
 * relevant item (one present in `actual`) appears, take the precision at
 * that rank, then average over min(|actual|, k). An empty ground-truth
 * set scores a perfect 1.0 by convention. */
def avgPrecisionK(actual: Seq[Int],predicted: Seq[Int],k:Int):Double = {
  val topK = predicted.take(k)
  // Fold over (item, rank) pairs, threading (hit count, precision sum);
  // a hit at zero-based rank r contributes hits/(r+1) to the sum.
  val (_, precisionSum) = topK.zipWithIndex.foldLeft((0.0, 0.0)) {
    case ((hits, acc), (item, rank)) =>
      if (actual.contains(item)) {
        val bumped = hits + 1.0
        (bumped, acc + bumped / (rank.toDouble + 1.0))
      } else {
        (hits, acc)
      }
  }
  if (actual.isEmpty) 1.0
  else precisionSum / scala.math.min(actual.size, k).toDouble
}
}
// Spark MLlib collaborative filtering.
// (Original blog post last published 2021-10-25 20:30:00.)