Spark MLlib 协同过滤

package com.vic.crm.ifib.recom

import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD
import org.jblas.DoubleMatrix

/**
 * Wealth-management product recommendation for users.
 *
 * Builds implicit "ratings" from account balances, trains a Spark MLlib ALS
 * matrix-factorization model on them and reports the mean average precision
 * at K (MAP@K) of the resulting recommendations.
 */
object Ifib {
  // Raise the level of every logger under "org" (which includes Spark) to ERROR.
  Logger.getLogger("org").setLevel(Level.ERROR)

  val logger: Logger = Logger.getLogger("Ifib")

  /**
   * Root-mean-square error between the model's predictions and the observed ratings.
   *
   * The error is averaged over the (user, product) pairs for which a prediction
   * could be joined with an observed rating, not over `data.count`, so pairs that
   * yield no prediction do not dilute the result.
   *
   * @param model trained matrix-factorization model
   * @param data  observed ratings to evaluate against
   * @return the RMSE, or `Double.NaN` when no prediction could be matched to a rating
   */
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating]): Double = {
    val predicted = model.predict(data.map(r => (r.user, r.product)))
      .map(p => ((p.user, p.product), p.rating))
    val observed = data.map(r => ((r.user, r.product), r.rating))

    // Single pass: accumulate the squared-error sum together with the pair count.
    // aggregate (unlike reduce) is also defined for an empty RDD.
    val (squaredErrorSum, pairCount) = predicted.join(observed).values.aggregate((0.0, 0L))(
      (acc, pair) => {
        val error = pair._1 - pair._2
        (acc._1 + error * error, acc._2 + 1L)
      },
      (left, right) => (left._1 + right._1, left._2 + right._2)
    )

    if (pairCount == 0L) Double.NaN else math.sqrt(squaredErrorSum / pairCount)
  }

  /**
   * Entry point: loads the ratings, trains ALS on an 80% split and prints MAP@5.
   * The SparkContext is always stopped, even when a step fails.
   */
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "D:\\hadoop-2.2.0")
    val conf = new SparkConf().setAppName("vic-ifib").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Cached because the same ratings feed the split and the evaluation;
      // without it every action would re-read and re-join the input files.
      val ratings = loadRatings(sc).cache()

      // Split the data set into a training set and a test set.
      val Array(trainingSplit, testSplit) = ratings.randomSplit(Array(0.8, 0.2), seed = 111L)
      val training = trainingSplit.repartition(4)
      // Held-out split: pass it to computeRmse(model, test) to measure the test RMSE.
      val test = testSplit.repartition(4)

      val rank = 50
      val iterations = 10
      val lambda = 0.01
      val model = ALS.train(training, rank, iterations, lambda)

      val mapk = meanAveragePrecisionAtK(sc, model, ratings, 5)
      println("MAPK = " + mapk)
    } finally {
      sc.stop()
    }
  }

  /**
   * Average precision at `k` of one ranked recommendation list.
   *
   * @param actual    products the user really holds
   * @param predicted recommended products, best first
   * @param k         number of leading recommendations to score
   * @return 1.0 when `actual` is empty; 0.0 when `k` is not positive (previously
   *         `k == 0` produced NaN); otherwise the sum of precision-at-hit values
   *         divided by `min(actual.size, k)`
   */
  def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
    if (actual.isEmpty) {
      1.0
    } else if (k <= 0) {
      0.0
    } else {
      // Set membership instead of a linear scan of `actual` per prediction.
      val relevant = actual.toSet
      val (score, _) = predicted.take(k).zipWithIndex.foldLeft((0.0, 0.0)) {
        case ((sum, hits), (candidate, index)) if relevant.contains(candidate) =>
          val newHits = hits + 1.0
          (sum + newHits / (index.toDouble + 1.0), newHits)
        case (acc, _) => acc
      }
      score / scala.math.min(actual.size, k).toDouble
    }
  }

  /** Maps an account balance to a 0-5 rating bucket; balances below 10000 (and NaN) give 0. */
  private def ratingForBalance(balance: Double): Double =
    if (balance >= 5000000) 5.0
    else if (balance >= 1000000) 4.0
    else if (balance >= 100000) 3.0
    else if (balance >= 50000) 2.0
    else if (balance >= 10000) 1.0
    else 0.0

  /**
   * Reads the three pipe-delimited input files and joins them into ALS ratings.
   * Only rows with a positive user id, product id and rating bucket are kept.
   */
  private def loadRatings(sc: SparkContext): RDD[Rating] = {
    // fp_cod_pid.txt rows: userid|pid|productcod|bal. Rows are kept only when the
    // product code is non-empty and the 13th character of pid is '0'; a pid that is
    // too short to have a 13th character is dropped instead of raising an exception.
    val holdingsByProductCode = sc.textFile("D:\\fp_cod_pid.txt")
      .map(_.split("\\|"))
      .map { case Array(_, pid, productcod, bal) =>
        (productcod, (pid, ratingForBalance(bal.toDouble)))
      }
      .filter { case (productcod, (pid, _)) =>
        productcod != "" && pid.length > 12 && pid.charAt(12) == '0'
      }

    // fp_cod_id.txt rows: productid|productcod.
    val productIdByCode = sc.textFile("D:\\fp_cod_id.txt")
      .map(_.split("\\|"))
      .map { case Array(productid, productcod) => (productcod, productid) }

    val holdingsByPid = holdingsByProductCode.join(productIdByCode)
      .map { case (_, ((pid, score), productid)) => (pid, (productid, score)) }

    // userid_pid.txt rows: userid|pid.
    val userIdByPid = sc.textFile("D:\\userid_pid.txt")
      .map(_.split("\\|"))
      .map { case Array(userid, pid) => (pid, userid) }
      .filter { case (pid, userid) => pid != "" && userid.toInt >= 0 }

    holdingsByPid.join(userIdByPid)
      .map { case (_, ((productid, score), userid)) => (userid.toInt, productid.toInt, score) }
      .filter { case (user, product, score) => user > 0 && product > 0 && score > 0 }
      .map { case (user, product, score) => Rating(user, product, score) }
  }

  /**
   * Mean average precision at `k` over every user that has latent factors in `model`.
   *
   * Each user's factor vector is multiplied with the broadcast product-factor
   * matrix, products are ranked by score, and the ranking is compared with the
   * products the user holds in `ratings`.
   *
   * @return MAP@K, or NaN when no user could be evaluated
   */
  private def meanAveragePrecisionAtK(sc: SparkContext,
                                      model: MatrixFactorizationModel,
                                      ratings: RDD[Rating],
                                      k: Int): Double = {
    // Keep the product ids next to their factor rows: a row's position in the
    // collected array is NOT its product id, so "index + 1" would recommend the
    // wrong products.
    val productFactors = model.productFeatures.collect()
    val productIds = productFactors.map(_._1)
    val itemFactors = productFactors.map(_._2)

    println("itemFactors:")
    itemFactors.take(5).foreach(row => println(row.mkString(",")))

    val itemMatrix = new DoubleMatrix(itemFactors)
    logger.info("productFeatures " + itemMatrix.rows + "," + itemMatrix.columns)

    val itemMatrixBroadcast = sc.broadcast(itemMatrix)
    val productIdsBroadcast = sc.broadcast(productIds)

    val allRecs = model.userFeatures.map { case (userId, factors) =>
      val userVector = new DoubleMatrix(factors)
      val scores = itemMatrixBroadcast.value.mmul(userVector)
      val rankedRows = scores.data.zipWithIndex.sortBy(-_._1)
      val ids = productIdsBroadcast.value
      // Translate matrix row numbers back into real product ids, best score first.
      val recommendedIds = rankedRows.map { case (_, row) => ids(row) }.toSeq
      (userId, recommendedIds)
    }

    val userProducts = ratings.map(r => (r.user, r.product)).groupByKey()

    // One pass for both the sum and the user count, so allRecs is not recomputed.
    val (apkSum, evaluatedUsers) = allRecs.join(userProducts)
      .map { case (_, (predicted, actual)) => avgPrecisionK(actual.toSeq, predicted, k) }
      .aggregate((0.0, 0L))(
        (acc, apk) => (acc._1 + apk, acc._2 + 1L),
        (left, right) => (left._1 + right._1, left._2 + right._2)
      )

    if (evaluatedUsers == 0L) Double.NaN else apkSum / evaluatedUsers
  }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值