/**
 * Compute the AUC score
 *
 * @param positiveData    held-out test data
 * @param bAllArtistIDs   broadcast of all artist IDs
 * @param predictFunction model.transform
 * @return score in [0, 1]
 */
def areaUnderCurve(positiveData: DataFrame,
                   bAllArtistIDs: Broadcast[Array[Int]],
                   predictFunction: DataFrame => DataFrame): Double = {
  import positiveData.sparkSession.implicits._

  // What this actually computes is AUC, per user; the result is something
  // that might be called "mean AUC".

  // Take the held-out data as the "positive" examples.
  // Make predictions for each of them, including a numeric score.
  val positivePredictions = predictFunction(positiveData.select("user", "artist")).
    withColumnRenamed("prediction", "positivePrediction")

  // BinaryClassificationMetrics.areaUnderROC is not used here because there are many
  // small AUC problems (one per user), and it would be inefficient when a direct
  // computation is available.

  // Create a set of "negative" products for each user. These are randomly chosen
  // from among all of the other artists, excluding those that are "positive" for the user.
  val negativeData = positiveData.select("user", "artist").as[(Int, Int)].
    groupByKey { case (user, _) => user }.
    flatMapGroups { case (userID, userIDAndPosArtistIDs) =>
      val random = new Random()
      val posItemIDSet = userIDAndPosArtistIDs.map { case (_, artist) => artist }.toSet
      val negative = new ArrayBuffer[Int]()
      val allArtistIDs = bAllArtistIDs.value
      var i = 0
      // Make at most one pass over all artists to avoid an infinite loop,
      // and stop once the number of negatives equals the positive set size.
      while (i < allArtistIDs.length && negative.size < posItemIDSet.size) {
        val artistID = allArtistIDs(random.nextInt(allArtistIDs.length))
        // Only add IDs that are not positive for this user
        if (!posItemIDSet.contains(artistID)) {
          negative += artistID
        }
        i += 1
      }
      // Return the sampled IDs with the user ID added back
      negative.map(artistID => (userID, artistID))
    }.toDF("user", "artist")

  // Make predictions on the rest:
  val negativePredictions = predictFunction(negativeData).
    withColumnRenamed("prediction", "negativePrediction")

  // Join positive predictions to negative predictions by user, only.
  // This will result in a row for every possible pairing of positive and negative
  // predictions within each user.
  val joinedPredictions = positivePredictions.join(negativePredictions, "user").
    select("user", "positivePrediction", "negativePrediction").cache()

  // Count the number of pairs per user
  val allCounts = joinedPredictions.
    groupBy("user").agg(count(lit("1")).as("total")).
    select("user", "total")
  // Count the number of correctly ordered pairs per user
  val correctCounts = joinedPredictions.
    filter($"positivePrediction" > $"negativePrediction").
    groupBy("user").agg(count("user").as("correct")).
    select("user", "correct")

  // Combine these, compute the per-user ratio, and average over all users
  val meanAUC = allCounts.join(correctCounts, Seq("user"), "left_outer").
    select($"user", (coalesce($"correct", lit(0)) / $"total").as("auc")).
    agg(mean("auc")).
    as[Double].first()

  joinedPredictions.unpersist()

  meanAUC
}
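
For intuition: per-user AUC here is just the fraction of (positive, negative) prediction pairs that the model orders correctly. A minimal, self-contained sketch with hypothetical scores (not part of the recommender):

// Predictions for one user's held-out "positive" artists and sampled "negative" artists
val positiveScores = Seq(0.9, 0.7, 0.4)
val negativeScores = Seq(0.6, 0.3)
// Every positive/negative pairing, as produced by the join on "user" above
val pairs = for (p <- positiveScores; n <- negativeScores) yield (p, n)
// Fraction of correctly ordered pairs: 5 of 6, so auc == 0.833...
val auc = pairs.count { case (p, n) => p > n }.toDouble / pairs.size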
Invoking the grid search combined with AUC scoring:

// Artist ID data, used for AUC scoring
val allArtistIds = allDF.select("artist").as[Int].distinct().collect()
val bAllArtistIds = spark.sparkContext.broadcast(allArtistIds)

// Grid search
val evaluations =
  // Use a for-comprehension to generate the hyperparameter combinations
  for (rank <- Seq(5, 30);
       regParam <- Seq(4.0, 0.0001);
       alpha <- Seq(1.0, 40.0))
    yield {
      // Build the model
      val als = new ALS()
        .setSeed(Random.nextLong())
        .setImplicitPrefs(true)
        .setRank(rank)
        .setRegParam(regParam)
        .setAlpha(alpha)
        .setMaxIter(5)
        .setUserCol("user")
        .setItemCol("artist")
        .setRatingCol("count")
        .setPredictionCol("prediction")
      // Train the model
      val model = als.fit(trainDF)
      // Compute the AUC score
      val auc = areaUnderCurve(testDF, bAllArtistIds, model.transform)
      // Release resources
      model.userFactors.unpersist()
      model.itemFactors.unpersist()
      (auc, (rank, regParam, alpha))
    }

evaluations.sorted.reverse.foreach(println)
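
Because each element of evaluations is a (Double, (Int, Double, Double)) tuple, sorted orders lexicographically, i.e. by AUC first, and reverse then puts the best configuration on top. An equivalent, arguably clearer spelling (a sketch using the same names):

evaluations.sortBy { case (auc, _) => -auc }.foreach(println)

The complete source file follows.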
package com.skey.analytics.ch03
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
/**
 * Chapter 3 - Recommending Music
 *
 * @author ALion
 * @version 2019/2/24 10:53
 */
object Recommender {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("RecommenderApp")
    val spark = SparkSession.builder()
      .config(conf)
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    // artist_data.txt
    // 2 columns: artistid artist_name
    val path1 = "./profiledata_06-May-2005/artist_data.txt"
    val rawArtistData = spark.read.textFile(path1)
    val artistIdDF = transformArtistData(rawArtistData)

    // artist_alias.txt
    // 2 columns: badid, goodid
    val path2 = "./profiledata_06-May-2005/artist_alias.txt"
    val rawAliasData = spark.read.textFile(path2)
    val artistAlias = transformAliasData(rawAliasData).collect().toMap

    // user_artist_data.txt
    // 3 columns: userid artistid playcount
    val path0 = "./profiledata_06-May-2005/user_artist_data.txt"
    val rawUserArtistData = spark.read.textFile(path0)
    val allDF = transformUserArtistData(spark, rawUserArtistData, artistAlias)
    allDF.persist()

    // Split into training and test sets
    val Array(trainDF, testDF) = allDF.randomSplit(Array(0.9, 0.1))
    trainDF.persist()
    // // Build the model
    // val als = new ALS()
    //   .setSeed(Random.nextLong())
    //   .setImplicitPrefs(true)
    //   .setRank(30)
    //   .setRegParam(0.0001)
    //   .setAlpha(1.0)
    //   .setMaxIter(5)
    //   .setUserCol("user")
    //   .setItemCol("artist")
    //   .setRatingCol("count")
    //   .setPredictionCol("prediction")
    //
    // // Train the model
    // val model = als.fit(trainDF)
    //
    // // Release the cached resources
    // trainDF.unpersist()
    //
    // // Start recommending
    //
    // // Prepare the users to recommend for
    // val someUsers = testDF.select("user").as[Int].take(10).distinct
    //
    // someUsers.map { user =>
    //   // Recommend
    //   val recommendDF = recommend(model, user, 5, artistIdDF)
    //   val strings = recommendDF.map(_.mkString("|")).collect()
    //
    //   (user, strings.toBuffer)
    // }.foreach(println)
    // Artist ID data, used for AUC scoring
    val allArtistIds = allDF.select("artist").as[Int].distinct().collect()
    val bAllArtistIds = spark.sparkContext.broadcast(allArtistIds)

    // Grid search
    val evaluations =
      // Use a for-comprehension to generate the hyperparameter combinations
      for (rank <- Seq(5, 30);
           regParam <- Seq(4.0, 0.0001);
           alpha <- Seq(1.0, 40.0))
        yield {
          // Build the model
          val als = new ALS()
            .setSeed(Random.nextLong())
            .setImplicitPrefs(true)
            .setRank(rank)
            .setRegParam(regParam)
            .setAlpha(alpha)
            .setMaxIter(5)
            .setUserCol("user")
            .setItemCol("artist")
            .setRatingCol("count")
            .setPredictionCol("prediction")
          val model = als.fit(trainDF)
          val auc = areaUnderCurve(testDF, bAllArtistIds, model.transform)
          // Release resources
          model.userFactors.unpersist()
          model.itemFactors.unpersist()
          (auc, (rank, regParam, alpha))
        }

    evaluations.sorted.reverse.foreach(println)
    // Sample output (values vary between runs, since the seed and the split are random):
    // (0.9039124436650243,(30,1.0E-4,1.0))
    // (0.9034269912559532,(5,1.0E-4,1.0))
    // (0.9032449249724098,(30,1.0E-4,40.0))
    // (0.9028574761056848,(30,4.0,1.0))
    // (0.9019663966459797,(5,1.0E-4,40.0))
    // (0.9017698705975027,(30,4.0,40.0))
    // (0.9015351771563618,(5,4.0,40.0))
    // (0.9011632951254114,(5,4.0,1.0))
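    // The best configuration in this run is rank=30, regParam=1.0E-4, alpha=1.0,
    // with a mean AUC of roughly 0.904.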
    spark.stop()
  }
  /**
   * Merge the data sources and build one overall dataset
   *
   * @param spark           SparkSession
   * @param rawUserArtistDS dataset of user-artist play records
   * @param artistAlias     artist alias IDs, used to canonicalize artist IDs
   * @return
   */
  def transformUserArtistData(spark: SparkSession, rawUserArtistDS: Dataset[String], artistAlias: Map[Int, Int]): DataFrame = {
    import spark.implicits._

    val bArtistAlias = spark.sparkContext.broadcast(artistAlias)

    rawUserArtistDS.map(line => {
      val Array(userId, artistId, count) = line.split(' ').map(_.toInt)
      val finalArtistId = bArtistAlias.value.getOrElse(artistId, artistId)
      (userId, finalArtistId, count)
    }).toDF("user", "artist", "count")
  }
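
  // For example (hypothetical line), "1000002 1 55" parses to (1000002, 1, 55);
  // if artistAlias maps artist ID 1 to a canonical ID, that ID replaces the raw one,
  // otherwise getOrElse falls back to the ID as read.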
  /**
   * Parse (id, name) pairs from artist_data.txt, skipping malformed lines
   */
  def transformArtistData(rawArtistData: Dataset[String]): DataFrame = {
    import rawArtistData.sparkSession.implicits._

    rawArtistData.flatMap(line => {
      // span splits the line at the first tab: (id, "\t" + name)
      val (id, name) = line.span(_ != '\t')
      try {
        if (name.nonEmpty)
          Some(id.toInt, name.trim)
        else
          None
      } catch {
        case _: Exception => None
      }
    }).toDF("id", "name")
  }
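
  // For instance (hypothetical line), "123\tSome Artist".span(_ != '\t') yields
  // ("123", "\tSome Artist"), and trim then strips the leading tab from the name.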
  /**
   * Parse (badId, goodId) alias pairs from artist_alias.txt, skipping malformed lines
   */
  def transformAliasData(rawAliasData: Dataset[String]): Dataset[(Int, Int)] = {
    import rawAliasData.sparkSession.implicits._

    rawAliasData.flatMap(line => {
      val Array(artist, alias) = line.split('\t')
      try {
        if (artist.nonEmpty)
          Some(artist.toInt, alias.toInt)
        else
          None
      } catch {
        case _: Exception => None
      }
    })
  }
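
  // For example (hypothetical line), "1092764\t1000311" maps bad artist ID 1092764
  // to canonical ID 1000311; collected into a Map, these pairs drive the getOrElse
  // lookup in transformUserArtistData.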
  /**
   * Recommend artists for a given user
   *
   * @param model      the trained ALS model
   * @param userId     user ID
   * @param howMany    how many artists to recommend
   * @param artistIdDF mapping from artist ID to name
   * @return
   */
  def recommend(model: ALSModel, userId: Int, howMany: Int, artistIdDF: DataFrame): DataFrame = {
    import artistIdDF.sparkSession.implicits._

    val toRecommend = model.itemFactors
      .select($"id".as("artist"))
      .withColumn("user", lit(userId))

    val topRecommendations = model.transform(toRecommend)
      .select("artist", "prediction")
      .orderBy($"prediction".desc)
      .limit(howMany)

    // Get the IDs of the artists to recommend
    val recommendedArtistIds = topRecommendations.select("artist").as[Int].collect()

    artistIdDF.filter($"id" isin (recommendedArtistIds: _*))
  }
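
  // Note: this scores every artist for one user by hand. On Spark 2.3+,
  // ALSModel.recommendForUserSubset(usersDF, howMany) is a built-in alternative,
  // where usersDF would be a DataFrame with a single "user" column.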
  /**
   * Compute the AUC score
   *
   * @param positiveData    held-out test data
   * @param bAllArtistIDs   broadcast of all artist IDs
   * @param predictFunction model.transform
   * @return score in [0, 1]
   */
  def areaUnderCurve(positiveData: DataFrame,
                     bAllArtistIDs: Broadcast[Array[Int]],
                     predictFunction: DataFrame => DataFrame): Double = {
    import positiveData.sparkSession.implicits._

    // What this actually computes is AUC, per user; the result is something
    // that might be called "mean AUC".

    // Take the held-out data as the "positive" examples.
    // Make predictions for each of them, including a numeric score.
    val positivePredictions = predictFunction(positiveData.select("user", "artist")).
      withColumnRenamed("prediction", "positivePrediction")

    // BinaryClassificationMetrics.areaUnderROC is not used here because there are many
    // small AUC problems (one per user), and it would be inefficient when a direct
    // computation is available.

    // Create a set of "negative" products for each user. These are randomly chosen
    // from among all of the other artists, excluding those that are "positive" for the user.
    val negativeData = positiveData.select("user", "artist").as[(Int, Int)].
      groupByKey { case (user, _) => user }.
      flatMapGroups { case (userID, userIDAndPosArtistIDs) =>
        val random = new Random()
        val posItemIDSet = userIDAndPosArtistIDs.map { case (_, artist) => artist }.toSet
        val negative = new ArrayBuffer[Int]()
        val allArtistIDs = bAllArtistIDs.value
        var i = 0
        // Make at most one pass over all artists to avoid an infinite loop,
        // and stop once the number of negatives equals the positive set size.
        while (i < allArtistIDs.length && negative.size < posItemIDSet.size) {
          val artistID = allArtistIDs(random.nextInt(allArtistIDs.length))
          // Only add IDs that are not positive for this user
          if (!posItemIDSet.contains(artistID)) {
            negative += artistID
          }
          i += 1
        }
        // Return the sampled IDs with the user ID added back
        negative.map(artistID => (userID, artistID))
      }.toDF("user", "artist")

    // Make predictions on the rest:
    val negativePredictions = predictFunction(negativeData).
      withColumnRenamed("prediction", "negativePrediction")

    // Join positive predictions to negative predictions by user, only.
    // This will result in a row for every possible pairing of positive and negative
    // predictions within each user.
    val joinedPredictions = positivePredictions.join(negativePredictions, "user").
      select("user", "positivePrediction", "negativePrediction").cache()

    // Count the number of pairs per user
    val allCounts = joinedPredictions.
      groupBy("user").agg(count(lit("1")).as("total")).
      select("user", "total")
    // Count the number of correctly ordered pairs per user
    val correctCounts = joinedPredictions.
      filter($"positivePrediction" > $"negativePrediction").
      groupBy("user").agg(count("user").as("correct")).
      select("user", "correct")

    // Combine these, compute the per-user ratio, and average over all users
    val meanAUC = allCounts.join(correctCounts, Seq("user"), "left_outer").
      select($"user", (coalesce($"correct", lit(0)) / $"total").as("auc")).
      agg(mean("auc")).
      as[Double].first()

    joinedPredictions.unpersist()

    meanAUC
  }
}