这个相对来说比较简单,核心代码如下:
大体的过程如下:
①获取话题的类型特征,格式如:topicid,categoryid
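②相似度计算,采用欧式距离。由于类型特征是一维的,两个话题之间的距离为 $d_{12} = \sqrt{(c_1 - c_2)^2} = |c_1 - c_2|$,其中 $c_1$、$c_2$ 分别为两个话题的类型特征值
然后相似度取 $\mathrm{sim} = \dfrac{1}{1 + d_{12}}$,也就是说距离越近,相似度越趋近于1,话题类型越接近。相似度等于1则说明两个话题属于同一个类型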
// Clear any previous output stored at `path` before the job writes to it again.
try {
  val outputDir = new Path(path)
  val fs = FileSystem.get(URI.create(path), hadoopConf)
  // Report success only when something was actually removed: the message used
  // to be printed even if the path did not exist or delete() returned false.
  // The second argument of delete() requests a recursive delete.
  if (fs.exists(outputDir) && fs.delete(outputDir, true)) println("删除成功")
} catch {
  // Hadoop FileSystem operations report failures as IOException; the previous
  // ArithmeticException handler could never match, so the error was not logged here.
  case e: java.io.IOException => println(e)
}
// Load the topic category features.
// For every input line, the text after the first '[' is cut at its first ']'
// and split on ','; the first two fields become Topic(id, category).
val topicrdd1: RDD[Topic] = sparkContext
  .textFile("hdfs://hdp01:9000/recs/category/category_15079")
  .map { record =>
    val afterOpenBracket = record.split("\\[")(1)
    val insideBrackets = afterOpenBracket.split("\\]")(0)
    val parts = insideBrackets.split(",")
    Topic(parts(0).toInt, parts(1).toDouble)
  }
// Pair every topic with every other topic and keep the (topicId, similarTopicId)
// pairs whose category similarity is above 0.5.
val datardd: RDD[(Topic, Topic)] = topicrdd1.cartesian(topicrdd1)
// A topic is never paired with itself. The similarity score is only needed for
// the threshold test, so it is evaluated inside the filter and not carried along.
// The former sortByKey() step was removed: it triggered a full shuffle whose
// ordering was discarded by the following map and the later groupByKey.
val topicResc = datardd
  .filter { case (a, b) =>
    a.id != b.id && this.ojilideSim(a.CategoryId, b.CategoryId) > 0.5
  }
  .map { case (a, b) => (a.id, b.id) }
// Build one output line per topic: "<topicId>\001<id1>,<id2>,..." holding a
// random selection of its similar topics.
val resultRdd = topicResc.groupByKey().map {
  case (topicId, recs) =>
    // Upper bound on recommendations per topic. The original allocated 30 slots
    // but, because of two matching off-by-one loops, filled and emitted only 29.
    val maxRecs = 30
    // Shuffle-then-take samples WITHOUT replacement, so a similar topic can no
    // longer appear several times in the same list; topics with fewer than
    // `maxRecs` candidates simply get a shorter list instead of repeats.
    val picked = Random.shuffle(recs.toSeq.distinct).take(maxRecs)
    topicId.toString + "\001" + picked.mkString(",")
}
// Persist the recommendation lines under `path`, then shut the Spark context down.
resultRdd.saveAsTextFile(path)
sparkContext.stop()
}
/**
 * Similarity of two category values derived from their Euclidean distance.
 *
 * For scalar inputs the Euclidean distance is the absolute difference
 * `d = |category1 - category2|`; the similarity is `1 / (1 + d)`, which is
 * 1.0 for equal values and decreases towards 0 as the values move apart.
 *
 * @param category1 category value of the first topic
 * @param category2 category value of the second topic
 * @return the similarity score, a value in [0, 1]
 */
def ojilideSim(category1: Double, category2: Double): Double = {
  // |x| is mathematically the same as sqrt(x^2) but cannot overflow to
  // Infinity when the difference is very large, which previously collapsed
  // the score to exactly 0.
  val distance = math.abs(category1 - category2)
  1.0 / (1.0 + distance)
}
③在hive中得到用户的推荐列表并且过滤掉用户已经相关联到的话题
用户与话题相关表:
话题类型相似表:
为了方便关联,在hive中进行行列转换将话题类型相似表转换成如下格式
-- Flatten the topic-similarity table: emit one (topicid, simtopic) row for
-- every element produced by exploding the simtopicid column.
CREATE TABLE user_recs_categroy_tmp AS
SELECT
  topicid,
  simtopic
FROM category_result
LATERAL VIEW explode(simtopicid) tmp AS simtopic;
然后运用Hive的left join(左外连接)关联用户相关话题表,再通过右表topicid is null的条件过滤掉用户已经关联过的话题
-- Keep only the (user, candidate topic) pairs that have no matching row in
-- recs_userRalationTopic, i.e. topics the user is not already linked to.
SELECT
  cand.userinfoid,
  cand.simtopicid
FROM user_recs_categroy_tmp1 cand
LEFT OUTER JOIN recs_userRalationTopic linked
  ON  cand.simtopicid = linked.topicid
  AND cand.userinfoid = linked.userinfoid
WHERE linked.topicid IS NULL;
最后在上述表中为每个用户随机抽取5个话题作为推荐结果
-- Pick up to 5 randomly ordered candidate topics per user and return them as
-- one comma-separated string per user.
SELECT
  userinfoid,
  concat_ws(',', collect_set(simtopicid)) AS simtopics
FROM (
  SELECT
    ranked.userinfoid,
    ranked.simtopicid
  FROM (
    SELECT
      src.userinfoid,
      src.simtopicid,
      -- random rank of each candidate within its user
      row_number() OVER (PARTITION BY src.userinfoid ORDER BY rand()) AS rn
    FROM user_recs_categroy src
  ) ranked
  WHERE ranked.rn <= 5
) top5
GROUP BY userinfoid;
④在这里,随机抽取一个看一下效果
用户162194关联的话题有:
可以发现,用户关联的话题中17349的类型标签为6
给其推荐的话题有18040,18289等
可见,召回的结果还是相当准确的。