标签相似度加权得分

qwerdf@QAQ

已于 2023-06-21 14:39:22 修改

阅读量66

点赞数

分类专栏：推荐系统　文章标签：spark

于 2020-09-26 13:28:47 首次发布

本文链接：https://blog.csdn.net/weixin_43690478/article/details/108428383

版权

推荐系统专栏收录该内容

2 篇文章 0 订阅

订阅专栏

标签相似度加权得分

版本1

package sparkSQL

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

/**
 * Version 1 of the tag-similarity job.
 *
 * Pairs every theme with the candidates of the same version, adds one weighted Jaccard
 * similarity per configured column to the pair's score, and finally re-ranks the surviving
 * candidates using the download counts as a tie-breaker.
 */
object TagSim {

  /**
   * Loads the three space-separated input files, builds the (theme, candidate) pairs, runs one
   * scoring round per weighted column and prints the intermediate and final rankings.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[2]")
      .getOrCreate()

    // Always release the Spark context, even when one of the stages fails.
    try {
      val sc = spark.sparkContext

      // Blank lines would make the positional field access below fail, so they are skipped.
      val themeInfoRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\tagsim\\theme_info")
        .filter(_.trim.nonEmpty)
      val themeHeatInfoRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\tagsim\\theme_heat_info")
        .filter(_.trim.nonEmpty)
      val themeCandidateRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\tagsim\\theme_candidate_info")
        .filter(_.trim.nonEmpty)

      import spark.implicits._
      // Theme resources: theme_id, theme_ver, designer, tags ('|'-separated).
      val themeInfoDF = themeInfoRDD.map { line =>
        val arr = line.split(" ")
        (arr(0), arr(1), arr(2), arr(3))
      }.toDF("theme_id", "theme_ver", "designer", "tags")

      // Candidate resources: same layout as the theme resources, with rec_* key names.
      val themeCandidateDF = themeCandidateRDD.map { line =>
        val arr = line.split(" ")
        (arr(0), arr(1), arr(2), arr(3))
      }.toDF("rec_id", "rec_ver", "designer", "tags")

      // Popularity of each resource: theme_id, theme_ver, down_cnt (kept as text).
      val themeHeatInfoDF = themeHeatInfoRDD.map { line =>
        val arr = line.split(" ")
        (arr(0), arr(1), arr(2))
      }.toDF("theme_id", "theme_ver", "down_cnt")

      // Weight of every column that takes part in the similarity, as "column:weight" pairs.
      val colWeight = "designer:1,tags:2"

      val colWeightMap: Map[String, Double] = colWeight.split(",").map { entry =>
        val arr = entry.split(":")
        (arr(0), arr(1).toDouble)
      }.toMap

      import org.apache.spark.sql.functions._
      // Pair themes with candidates of the same version and drop pairs that are the same resource.
      // The filter on rec_id also removes the unmatched rows produced by the left join.
      val themeAndRecInfoDF = themeInfoDF.join(themeCandidateDF, themeInfoDF("theme_ver") === themeCandidateDF("rec_ver"), "left")
        .filter(themeInfoDF("theme_id") =!= themeCandidateDF("rec_id"))
        .select(themeInfoDF("theme_id"), themeInfoDF("theme_ver"), themeCandidateDF("rec_id"), themeCandidateDF("rec_ver"))

      themeAndRecInfoDF.show()

      // Every pair starts with a score of zero.
      val initialScoreDF = themeAndRecInfoDF.withColumn("score", lit(0.0)).select("theme_id", "theme_ver", "rec_id", "score")

      // One scoring round per weighted column; each round adds its contribution to the score.
      val roundColWeightScoreDF = colWeightMap.foldLeft(initialScoreDF) {
        case (scoreDF, (c, w)) =>
          roundColWeightScore(scoreDF, themeInfoDF.select("theme_id", "theme_ver", c), w)
      }

      roundColWeightScoreDF.show()

      // Take popularity into account to obtain the final score.
      val colWeightAndHeatScoreDF = colWeightAndHeatScore(roundColWeightScoreDF, themeHeatInfoDF)
      colWeightAndHeatScoreDF.show()
    } finally {
      spark.stop()
    }
  }

  /**
   * Runs one scoring round: adds `w * Jaccard(theme values, candidate values)` for a single
   * column to the running score and keeps the 10 best candidates of every theme.
   *
   * The values of both the theme and the candidate are looked up in `themeInfoDF`, so pairs
   * whose theme or candidate has no row there are dropped by the inner joins.
   *
   * NOTE(review): the top-10 cut is applied in every round, so a candidate can be dropped
   * before later columns are scored — confirm this is intended.
   *
   * @param roundColWeightScoreDF running scores with columns theme_id, theme_ver, rec_id, score
   * @param themeInfoDF           three columns: id, version and the '|'-separated column to compare
   * @param w                     weight of the compared column
   * @return the updated scores with columns theme_id, theme_ver, rec_id, score
   */
  def roundColWeightScore(roundColWeightScoreDF: DataFrame, themeInfoDF: DataFrame, w: Double): DataFrame = {
    import themeInfoDF.sparkSession.implicits._
    import org.apache.spark.sql.functions._
    val scored = roundColWeightScoreDF.join(themeInfoDF.toDF("theme_id", "theme_ver", "theme_tag"), Seq("theme_id", "theme_ver"))
      .join(themeInfoDF.toDF("rec_id", "theme_ver", "rec_tag"), Seq("rec_id", "theme_ver"))
      .select("theme_id", "theme_ver", "rec_id", "theme_tag", "rec_tag", "score")
      .map {
        case Row(theme_id: String, theme_ver: String, rec_id: String, theme_tag: String, rec_tag: String, score: Double) =>
          // Splitting an empty string yields Array(""), so empty tokens are removed; otherwise
          // two empty values would count as a perfect match.
          val themeSet = theme_tag.split("\\|").filter(_.nonEmpty).toSet
          val recSet = rec_tag.split("\\|").filter(_.nonEmpty).toSet
          val sim =
            if (themeSet.nonEmpty && recSet.nonEmpty) {
              // w is a Double, so the division is a floating-point division.
              w * themeSet.intersect(recSet).size / themeSet.union(recSet).size
            } else {
              0.0
            }
          (theme_id, theme_ver, rec_id, score + sim)
      }.toDF("theme_id", "theme_ver", "rec_id", "score")

    // Keep the 10 highest scored candidates of every (theme_id, theme_ver).
    scored.withColumn("_rank",
      row_number().over(Window.partitionBy("theme_id", "theme_ver").orderBy(desc("score"))))
      .filter(col("_rank") <= 10)
      .drop("_rank")
      .select("theme_id", "theme_ver", "rec_id", "score")
  }

  /**
   * Joins the scored pairs with the download counts of the candidates and converts the rank of
   * every candidate within its theme into the final score `1 - rank / 10`.
   *
   * Candidates without a row in `themeHeatInfoDF` are dropped by the inner join. The result keeps
   * the original column layout, including a second column named `score` that holds the final
   * score, so it has to be read by position rather than by name.
   *
   * @param roundColWeightScoreDF scored pairs with columns theme_id, theme_ver, rec_id, score
   * @param themeHeatInfoDF       three columns: id, version, download count
   * @return theme_id, theme_ver, rec_id, score, down_cnt and the final score
   */
  def colWeightAndHeatScore(roundColWeightScoreDF: DataFrame, themeHeatInfoDF: DataFrame): DataFrame = {
    import org.apache.spark.sql.functions._
    roundColWeightScoreDF.join(themeHeatInfoDF.toDF("rec_id", "theme_ver", "down_cnt"), Seq("rec_id", "theme_ver"))
      .select("theme_id", "theme_ver", "rec_id", "score", "down_cnt")
      .withColumn("_rank",
        // down_cnt may be a text column; compare it numerically so that "100" ranks above "90".
        row_number().over(Window.partitionBy(col("theme_id"), col("theme_ver"))
          .orderBy(desc("score"), col("down_cnt").cast("long").desc)))
      .withColumn("new_score", col("_rank") / 10)
      .selectExpr("theme_id", "theme_ver", "rec_id", "score", "down_cnt", "1-new_score as score")
  }
}


测试数据：
theme_info文件：
10008611 1 tom 洞庭湖|鄱阳湖|巢湖
10008611 2 mary 洞庭湖|鄱阳湖
10008612 2 jack 山川|河流|大海
10008613 3 miss 唯美|二次元|复古|乡村
10008614 2 tom 洞庭湖|太湖|洪泽湖
10008615 3 tom 巢湖|鄱阳湖|淮河|长江
10008616 2 jack 山川|河流|大海|湖泊|荒漠
10008617 3 miss 唯美|二次元
10008618 1 miss 洞庭湖

theme_candidate_info文件：
10008611 1 tom 洞庭湖|鄱阳湖|巢湖
10008614 2 tom 洞庭湖|太湖|洪泽湖
10008615 3 tom 巢湖|鄱阳湖|淮河|长江
10008612 2 jack 山川|河流|大海
10008616 2 jack 山川|河流|大海|湖泊|荒漠
10008613 3 miss 唯美|二次元|复古|乡村
10008617 3 miss 唯美|二次元

theme_heat_info文件：
10008614 2 100
10008615 3 90
10008616 2 80
10008617 3 70
10008618 3 60

测试结果：
+--------+---------+--------+-----+--------+-----+
|theme_id|theme_ver|  rec_id|score|down_cnt|score|
+--------+---------+--------+-----+--------+-----+
|10008613|        3|10008617|  2.0|      70|  0.9|
|10008613|        3|10008615|  0.0|      90|  0.8|
|10008616|        2|10008614|  0.0|     100|  0.9|
|10008617|        3|10008615|  0.0|      90|  0.9|
|10008615|        3|10008617|  0.0|      70|  0.9|
|10008614|        2|10008616|  0.0|      80|  0.9|
|10008612|        2|10008616|  2.2|      80|  0.9|
|10008612|        2|10008614|  0.0|     100|  0.8|
|10008611|        2|10008614|  0.5|     100|  0.9|
|10008611|        2|10008616|  0.0|      80|  0.8|
+--------+---------+--------+-----+--------+-----+

版本2

package sparkSQL

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

import scala.collection.mutable

/**
 * Version 2 of the tag-similarity job.
 *
 * Instead of a plain Jaccard similarity, every tag carries a weight and every ordered tag pair
 * may carry a similarity score; the pairs are consumed greedily from the most to the least
 * similar one, each pair using up the smaller of the two remaining tag weights.
 */
object TagSimV2 {

  private val logger = Logger.getLogger(TagSimV2.getClass)

  /**
   * Loads the five space-separated input files, scores every (theme, candidate) pair with the
   * weighted tag similarity and prints the intermediate and final rankings.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[2]")
      .getOrCreate()

    // Always release the Spark context, even when one of the stages fails.
    try {
      val sc = spark.sparkContext

      // Blank lines would make the positional field access below fail, so they are skipped.
      val themeInfoRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\tagsim\\theme_info")
        .filter(_.trim.nonEmpty)
      val themeHeatInfoRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\tagsim\\theme_heat_info")
        .filter(_.trim.nonEmpty)
      val themeCandidateRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\tagsim\\theme_candidate_info")
        .filter(_.trim.nonEmpty)
      val tagWeightInfoRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\tagsim\\tag_weight_info")
        .filter(_.trim.nonEmpty)
      val tagSimScoreInfoRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\tagsim\\tagsim_score_info")
        .filter(_.trim.nonEmpty)

      import spark.implicits._
      // Theme resources: theme_id, theme_ver, designer, tags ('|'-separated).
      val themeInfoDF = themeInfoRDD.map { line =>
        val arr = line.split(" ")
        (arr(0), arr(1), arr(2), arr(3))
      }.toDF("theme_id", "theme_ver", "designer", "tags")

      // Candidate resources: same layout as the theme resources, with rec_* key names.
      val themeCandidateDF = themeCandidateRDD.map { line =>
        val arr = line.split(" ")
        (arr(0), arr(1), arr(2), arr(3))
      }.toDF("rec_id", "rec_ver", "designer", "tags")

      // Popularity of each resource: theme_id, theme_ver, down_cnt (kept as text).
      val themeHeatInfoDF = themeHeatInfoRDD.map { line =>
        val arr = line.split(" ")
        (arr(0), arr(1), arr(2))
      }.toDF("theme_id", "theme_ver", "down_cnt")

      // Weight of every tag: tag, weight.
      val tagWeightInfoDF = tagWeightInfoRDD.map { line =>
        val arr = line.split(" ")
        (arr(0), arr(1).toDouble)
      }.toDF("tag", "weight")

      // Similarity score of every tag pair: tag, sim_tag, sim_score.
      val tagSimScoreInfoDF = tagSimScoreInfoRDD.map { line =>
        val arr = line.split(" ")
        (arr(0), arr(1), arr(2).toDouble)
      }.toDF("tag", "sim_tag", "sim_score")

      import org.apache.spark.sql.functions._
      // Pair themes with candidates of the same version and drop pairs that are the same resource.
      // The filter on rec_id also removes the unmatched rows produced by the left join.
      val themeAndRecInfoDF = themeInfoDF.join(themeCandidateDF, themeInfoDF("theme_ver") === themeCandidateDF("rec_ver"), "left")
        .filter(themeInfoDF("theme_id") =!= themeCandidateDF("rec_id"))
        .select(themeInfoDF("theme_id"), themeInfoDF("theme_ver"), themeCandidateDF("rec_id"), themeCandidateDF("rec_ver"))

      // Tags of the theme resources.
      val themeTagInfoDF = themeInfoDF.select("theme_id", "theme_ver", "tags")
      // Tag weights and tag-pair similarity scores, collected to the driver.
      val tagWeightMap = getTagWeightMap(tagWeightInfoDF)
      val tagSimScoreMap = getTagSimScoreMap(tagSimScoreInfoDF)
      // Every pair starts with a score of zero.
      val initialScoreDF = themeAndRecInfoDF.withColumn("score", lit(0.0)).select("theme_id", "theme_ver", "rec_id", "score")

      val roundColWeightScoreDF = roundColWeightScore(initialScoreDF, themeTagInfoDF, tagWeightMap, tagSimScoreMap)

      roundColWeightScoreDF.show()

      // Take popularity into account to obtain the final score.
      val colWeightAndHeatScoreDF = colWeightAndHeatScore(roundColWeightScoreDF, themeHeatInfoDF)
      colWeightAndHeatScoreDF.show()
    } finally {
      spark.stop()
    }
  }

  /**
   * Collects the tag-pair similarity scores to the driver.
   *
   * @param tagSimScoreInfoDF rows of (tag: String, sim_tag: String, sim_score: Double)
   * @return similarity score keyed by "tag-simTag"; the key is direction-sensitive
   */
  def getTagSimScoreMap(tagSimScoreInfoDF: DataFrame): Map[String, Double] = {
    tagSimScoreInfoDF.rdd.map {
      case Row(tag: String, simTag: String, score: Double) => (s"$tag-$simTag", score)
    }.collect().toMap
  }

  /**
   * Collects the tag weights to the driver.
   *
   * @param tagWeightInfoDF rows of (tag: String, weight: Double)
   * @return weight keyed by tag
   */
  def getTagWeightMap(tagWeightInfoDF: DataFrame): Map[String, Double] = {
    tagWeightInfoDF.rdd.map {
      case Row(tag: String, weight: Double) => (tag, weight)
    }.collect().toMap
  }

  /**
   * Builds a mutable copy of the weights of the given tags, so that the remaining weight of each
   * tag can be decreased while tag pairs are consumed.
   *
   * @param tagSet       tags of one resource
   * @param tagWeightMap weight of every known tag
   * @return weight of every tag in `tagSet`; tags without a configured weight get 0.0
   */
  def getTagAndRecTagWeightMap(tagSet: Set[String], tagWeightMap: Map[String, Double]): mutable.Map[String, Double] = {
    mutable.Map(tagSet.toSeq.map(tag => (tag, tagWeightMap.getOrElse(tag, 0.0))): _*)
  }

  /**
   * Adds the weighted tag similarity of every (theme, candidate) pair to the running score and
   * keeps the 10 best candidates of every theme.
   *
   * The tags of both the theme and the candidate are looked up in `themeInfoDF`, so pairs whose
   * theme or candidate has no row there are dropped by the inner joins.
   *
   * @param roundColWeightScoreDF running scores with columns theme_id, theme_ver, rec_id, score
   * @param themeInfoDF           three columns: id, version and the '|'-separated tags
   * @param tagWeightMap          weight of every tag
   * @param tagSimScoreMap        similarity score keyed by "themeTag-candidateTag"
   * @return the updated scores with columns theme_id, theme_ver, rec_id, score
   */
  def roundColWeightScore(roundColWeightScoreDF: DataFrame, themeInfoDF: DataFrame, tagWeightMap: Map[String, Double], tagSimScoreMap: Map[String, Double]): DataFrame = {
    import org.apache.spark.sql.functions._
    import themeInfoDF.sparkSession.implicits._
    val scored = roundColWeightScoreDF.join(themeInfoDF.toDF("theme_id", "theme_ver", "theme_tag"), Seq("theme_id", "theme_ver"))
      .join(themeInfoDF.toDF("rec_id", "theme_ver", "rec_tag"), Seq("rec_id", "theme_ver"))
      .select("theme_id", "theme_ver", "rec_id", "theme_tag", "rec_tag", "score")
      .map {
        case Row(theme_id: String, theme_ver: String, rec_id: String, theme_tag: String, rec_tag: String, score: Double) =>
          // Splitting an empty string yields Array(""), so empty tokens are removed to make the
          // emptiness check in weightedTagSim meaningful.
          val themeSet = theme_tag.split("\\|").filter(_.nonEmpty).toSet
          val recSet = rec_tag.split("\\|").filter(_.nonEmpty).toSet
          val sim = weightedTagSim(themeSet, recSet, tagWeightMap, tagSimScoreMap)
          (theme_id, theme_ver, rec_id, score + sim)
      }.toDF("theme_id", "theme_ver", "rec_id", "score")

    // Keep the 10 highest scored candidates of every (theme_id, theme_ver).
    scored.withColumn("_rank",
      row_number().over(Window.partitionBy("theme_id", "theme_ver").orderBy(desc("score"))))
      .filter(col("_rank") <= 10)
      .drop("_rank")
      .select("theme_id", "theme_ver", "rec_id", "score")
  }

  /**
   * Greedy weighted similarity of two tag sets: the tag pairs are visited from the highest to
   * the lowest similarity score; each pair consumes the smaller of the two remaining tag weights
   * and contributes `consumed weight * similarity`. The result is the consumed-weight average of
   * the similarity scores.
   *
   * Returns 0.0 when either set is empty or when no pair has remaining weight on both sides, so
   * that the score never becomes NaN.
   */
  private def weightedTagSim(themeSet: Set[String], recSet: Set[String], tagWeightMap: Map[String, Double], tagSimScoreMap: Map[String, Double]): Double = {
    if (themeSet.isEmpty || recSet.isEmpty) {
      0.0
    } else {
      // Remaining weight of the theme tags and of the candidate tags.
      val map1 = getTagAndRecTagWeightMap(themeSet, tagWeightMap)
      val map2 = getTagAndRecTagWeightMap(recSet, tagWeightMap)

      // Every (theme tag, candidate tag) combination with its similarity score (0.0 if unknown).
      val pairs = for {
        tag1 <- themeSet.toList
        tag2 <- recSet.toList
      } yield (tag1, tag2, tagSimScoreMap.getOrElse(s"$tag1-$tag2", 0.0))

      val sortList = pairs.sortBy(_._3).reverse

      var sumWeight: Double = 0.0
      var sumScore: Double = 0.0
      for ((key1, key2, score) <- sortList) {
        val weight1 = map1.getOrElse(key1, 0.0)
        val weight2 = map2.getOrElse(key2, 0.0)
        // Per-pair trace; kept at debug level because it runs for every tag pair of every row.
        if (logger.isDebugEnabled) {
          logger.debug(s"key1:$key1 weight1:$weight1|key2:$key2 weight2:$weight2|score:$score")
          logger.debug(s"sumWeight:$sumWeight sumScore:$sumScore")
        }
        if (weight1 != 0.0 && weight2 != 0.0) {
          val minWeight = math.min(weight1, weight2)
          // Both tags give up the consumed weight, so one heavy tag cannot dominate the result.
          map1(key1) = weight1 - minWeight
          map2(key2) = weight2 - minWeight

          sumWeight += minWeight
          sumScore += minWeight * score
        }
      }
      // 0.0 / 0.0 would be NaN, which Spark orders above every number when sorting descending.
      if (sumWeight > 0.0) sumScore / sumWeight else 0.0
    }
  }

  /**
   * Joins the scored pairs with the download counts of the candidates and converts the rank of
   * every candidate within its theme into the final score `1 - rank / 10`.
   *
   * Candidates without a row in `themeHeatInfoDF` are dropped by the inner join. The result keeps
   * the original column layout, including a second column named `score` that holds the final
   * score, so it has to be read by position rather than by name.
   *
   * @param roundColWeightScoreDF scored pairs with columns theme_id, theme_ver, rec_id, score
   * @param themeHeatInfoDF       three columns: id, version, download count
   * @return theme_id, theme_ver, rec_id, score, down_cnt and the final score
   */
  def colWeightAndHeatScore(roundColWeightScoreDF: DataFrame, themeHeatInfoDF: DataFrame): DataFrame = {
    import org.apache.spark.sql.functions._
    roundColWeightScoreDF.join(themeHeatInfoDF.toDF("rec_id", "theme_ver", "down_cnt"), Seq("rec_id", "theme_ver"))
      .select("theme_id", "theme_ver", "rec_id", "score", "down_cnt")
      .withColumn("_rank",
        // down_cnt may be a text column; compare it numerically so that "100" ranks above "90".
        row_number().over(Window.partitionBy(col("theme_id"), col("theme_ver"))
          .orderBy(desc("score"), col("down_cnt").cast("long").desc)))
      .withColumn("new_score", col("_rank") / 10)
      .selectExpr("theme_id", "theme_ver", "rec_id", "score", "down_cnt", "1-new_score as score")
  }

}

假设两个资源及标签权重如下：
资源1：tag1 1 tag2 0.6
资源2：tag3 1 tag4 0.2

标签之间相似度得分如下：
tag1-tag3 0.5
tag1-tag4 1
tag2-tag3 0
tag2-tag4 0.4

计算规则：
a.按照相似度得分进行降序排列
tag1-tag4 1
tag1-tag3 0.5
tag2-tag4 0.4
tag2-tag3 0

b.计算每个标签对的相似得分
标签对1：tag1-tag4 1
初始标签权重：tag1 1 tag2 0.6 tag3 1 tag4 0.2
min_weight1=min(1,0.2)=0.2
sim_score1=min_weight1*1=0.2
更新标签权重1：tag1 0.8 tag2 0.6 tag3 1 tag4 0

标签对2：tag1-tag3 0.5
min_weight2=min(0.8,1)=0.8
sim_score2=min_weight2*0.5=0.4
更新标签权重2：tag1 0 tag2 0.6 tag3 0.2 tag4 0

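标签对3：tag2-tag4 0.4
min_weight3=min(0.6,0)=0
sim_score3=min_weight3*0.4=0
标签权重不变：tag1 0 tag2 0.6 tag3 0.2 tag4 0

标签对4：tag2-tag3 0
min_weight4=min(0.6,0.2)=0.2
sim_score4=min_weight4*0=0
更新标签权重4：tag1 0 tag2 0.4 tag3 0 tag4 0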
...
相似标签总权重sum_weight=min_weight1 + min_weight2 +...（本例中为0.2+0.8+0+0.2=1.2）
相似标签加权得分sum_score=sim_score1 + sim_score2 +...（本例中为0.2+0.4+0+0=0.6）
最终相似度得分sim_score=sum_score/sum_weight（本例中为0.6/1.2=0.5）

注意：
每步计算采用标签对中较小的权重，然后该标签对中两个标签的权重同时减去这个较小的权重，
这样可以避免过度依赖某个权重过高的标签。
直到所有标签对完成或一个资源所有标签的权重为0