1. Environment
hadoop-2.10.0
hive-3.1.2
hbase-2.2.2
spark-2.4.4
2. Table Schemas
HIVE
CREATE TABLE T_ARTICLE_VECTOR (
    ARTICLE_ID STRING,
    CHANNEL_ID INT,
    VECTOR ARRAY<DOUBLE>
)
COMMENT 'article vector table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article_vector';
HBASE
create 'article_similarity', 'sim'
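Each row written by the LSH step in section 3 is keyed by article_id; every similar article becomes a cell named sim:<similar_article_id> whose value is the euclidean distance formatted to four decimal places. A row can be spot-checked in the hbase shell (the row key here is illustrative):

get 'article_similarity', '10001'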
3. Python Implementation
# -*- coding:utf-8 -*-
import os
import sys

BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, BASE_PATH)
print sys.path

from offline import BaseSparkSession

# Python 2: force UTF-8 as the default encoding
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Raw strings avoid backslash-escape surprises in Windows paths
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'
class ArticleVectorGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'article_vector_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Train a Word2Vec model for each channel
    def gen_channel_word_vector(self):
        from pyspark.ml.feature import Word2Vec
        self.spark_session.sql("use portal")
        channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
        for channel_id in channel_id_list:
            sql = "select id, channel_id, complete_content_words from t_complete_article " \
                  "where channel_id = " + str(channel_id)
            complete_article_df = self.spark_session.sql(sql)
            word2vec = Word2Vec(vectorSize=100, inputCol="complete_content_words",
                                outputCol="word_vector_features", minCount=1)
            word2vec_model = word2vec.fit(complete_article_df)
            word2vec_model.save("hdfs://192.168.0.1:9000/user/models/word2vec/{}.model".format(channel_id))
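        # A saved model can be sanity-checked by reloading it and querying
        # nearest neighbours (the word and count are illustrative):
        #   from pyspark.ml.feature import Word2VecModel
        #   model = Word2VecModel.load("hdfs://192.168.0.1:9000/user/models/word2vec/1001.model")
        #   model.findSynonyms(u"some_word", 5).show()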
    # Generate article vectors
    def gen_article_vector(self):
        from pyspark.ml.feature import Word2VecModel
        channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
        for channel_id in channel_id_list:
            # Load the word vector model for this channel
            word2vec_model = Word2VecModel.load(
                "hdfs://192.168.0.1:9000/user/models/word2vec/{}.model".format(channel_id))
            vectors = word2vec_model.getVectors()
            vectors.show()
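            # getVectors() returns one row per vocabulary word, with columns
            # `word` (string) and `vector` (100-dimensional DenseVector)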
            self.spark_session.sql("use portal")
            # Word vectors for each article keyword
            sql = "select article_id, channel_id, kw.key keyword, kw.value weight from t_article_profile " \
                  "lateral view explode(keywords) kw where channel_id = " + str(channel_id)
            article_profile_df = self.spark_session.sql(sql)
            article_profile_df.show()
            # Join keywords against the vocabulary to attach their word vectors
            article_keyword_vector_df = article_profile_df.join(
                vectors, article_profile_df.keyword == vectors.word, "inner") \
                .select("article_id", "channel_id", "keyword", "weight", "vector")
            # Scale each word vector by the keyword's weight
            article_keyword_vector_df = article_keyword_vector_df.rdd.map(
                lambda row: (row.article_id, row.channel_id, row.keyword, row.weight * row.vector)
            ).toDF(["article_id", "channel_id", "keyword", "vector"])
            article_keyword_vector_df.show()
            article_keyword_vector_df.createOrReplaceTempView("tmp_article_keyword_vector")
            # Collect all weighted keyword vectors per article
            sql = "select article_id, min(channel_id) channel_id, collect_set(vector) vectors " \
                  "from tmp_article_keyword_vector group by article_id"
            article_vectors_df = self.spark_session.sql(sql)
            article_vectors_df.show()
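            # Each row now pairs an article with the set of its weighted keyword
            # vectors; the article vector below is their mean: v = sum(w_i * v_i) / n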
            # Average the weighted keyword vectors to obtain the article vector
            def calculate_average_value(row):
                vector_summation = 0
                for vector in row.vectors:
                    vector_summation += vector
                # Convert the DenseVector to a list of floats so it can be
                # stored in the Hive ARRAY<DOUBLE> column
                return row.article_id, row.channel_id, \
                    [float(i) for i in (vector_summation / len(row.vectors)).toArray()]

            article_vector_df = article_vectors_df.rdd.map(calculate_average_value) \
                .toDF(["article_id", "channel_id", "vector"])
            article_vector_df.show()
            article_vector_df.write.insertInto("t_article_vector")
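            # Note: insertInto matches columns by position, not by name, so the
            # DataFrame column order must match the Hive table definition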
class ArticleClusteringGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'article_clustering_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Cluster articles with bisecting K-means
    def gen_kmeans_clustering(self):
        from pyspark.ml.clustering import BisectingKMeans
        from pyspark.ml.linalg import Vectors
        self.spark_session.sql("use portal")
        channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
        for channel_id in channel_id_list:
            sql = "select article_id, channel_id, vector from t_article_vector where channel_id = " + str(channel_id)
            article_vector_df = self.spark_session.sql(sql)
            # The Hive column is ARRAY<DOUBLE>; convert it to a DenseVector
            # so it can serve as the features column
            article_vector_df = article_vector_df.rdd.map(
                lambda row: (row.article_id, row.channel_id, Vectors.dense(row.vector))
            ).toDF(["article_id", "channel_id", "vector"])
            bisecting_kmeans = BisectingKMeans(k=100, minDivisibleClusterSize=50, featuresCol="vector")
            bisecting_kmeans_model = bisecting_kmeans.fit(article_vector_df)
            bisecting_kmeans_model.save(
                "hdfs://192.168.0.1:9000/user/models/bisecting_kmeans/{}.model".format(channel_id))
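            # The saved model can later be reloaded to assign cluster IDs
            # (illustrative usage; transform adds a `prediction` column):
            #   from pyspark.ml.clustering import BisectingKMeansModel
            #   model = BisectingKMeansModel.load(
            #       "hdfs://192.168.0.1:9000/user/models/bisecting_kmeans/1001.model")
            #   model.transform(article_vector_df).show()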
# Locality Sensitive Hashing
class ArticleLSHGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'article_lsh_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Compute article similarities
    def gen_lsh_similarity(self, channel_id, article_vector_df):
        from pyspark.ml.feature import BucketedRandomProjectionLSH
        from pyspark.ml.linalg import Vectors

        self.spark_session.sql("use portal")
        sql = "select article_id, vector from t_article_vector where channel_id = " + str(channel_id)
        train_article_vector_df = self.spark_session.sql(sql)

        # Convert the ARRAY<DOUBLE> column to a DenseVector
        def convert_array_to_vector(row):
            return row.article_id, Vectors.dense(row.vector)

        train_article_vector_df = train_article_vector_df.rdd.map(convert_array_to_vector).toDF(["article_id", "vector"])
        train_article_vector_df.show()
        article_vector_df = article_vector_df.rdd.map(convert_array_to_vector).toDF(["article_id", "vector"])
        article_vector_df.show()

        # Fit an LSH model and compute pairwise similarities
        bucketed_random_projection_lsh = BucketedRandomProjectionLSH(inputCol="vector",
                                                                     outputCol="brp_lsh",
                                                                     numHashTables=4,
                                                                     bucketLength=10)
        bucketed_random_projection_lsh_model = bucketed_random_projection_lsh.fit(train_article_vector_df)
        similarities = bucketed_random_projection_lsh_model.approxSimilarityJoin(
            article_vector_df, train_article_vector_df, threshold=2.0, distCol="euclidean_distance")
        similarities.sort(['euclidean_distance']).show()
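        # approxSimilarityJoin returns rows with `datasetA` and `datasetB`
        # (struct copies of the joined rows) plus the `euclidean_distance` column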
        # Store the results in HBase table 'article_similarity', column family 'sim'
        def insert_hbase(partition):
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from recoutils.hbase_utils import HBaseUtils
            # One HBase connection pool per partition
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)
            for row in partition:
                article_id = row.datasetA.article_id
                s_article_id = row.datasetB.article_id
                # Skip self-similarity
                if article_id == s_article_id:
                    continue
                hbase_utils.insert("article_similarity", str(article_id).encode(),
                                   {"sim:{}".format(s_article_id).encode(): b'%0.4f' % row.euclidean_distance})

        similarities.foreachPartition(insert_hbase)
if __name__ == '__main__':
    article_vector_generator = ArticleVectorGenerator()
    article_vector_generator.gen_channel_word_vector()
    article_vector_generator.gen_article_vector()

    article_lsh_generator = ArticleLSHGenerator()
    article_lsh_generator.spark_session.sql("use portal")
    channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
    for channel_id in channel_id_list:
        sql = "select article_id, vector from t_article_vector where channel_id = " + str(channel_id)
        test_article_vector_df = article_lsh_generator.spark_session.sql(sql)
        article_lsh_generator.gen_lsh_similarity(channel_id, test_article_vector_df)
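Note: BaseSparkSession is imported from the project's offline package and is not shown above. A minimal sketch consistent with how it is used (the attribute names come from the code above; the builder configuration is an assumption, not the project's actual implementation):

from pyspark.sql import SparkSession

class BaseSparkSession(object):
    # Defaults; subclasses override these attributes (names taken from usage above)
    SPARK_APP_NAME = None
    SPARK_MASTER_URL = 'yarn'
    SPARK_YARN_QUEUE = 'queue3'
    ENABLE_HIVE_SUPPORT = False

    def create_spark_session(self):
        # Assumed implementation: build a SparkSession from the attributes above
        builder = SparkSession.builder \
            .appName(self.SPARK_APP_NAME) \
            .master(self.SPARK_MASTER_URL) \
            .config('spark.yarn.queue', self.SPARK_YARN_QUEUE)
        if self.ENABLE_HIVE_SUPPORT:
            builder = builder.enableHiveSupport()
        return builder.getOrCreate()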