1. Environment
hadoop-2.10.0
hive-3.1.2
hbase-2.2.2
spark-2.4.4
2. Table Schemas
HIVE
CREATE TABLE T_ARTICLE_VECTOR (
    ARTICLE_ID STRING,
    CHANNEL_ID INT,
    VECTOR ARRAY<DOUBLE>
)
COMMENT 'article vector table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/article_vector';
HBASE
create 'article_similarity', 'sim'
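Each row written by the LSH step in section 3 is keyed by article_id; every similar article becomes a cell named sim:<similar_article_id> whose value is the euclidean distance formatted to four decimal places. A row can be spot-checked in the hbase shell (the row key here is illustrative):

get 'article_similarity', '10001'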
3. Python Implementation
# -*- coding:utf-8 -*-
import os
import sys

BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, BASE_PATH)
print sys.path

from offline import BaseSparkSession

# Python 2: force UTF-8 as the default encoding
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Raw strings avoid backslash-escape surprises in Windows paths
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'
class ArticleVectorGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'article_vector_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Train a Word2Vec model for each channel
    def gen_channel_word_vector(self):
        from pyspark.ml.feature import Word2Vec
        self.spark_session.sql("use portal")
        channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
        for channel_id in channel_id_list:
            sql = "select id, channel_id, complete_content_words from t_complete_article " \
                  "where channel_id = " + str(channel_id)
            complete_article_df = self.spark_session.sql(sql)
            word2vec = Word2Vec(vectorSize=100, inputCol="complete_content_words",
                                outputCol="word_vector_features", minCount=1)
            word2vec_model = word2vec.fit(complete_article_df)
            word2vec_model.save("hdfs://192.168.0.1:9000/user/models/word2vec/{}.model".format(channel_id))
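        # A saved model can be sanity-checked by reloading it and querying
        # nearest neighbours (the word and count are illustrative):
        #   from pyspark.ml.feature import Word2VecModel
        #   model = Word2VecModel.load("hdfs://192.168.0.1:9000/user/models/word2vec/1001.model")
        #   model.findSynonyms(u"some_word", 5).show()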
    # Generate article vectors
    def gen_article_vector(self):
        from pyspark.ml.feature import Word2VecModel
        channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
        for channel_id in channel_id_list:
            # Load the word vector model for this channel
            word2vec_model = Word2VecModel.load(
                "hdfs://192.168.0.1:9000/user/models/word2vec/{}.model".format(channel_id))
            vectors = word2vec_model.getVectors()
            vectors.show()
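            # getVectors() returns one row per vocabulary word, with columns
            # `word` (string) and `vector` (100-dimensional DenseVector)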
            self.spark_session.sql("use portal")
            # Word vectors for each article keyword
            sql = "select article_id, channel_id, kw.key keyword, kw.value weight from t_article_profile " \
                  "lateral view explode(keywords) kw where channel_id = " + str(channel_id)
            article_profile_df = self.spark_session.sql(sql)
            article_profile_df.show()
            # Join keywords against the vocabulary to attach their word vectors
            article_keyword_vector_df = article_profile_df.join(
                vectors, article_profile_df.keyword == vectors.word, "inner") \
                .select("article_id", "channel_id", "keyword", "weight", "vector")
            # Scale each word vector by the keyword's weight
            article_keyword_vector_df = article_keyword_vector_df.rdd.map(
                lambda row: (row.article_id, row.channel_id, row.keyword, row.weight * row.vector)
            ).toDF(["article_id", "channel_id", "keyword", "vector"])
            article_keyword_vector_df.show()
            article_keyword_vector_df.createOrReplaceTempView("tmp_article_keyword_vector")
            # Collect all weighted keyword vectors per article
            sql = "select article_id, min(channel_id) channel_id, collect_set(vector) vectors " \
                  "from tmp_article_keyword_vector group by article_id"
            article_vectors_df = self.spark_session.sql(sql)
            article_vectors_df.show()
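            # Each row now pairs an article with the set of its weighted keyword
            # vectors; the article vector below is their mean: v = sum(w_i * v_i) / n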
            # Average the weighted keyword vectors to obtain the article vector
            def calculate_average_value(row):
                vector_summation = 0
                for vector in row.vectors:
                    vector_summation += vector
                # Convert the DenseVector to a list of floats so it can be
                # stored in the Hive ARRAY<DOUBLE> column
                return row.article_id, row.channel_id, \
                    [float(i) for i in (vector_summation / len(row.vectors)).toArray()]

            article_vector_df = article_vectors_df.rdd.map(calculate_average_value) \
                .toDF(["article_id", "channel_id", "vector"])
            article_vector_df.show()
            article_vector_df.write.insertInto("t_article_vector")
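            # Note: insertInto matches columns by position, not by name, so the
            # DataFrame column order must match the Hive table definition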
class ArticleClusteringGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'article_clustering_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Cluster articles with bisecting K-means
    def gen_kmeans_clustering(self):
        from pyspark.ml.clustering import BisectingKMeans
        from pyspark.ml.linalg import Vectors
        self.spark_session.sql("use portal")
        channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
        for channel_id in channel_id_list:
            sql = "select article_id, channel_id, vector from t_article_vector where channel_id = " + str(channel_id)
            article_vector_df = self.spark_session.sql(sql)
            # The Hive column is ARRAY<DOUBLE>; convert it to a DenseVector
            # so it can serve as the features column
            article_vector_df = article_vector_df.rdd.map(
                lambda row: (row.article_id, row.channel_id, Vectors.dense(row.vector))
            ).toDF(["article_id", "channel_id", "vector"])
            bisecting_kmeans = BisectingKMeans(k=100, minDivisibleClusterSize=50, featuresCol="vector")
            bisecting_kmeans_model = bisecting_kmeans.fit(article_vector_df)
            bisecting_kmeans_model.save(
                "hdfs://192.168.0.1:9000/user/models/bisecting_kmeans/{}.model".format(channel_id))
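            # The saved model can later be reloaded to assign cluster IDs
            # (illustrative usage; transform adds a `prediction` column):
            #   from pyspark.ml.clustering import BisectingKMeansModel
            #   model = BisectingKMeansModel.load(
            #       "hdfs://192.168.0.1:9000/user/models/bisecting_kmeans/1001.model")
            #   model.transform(article_vector_df).show()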
# Locality Sensitive Hashing
class ArticleLSHGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'article_lsh_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    # Compute article similarities
    def gen_lsh_similarity(self, channel_id, article_vector_df):
        from pyspark.ml.feature import BucketedRandomProjectionLSH
        from pyspark.ml.linalg import Vectors

        self.spark_session.sql("use portal")
        sql = "select article_id, vector from t_article_vector where channel_id = " + str(channel_id)
        train_article_vector_df = self.spark_session.sql(sql)

        # Convert the ARRAY<DOUBLE> column to a DenseVector
        def convert_array_to_vector(row):
            return row.article_id, Vectors.dense(row.vector)

        train_article_vector_df = train_article_vector_df.rdd.map(convert_array_to_vector).toDF(["article_id", "vector"])
        train_article_vector_df.show()
        article_vector_df = article_vector_df.rdd.map(convert_array_to_vector).toDF(["article_id", "vector"])
        article_vector_df.show()

        # Fit an LSH model and compute pairwise similarities
        bucketed_random_projection_lsh = BucketedRandomProjectionLSH(inputCol="vector",
                                                                     outputCol="brp_lsh",
                                                                     numHashTables=4,
                                                                     bucketLength=10)
        bucketed_random_projection_lsh_model = bucketed_random_projection_lsh.fit(train_article_vector_df)
        similarities = bucketed_random_projection_lsh_model.approxSimilarityJoin(
            article_vector_df, train_article_vector_df, threshold=2.0, distCol="euclidean_distance")
        similarities.sort(['euclidean_distance']).show()
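        # approxSimilarityJoin returns rows with `datasetA` and `datasetB`
        # (struct copies of the joined rows) plus the `euclidean_distance` column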
        # Store the results in HBase table 'article_similarity', column family 'sim'
        def insert_hbase(partition):
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from recoutils.hbase_utils import HBaseUtils
            # One HBase connection pool per partition
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)
            for row in partition:
                article_id = row.datasetA.article_id
                s_article_id = row.datasetB.article_id
                # Skip self-similarity
                if article_id == s_article_id:
                    continue
                hbase_utils.insert("article_similarity", str(article_id).encode(),
                                   {"sim:{}".format(s_article_id).encode(): b'%0.4f' % row.euclidean_distance})

        similarities.foreachPartition(insert_hbase)
if __name__ == '__main__':
    article_vector_generator = ArticleVectorGenerator()
    article_vector_generator.gen_channel_word_vector()
    article_vector_generator.gen_article_vector()

    article_lsh_generator = ArticleLSHGenerator()
    article_lsh_generator.spark_session.sql("use portal")
    channel_id_list = [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008]
    for channel_id in channel_id_list:
        sql = "select article_id, vector from t_article_vector where channel_id = " + str(channel_id)
        test_article_vector_df = article_lsh_generator.spark_session.sql(sql)
        article_lsh_generator.gen_lsh_similarity(channel_id, test_article_vector_df)
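Note: BaseSparkSession is imported from the project's offline package and is not shown above. A minimal sketch consistent with how it is used (the attribute names come from the code above; the builder configuration is an assumption, not the project's actual implementation):

from pyspark.sql import SparkSession

class BaseSparkSession(object):
    # Defaults; subclasses override these attributes (names taken from usage above)
    SPARK_APP_NAME = None
    SPARK_MASTER_URL = 'yarn'
    SPARK_YARN_QUEUE = 'queue3'
    ENABLE_HIVE_SUPPORT = False

    def create_spark_session(self):
        # Assumed implementation: build a SparkSession from the attributes above
        builder = SparkSession.builder \
            .appName(self.SPARK_APP_NAME) \
            .master(self.SPARK_MASTER_URL) \
            .config('spark.yarn.queue', self.SPARK_YARN_QUEUE)
        if self.ENABLE_HIVE_SUPPORT:
            builder = builder.enableHiveSupport()
        return builder.getOrCreate()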