1、相关环境
hadoop-2.10.0
hive-3.1.2
hbase-2.2.2
spark-2.4.4
2、相关表结构
HBase(以下命令在 hbase shell 中执行;TTL 单位为秒,2592000 秒 = 30 天)
alter 'multiple_recall', {NAME=>'content', TTL=>2592000, VERSIONS=>9999}
3、相关Python实现
# -*- coding:utf-8 -*-
import os
import sys
import numpy as np
from datetime import datetime
# Directory two levels above this file. It is placed first on sys.path
# before the ``from offline import ...`` line that follows, presumably so
# that project-local packages resolve -- confirm against the repo layout.
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, BASE_PATH)
# Debug aid: show the effective import search path. The parenthesised form
# prints exactly the same text under Python 2 and also parses on Python 3.
print(sys.path)
from offline import BaseSparkSession
# Python 2 workaround: make UTF-8 the interpreter-wide default encoding so
# implicit str/unicode conversions do not fall back to ASCII.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # sys.setdefaultencoding is removed from the module at interpreter
    # start-up; reload(sys) makes it available again (Python 2 only).
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Locations of the Python interpreter and of the Hadoop / Spark installs.
# Raw strings keep every backslash literal instead of relying on
# sequences such as "\d" or "\s" happening not to be escape codes.
# NOTE(review): these look like developer-machine Windows paths -- confirm
# they are meant to be set unconditionally.
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'
class UserRecallBasedContent(BaseSparkSession):
    """Content-based recall job.

    For every clicked (user_id, article_id, channel_id) row, look up the
    articles similar to the clicked one in the HBase table
    ``article_similarity``, drop those already present in the user's
    ``history_recall`` row, and write the remainder to both
    ``multiple_recall`` and ``history_recall``.
    """

    def __init__(self):
        # Settings assigned before create_spark_session() is called;
        # presumably consumed by BaseSparkSession -- confirm in that class.
        self.SPARK_APP_NAME = 'user_recall_based_content'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()

    def gen_user_recall(self, top_k=5):
        """Generate the recall results for every user click and store them.

        Args:
            top_k: maximum number of similar articles kept per clicked
                article (defaults to 5, the previously hard-coded value).
        """
        self.spark_session.sql("use portal")
        # User article click behaviour.
        sql = "select user_id, article_id, channel_id from t_user_behaviour where click_flag = 1"
        user_article_click_behaviour_df = self.spark_session.sql(sql)
        user_article_click_behaviour_df.show()

        def insert_user_recall(partition):
            """Store the recall data of one partition of click rows in HBase."""
            # Passed to foreachPartition, so the path set-up and the imports
            # are done inside the function rather than at module level.
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            import ast
            import json
            from recoutils.hbase_utils import HBaseUtils

            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)
            for row in partition:
                # Fetch the articles similar to the clicked article.
                article_id = row.article_id
                art_sim_art_result = hbase_utils.read_rows(
                    "article_similarity",
                    ["{}".format(article_id).encode()],
                    columns=[b"sim"])
                if not art_sim_art_result:
                    # No similarity data: skip the row instead of failing on
                    # (or silently reusing) an earlier row's candidate list.
                    continue

                # Sort by the stored value, best first, and keep the top K.
                # NOTE(review): values are compared as returned by
                # read_rows; if they are strings the order is lexicographic,
                # not numeric -- confirm the stored format.
                sorted_result = sorted(art_sim_art_result[0][1].items(),
                                       key=lambda item: item[1],
                                       reverse=True)
                art_sim_art_list = [column.split(":")[1]
                                    for column, _ in sorted_result[:top_k]]
                print("{} sim arts {}".format(article_id, art_sim_art_list))

                row_key = "u:{}".format(row.user_id).encode()

                # Fetch what has already been recalled for this user/channel.
                history_recall_list = hbase_utils.read_cells(
                    "history_recall",
                    row_key,
                    "channel:{}".format(row.channel_id).encode())
                history_recommend_list = []
                for history_recall in history_recall_list or []:
                    # Cells hold a serialized list (this job writes them with
                    # json.dumps). literal_eval parses such literals without
                    # executing arbitrary code, unlike eval().
                    history_recommend_list.extend(ast.literal_eval(history_recall))

                # Filter out history while keeping the similarity ranking.
                already_recalled = set(history_recommend_list)
                recommend_list = [art for art in art_sim_art_list
                                  if art not in already_recalled]

                # Store the new recall data and append it to the history.
                if recommend_list:
                    payload = json.dumps(recommend_list).encode()
                    hbase_utils.insert(
                        "multiple_recall",
                        row_key,
                        {"content:{}".format(row.channel_id).encode(): payload})
                    hbase_utils.insert(
                        "history_recall",
                        row_key,
                        {"channel:{}".format(row.channel_id).encode(): payload})

        user_article_click_behaviour_df.foreachPartition(insert_user_recall)
if __name__ == '__main__':
    # Script entry point: construct the job and run the recall generation.
    recall_job = UserRecallBasedContent()
    recall_job.gen_user_recall()