1. Environment
hadoop-2.10.0
hive-3.1.2
hbase-2.2.2
spark-2.4.4
2. Table Schemas
Hive
CREATE TABLE T_USER_OP_LOG(
USER_ID BIGINT,
ARTICLE_ID STRING,
CHANNEL_ID INT,
OP_TYPE INT COMMENT '1 display 2 click 3 collect 4 share',
OP_TIME STRING,
OP_DURATION INT,
ALGO INT
)
COMMENT 'user operation log table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/user_op_log';
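Because the table is comma-delimited text on HDFS, each operation is one CSV line. For illustration only (hypothetical values; the op_time format matches the '%Y-%m-%d %H:%M:%S' pattern parsed in section 3):

1001,article_100,5,2,2024-01-01 10:00:00,1200,1
1001,article_100,5,3,2024-01-01 10:05:00,0,1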
CREATE TABLE T_USER_BEHAVIOUR(
USER_ID BIGINT,
ARTICLE_ID STRING,
CHANNEL_ID INT,
DISPLAY_FLAG INT,
CLICK_FLAG INT,
COLLECT_FLAG INT,
SHARE_FLAG INT,
OP_TIME STRING,
OP_DURATION INT
)
COMMENT 'user behaviour table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hive/warehouse/portal/user_behaviour';
HBase
create 'user_profile', 'b', 'p'
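The 'b' column family holds basic user info and 'p' holds preference weights, matching the columns written by the Python code below. As a hypothetical shell equivalent of those writes (row key is 'u:' plus the user id, preference qualifier is channel_id:topic):

put 'user_profile', 'u:1', 'b:name', 'alice'
put 'user_profile', 'u:1', 'p:5:spark', '2.8591'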
3. Python Implementation
# -*- coding:utf-8 -*-
import os
import sys

import numpy as np
from datetime import datetime

# Make the project root importable
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, BASE_PATH)
print(sys.path)

from offline import BaseSparkSession

# Python 2: force UTF-8 as the default encoding
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Local Windows environment for submitting to YARN
# (raw strings keep the backslashes from being treated as escapes)
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'
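The offline.BaseSparkSession base class is not shown in this post. Given the attributes set in __init__ below, a minimal sketch of what its create_spark_session might look like (the structure and config key are assumptions, not the original implementation):

from pyspark.sql import SparkSession

class BaseSparkSession(object):
    # Minimal sketch of the assumed base class
    def create_spark_session(self):
        builder = SparkSession.builder \
            .appName(self.SPARK_APP_NAME) \
            .master(self.SPARK_MASTER_URL) \
            .config("spark.yarn.queue", self.SPARK_YARN_QUEUE)
        if self.ENABLE_HIVE_SUPPORT:
            builder = builder.enableHiveSupport()
        return builder.getOrCreate()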
class UserProfileGenerator(BaseSparkSession):

    def __init__(self):
        self.SPARK_APP_NAME = 'user_profile_generator'
        self.SPARK_MASTER_URL = 'yarn'
        self.SPARK_YARN_QUEUE = 'queue3'
        self.ENABLE_HIVE_SUPPORT = True
        self.spark_session = self.create_spark_session()
    # Generate user behaviour data from the raw operation log
    def gen_user_behaviour(self):
        self.spark_session.sql("use portal")
        user_op_log_df = self.spark_session.sql("select * from t_user_op_log")

        # Expand op_type into display/click/collect/share flags
        # (a collect or share also implies the article was displayed and clicked)
        def convert_to_behaviour(partition):
            for row in partition:
                if row.op_type == 1:
                    yield row.user_id, row.article_id, row.channel_id, 1, 0, 0, 0, row.op_time, row.op_duration
                elif row.op_type == 2:
                    yield row.user_id, row.article_id, row.channel_id, 1, 1, 0, 0, row.op_time, row.op_duration
                elif row.op_type == 3:
                    yield row.user_id, row.article_id, row.channel_id, 1, 1, 1, 0, row.op_time, row.op_duration
                elif row.op_type == 4:
                    yield row.user_id, row.article_id, row.channel_id, 1, 1, 0, 1, row.op_time, row.op_duration

        user_behaviour_df = user_op_log_df.rdd.mapPartitions(convert_to_behaviour) \
            .toDF(["user_id", "article_id", "channel_id", "display_flag", "click_flag",
                   "collect_flag", "share_flag", "op_time", "op_duration"])
        user_behaviour_df.show()

        # Merge a user's multiple operations on the same article into one row
        user_behaviour_df.createOrReplaceTempView("tmp_user_behaviour")
        sql = "select user_id, article_id, min(channel_id) channel_id, max(display_flag) display_flag, " \
              "max(click_flag) click_flag, max(collect_flag) collect_flag, max(share_flag) share_flag, " \
              "max(op_time) op_time, max(op_duration) op_duration " \
              "from tmp_user_behaviour group by user_id, article_id"
        user_behaviour_df = self.spark_session.sql(sql)
        user_behaviour_df.show()
        user_behaviour_df.write.insertInto("t_user_behaviour")
    # Generate the basic portion of the user profile (column family 'b')
    def gen_user_basic_profile(self):
        self.spark_session.sql("use portal")
        user_df = self.spark_session.sql("select * from t_user")
        user_df.show()

        def insert_user_profile(partition):
            # Re-add the project root on the executor before importing local modules
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)
            for row in partition:
                basic_info = {
                    "b:name": row.name,
                    "b:gender": str(row.gender),
                    "b:age": str(row.age),
                    "b:mobile_phone": row.mobile_phone,
                    "b:email": row.email
                }
                hbase_utils.insert("user_profile", "u:{}".format(row.id).encode(), basic_info)

        user_df.foreachPartition(insert_user_profile)
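The recoutils.hbase_utils.HBaseUtils helper is not shown either. A plausible minimal version built on happybase, the Thrift-based HBase client for Python (only the constructor arguments and the insert call are taken from the code above; the rest is an assumption):

import happybase

class HBaseUtils(object):
    # Sketch of the assumed helper, backed by a happybase connection pool
    def __init__(self, host, port, size):
        self.pool = happybase.ConnectionPool(size, host=host, port=port)

    def insert(self, table_name, row_key, values):
        # values maps 'family:qualifier' keys to encoded values
        with self.pool.connection() as connection:
            connection.table(table_name).put(row_key, values)

Port 9090 in the calls above is the default HBase Thrift server port, so the Thrift service must be running on the target host.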
    # Generate the user preference profile (column family 'p')
    def gen_user_preference_profile(self):
        self.spark_session.sql("use portal")

        # Join behaviour rows with article topic keywords; keep only rows
        # with at least one positive action (click, collect or share)
        sql = "select ub.*, ap.topics from t_user_behaviour ub " \
              "left join t_article_profile ap on ub.article_id = ap.article_id " \
              "where (ub.click_flag != 0 or ub.collect_flag != 0 or ub.share_flag != 0) " \
              "and ap.topics is not null"
        user_behaviour_df = self.spark_session.sql(sql)
        user_behaviour_df.show()

        # Explode the topics array into one row per (behaviour, topic) pair
        import pyspark.sql.functions as F
        user_behaviour_df = user_behaviour_df.withColumn("topic", F.explode("topics")).drop("topics")
        user_behaviour_df.show()

        def insert_user_profile(partition):
            # Re-add the project root on the executor before importing local modules
            sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)
            for row in partition:
                # Channel-topic tag weight = time decay factor * weighted behaviour sum
                t = datetime.now() - datetime.strptime(row.op_time, '%Y-%m-%d %H:%M:%S')
                time_exp = 1 / (np.log(t.days + 1) + 1)
                duration_weight = 2 if row.op_duration > 1000 else 1
                weight = time_exp * (row.click_flag * 5 + row.collect_flag * 3 + row.share_flag * 4 + duration_weight)
                hbase_utils.insert("user_profile", "u:{}".format(row.user_id).encode(),
                                   {"p:{}:{}".format(row.channel_id, row.topic).encode(): b'%0.4f' % weight})
        user_behaviour_df.foreachPartition(insert_user_profile)
if __name__ == '__main__':
    user_profile_generator = UserProfileGenerator()
    user_profile_generator.gen_user_behaviour()
    user_profile_generator.gen_user_basic_profile()
    user_profile_generator.gen_user_preference_profile()
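Once submitted (for example spark-submit --master yarn --queue queue3 user_profile_generator.py, assuming that file name), a user's generated profile can be inspected in the HBase shell with get 'user_profile', 'u:1'.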