1、相关环境
hadoop-2.10.0
hive-3.1.2
hbase-2.2.2
spark-2.4.4
kafka-2.12-2.4.0
zookeeper-3.5.6
spark-streaming-kafka-0-8_2.11:2.4.4
2、相关表结构
HBase
alter 'multiple_recall', {NAME=>'online', TTL=>2592000, VERSIONS=>9999}
3、相关Python实现
# -*- coding:utf-8 -*-
import sys
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
# Python 2-only shim: force the interpreter's default string encoding to
# UTF-8 so implicit str<->unicode conversions (e.g. of Chinese log text)
# do not raise UnicodeDecodeError.  `reload(sys)` restores the
# `setdefaultencoding` attribute that site.py deletes at startup.
# NOTE(review): this idiom does not exist in Python 3 — confirm the
# deployment interpreter is Python 2 before reusing this module.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
class BaseSparkEnvironment(object):
    """Shared bootstrap for Spark session / Spark Streaming / Kafka jobs.

    Subclasses override the class attributes below (at least
    SPARK_APP_NAME) before calling :meth:`create_spark_session`.
    """

    # Spark application settings; subclasses are expected to override
    # SPARK_APP_NAME (and optionally the rest).
    SPARK_APP_NAME = None
    SPARK_MASTER_URL = 'local[*]'
    SPARK_EXECUTOR_CORES = 2
    SPARK_EXECUTOR_MEMORY = '4g'
    SPARK_EXECUTOR_INSTANCES = 2
    SPARK_YARN_QUEUE = None

    # HBase / ZooKeeper connection settings.  Previously hard-coded
    # inside create_spark_session; lifted to class attributes so a
    # subclass can point at a different cluster (defaults unchanged).
    HBASE_ZK_QUORUM = '192.168.0.1'
    HBASE_ZK_CLIENT_PORT = '2181'

    def create_spark_session(self):
        """Build (or fetch the existing) SparkSession from the class config.

        Returns:
            pyspark.sql.SparkSession produced by
            ``SparkSession.builder.config(...).getOrCreate()``.
        """
        spark_conf = SparkConf()
        spark_conf.setAll(
            (
                ("spark.app.name", self.SPARK_APP_NAME),
                ("spark.master", self.SPARK_MASTER_URL),
                ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
                ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
                ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
                ("spark.yarn.queue", self.SPARK_YARN_QUEUE),
                ("hbase.zookeeper.quorum", self.HBASE_ZK_QUORUM),
                ("hbase.zookeeper.property.clientPort", self.HBASE_ZK_CLIENT_PORT),
            )
        )
        return SparkSession.builder.config(conf=spark_conf).getOrCreate()

    @staticmethod
    def create_spark_streaming_context(sc, batch_duration=60):
        """Create a StreamingContext on *sc*.

        Args:
            sc: an existing SparkContext.
            batch_duration: micro-batch interval in seconds.  Default 60
                matches the previously hard-coded value.
        """
        return StreamingContext(sparkContext=sc, batchDuration=batch_duration)

    @staticmethod
    def create_kafka_stream(ssc, group_id=None, topics=None,
                            brokers="192.168.0.1:9092"):
        """Create a direct (receiver-less) Kafka DStream.

        Args:
            ssc: the StreamingContext to attach the stream to.
            group_id: Kafka consumer group id.
            topics: list of topic names to subscribe to.
            brokers: ``metadata.broker.list`` value; the default keeps
                the previously hard-coded broker address.
        """
        kafka_params = {"metadata.broker.list": brokers, "group.id": group_id}
        direct_stream = KafkaUtils.createDirectStream(ssc, topics=topics,
                                                      kafkaParams=kafka_params)
        return direct_stream
# -*- coding:utf-8 -*-
import os
import sys
import json
import time
BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(BASE_PATH))
print sys.path
from online import BaseSparkEnvironment
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
reload(sys)
sys.setdefaultencoding(default_encoding)
os.environ['JAVA_HOME'] = 'C:\Program Files\Java\jdk1.8.0_101'
os.environ['PYSPARK_PYTHON'] = 'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = 'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = 'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = 'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.4 pyspark-shell"
class UserOnlineRecallSort(BaseSparkEnvironment):
    """Streaming job: for each user click event consumed from Kafka, build
    an online recall set from the article-similarity table, rank it with a
    pre-trained logistic-regression CTR model, and persist the ranked list
    to the ``multiple_recall`` and ``history_recall`` HBase tables."""

    def __init__(self):
        # Spark config consumed by BaseSparkEnvironment.create_spark_session().
        self.SPARK_APP_NAME = 'user_online_recall_sort'
        self.SPARK_MASTER_URL = 'local[*]'
        self.ss = self.create_spark_session()
        self.ssc = BaseSparkEnvironment.create_spark_streaming_context(self.ss.sparkContext)
        # Direct Kafka stream over the user click-log topic.
        self.ks = BaseSparkEnvironment.create_kafka_stream(self.ssc, group_id="", topics=["user-click-logger"])

    def gen_user_recall_sort(self):
        """Register the per-batch recall/rank handler on the click stream.

        Does not start the StreamingContext; the caller must invoke
        ``ssc.start()`` afterwards.
        """
        def insert_user_recall(rdd):
            # Invoked once per micro-batch.  Imports live inside the
            # handler so they resolve in the executing environment.
            sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
            import json
            import pandas as pd
            from pyspark.ml.linalg import DenseVector
            from recoutils.hbase_utils import HBaseUtils
            hbase_utils = HBaseUtils(host="192.168.0.1", port=9090, size=5)
            from pyspark.ml.classification import LogisticRegressionModel
            # Pre-trained CTR model used to rank the recalled articles.
            logistic_regression_model = LogisticRegressionModel.load(
                "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
            for data in rdd.collect():
                print data
                # NOTE(review): op_type 2/3/4 presumably mark click-like
                # actions worth a recall refresh — confirm against the
                # click-log producer's op_type enum.
                if data['op_type'] in [2, 3, 4]:
                    # Fetch the similar-article list for the clicked article.
                    user_id = data['user_id']
                    article_id = data['article_id']
                    channel_id = data['channel_id']
                    art_sim_art_result = hbase_utils.read_rows("article_similarity",
                                                               [b"{}".format(article_id)],
                                                               columns=[b"sim"])
                    if art_sim_art_result:
                        # Sort by similarity score (descending) and keep the top 5.
                        sorted_result = sorted(art_sim_art_result[0][1].items(), key=lambda item: item[1], reverse=True)
                        art_sim_art_list = [art_sim[0].split(":")[1] for art_sim in sorted_result][:5]
                        print "{} sim arts {}".format(article_id, art_sim_art_list)
                        # Load this user's historical recalls for the channel.
                        history_recall_list = hbase_utils.read_cells("history_recall",
                                                                     "u:{}".format(user_id).encode(),
                                                                     "channel:{}".format(channel_id).encode())
                        history_recommend_list = []
                        for history_recall in history_recall_list:
                            # NOTE(review): eval() on HBase cell contents —
                            # assumes the cells are trusted; json.loads
                            # would be safer if the format allows.
                            history_recommend_list.extend(eval(history_recall))
                        # Drop anything that was already recommended before.
                        recommend_list = list(set(art_sim_art_list) - set(history_recommend_list))
                        print "recommend_list {}".format(recommend_list)
                        if recommend_list:
                            # Assemble (user, article, feature-vector) rows for ranking.
                            recommend_data_list = []
                            try:
                                # User-side features from the CTR feature store.
                                user_features = eval(hbase_utils.read_row('ctr_user_features', str(user_id),
                                                                          ["cf:{}".format(channel_id)]).values()[0])
                                print "user {} features {}".format(user_id, user_features)
                                for r_article_id in recommend_list:
                                    try:
                                        # Article-side features from the CTR feature store.
                                        article_features = eval(hbase_utils.read_row('ctr_article_features', r_article_id,
                                                                                     ["cf:{}".format(r_article_id)]).values()[0])
                                        print "article {} features {}".format(article_id, article_features)
                                    except Exception as e:
                                        # Missing article features: fall back to a zero vector.
                                        article_features = [0.0] * 111
                                    # Feature vector = channel id + user features
                                    # + article features (first element dropped).
                                    features = []
                                    features.append(channel_id)
                                    features.extend(user_features)
                                    features.extend(article_features[1:])
                                    recommend_data_list.append([user_id, r_article_id, DenseVector(features)])
                            except Exception as e:
                                print e.message
                            # Score the candidates with the CTR model.
                            recommend_data_df = self.ss.createDataFrame(
                                pd.DataFrame(recommend_data_list, columns=["user_id", "article_id", "features"]))
                            logistic_regression_result = logistic_regression_model.transform(recommend_data_df)
                            logistic_regression_result.select(["article_id", "probability"]).show()
                            # Sort by predicted click probability, descending.
                            logistic_regression_result = logistic_regression_result.rdd.map(
                                lambda row: (row.article_id, float(row.probability[1]))) \
                                .toDF(["article_id", "probability"]).sort("probability", ascending=False)
                            logistic_regression_result.show()
                            recommend_list = [i.article_id for i in logistic_regression_result.collect()]
                            print recommend_list
                            # Persist the ranked recall and append it to the
                            # user's channel history so it is filtered next time.
                            hbase_utils.insert("multiple_recall",
                                               "u:{}".format(user_id).encode(),
                                               {"online:{}".format(channel_id).encode(): json.dumps(
                                                   recommend_list).encode()})
                            hbase_utils.insert("history_recall",
                                               "u:{}".format(user_id).encode(),
                                               {"channel:{}".format(channel_id).encode(): json.dumps(
                                                   recommend_list).encode()})
        self.ks.map(lambda x: json.loads(x[1])).foreachRDD(insert_user_recall)
if __name__ == '__main__':
    # Build the streaming job, register the recall-sort handler, then
    # start the StreamingContext and block until it terminates.
    online_recall_job = UserOnlineRecallSort()
    online_recall_job.gen_user_recall_sort()
    online_recall_job.ssc.start()
    online_recall_job.ssc.awaitTermination()