Every request carries a unique identifier field, so for each request we can recover which products were exposed, which products were clicked, and the feature values at the moment the request happened.
%spark_recommend.pyspark
from datetime import datetime, timedelta
from itertools import zip_longest
from typing import List, Tuple

from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, lit, json_tuple, col  # pylint: disable=no-name-in-module
from pyspark.sql.types import BooleanType
DWD_MD_APP_LOG = 'dwd.dwd_md_app_log_nocheat'
is_scale = True
def _compare_version(version):
    """Yield the integer components of a dotted version string, e.g. "7.29.0" -> 7, 29, 0."""
    i = 0
    for j, c in enumerate(version):
        if c == ".":  # compare by value; `is` on strings is unreliable
            yield int(version[i:j])
            i = j + 1
    yield int(version[i:])
def compare_version2(version1: str, version2: str):
    """Return True if version1 >= version2, comparing component by component."""
    nums1 = _compare_version(version1)
    nums2 = _compare_version(version2)
    for a, b in zip_longest(nums1, nums2, fillvalue=0):
        if a != b:
            return a > b
    return True
compare_version = udf(compare_version2, BooleanType())
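# Quick sanity checks for the version helper (hypothetical, not part of the
# pipeline): compare_version2 implements ">=" over dotted version strings.
assert compare_version2("7.30.1", "7.29.0")      # newer version passes
assert compare_version2("7.29.0", "7.29.0")      # equal versions pass (>=)
assert not compare_version2("7.28.9", "7.29.0")  # older version is filtered out
assert compare_version2("7.29", "7.29.0")        # shorter versions are zero-padded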
def scale_data(data_train: DataFrame, data_test: DataFrame):
    data_train = data_train.withColumnRenamed("features", "_features")
    data_test = data_test.withColumnRenamed("features", "_features")
    # Min-max normalize the feature vectors.
    scaler = MinMaxScaler(inputCol="_features",
                          outputCol="features").fit(data_train)
    data_train = scaler.transform(data_train).select("features", "label")
    data_test = scaler.transform(data_test).select("features", "label")
    return data_train, data_test, scaler
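# Note: the scaler is fit on the training split only and then applied to both
# splits, so the test set never influences the scaling parameters (no leakage).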
def _get_product_search_label(spark: SparkSession, start: str, end: str,
                              name: str, min_ver="7.29.0") -> DataFrame:
    """Collect product exposures or clicks on the search page.
    Arguments:
        spark {SparkSession}
        start {str} -- start date for the dp field, yyyy-mm-dd.
        end {str} -- end date for the dp field, yyyy-mm-dd.
        name {str} -- "exposure" for exposures, "click" for clicks.
    Keyword Arguments:
        min_ver {str} -- minimum app version (default: {"7.29.0"})
    Returns:
        DataFrame -- ["request_key", "dp", "product_id"]
    """
    assert name in ("exposure", "click")
    if name == "exposure":
        from_action_id = 959
    elif name == "click":
        from_action_id = 958
    data = spark.table(DWD_MD_APP_LOG)
    data = data.withColumn("min_ver", lit(min_ver))
    data = data.filter(data.dp >= start)\
        .filter(data.dp < end)\
        .filter(data.type == 1)\
        .filter(data.curr_page_id == 51)\
        .filter(data.from_action_id == from_action_id)\
        .filter(compare_version(data.ver, data.min_ver))\
        .selectExpr("from_action_ext['exposure_ext'] as exposure_ext",
                    "from_action_ext['product_id'] as product_id",
                    "dp")\
        .select(json_tuple("exposure_ext", "request_key").alias("request_key"),
                "product_id", "dp")\
        .filter(col("request_key").isNotNull())\
        .distinct()
    return data
def get_product_search_label(spark: SparkSession, start: str, end: str) -> DataFrame:
    """Join search-page product exposures with clicks to build labels.
    Arguments:
        spark {SparkSession}
        start {str} -- start date for the dp field, yyyy-mm-dd.
        end {str} -- end date for the dp field, yyyy-mm-dd.
    Returns:
        DataFrame -- ["request_key", "label", "dp", "product_id"]
    """
    exposure = _get_product_search_label(spark, start, end, "exposure")
    # A request that logged at least one click marks every product exposed
    # under it with label 1; deduplicate so the left join cannot fan out.
    click = _get_product_search_label(spark, start, end, "click")\
        .withColumn("label", lit(1)).drop("dp", "product_id").distinct()
    ret = exposure.join(click, on="request_key", how="left")\
        .select("request_key", "label", "dp", "product_id")\
        .fillna(0)
    return ret
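# Toy illustration of the labeling join above (hypothetical data; uncomment to
# run once a SparkSession exists). Every product exposed under a request that
# logged at least one click gets label 1:
# exposure_demo = spark.createDataFrame(
#     [("r1", "2019-10-01", "p1"), ("r1", "2019-10-01", "p2"),
#      ("r2", "2019-10-01", "p3")],
#     ["request_key", "dp", "product_id"])
# click_demo = spark.createDataFrame([("r1", 1)], ["request_key", "label"])
# labeled_demo = exposure_demo.join(click_demo, on="request_key", how="left").fillna(0)
# -> r1/p1 and r1/p2 end up with label 1; r2/p3 gets label 0.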
def get_product_search_features(spark: SparkSession, start: str, end: str) -> DataFrame:
    """Feature log for products on the search page.
    Arguments:
        spark {SparkSession}
        start {str} -- start date for the dp field, yyyy-mm-dd.
        end {str} -- end date for the dp field, yyyy-mm-dd.
    Returns:
        DataFrame -- see the col_nm variable for the column list
    """
    col_nm = ["ctr", "real_on_sale_yn", "cvr_min_est", "sold_cnt", "district_2_matched",
              "district_1_matched", "request_key", "explain"]
    data = spark.table("ods_log.ods_log_feed_elastic_search_feature_log")
    data = data.filter(data.dp >= start)\
        .filter(data.dp < end)\
        .select(json_tuple(*(["value"] + col_nm)).alias(*col_nm))\
        .filter(col("explain") != "{}")
    score_col_nm = ["title_import", "titleimport", "all_string", "hospital_name",
                    "title_bad", "doctor_name", "district_name", "item_all",
                    "menu1_name", "menu2_name"]
    data = data.select(
        *col_nm, json_tuple(*(["explain"] + score_col_nm)).alias(*score_col_nm))
    data = data.drop("explain")
    return data
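# Minimal demo of the json_tuple pattern used above (hypothetical data):
# json_tuple extracts several fields from a JSON string column in one pass and
# names the resulting string columns via .alias(...).
# demo = spark.createDataFrame([('{"ctr": "0.12", "request_key": "r1"}',)], ["value"])
# demo = demo.select(json_tuple("value", "ctr", "request_key").alias("ctr", "request_key"))
# -> one row with ctr="0.12" and request_key="r1" (json_tuple always returns strings).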
def merge_features_label(spark, start: str, end: str) -> Tuple[DataFrame, List[str]]:
    """Join the product-search features with the labels.
    Arguments:
        spark {SparkSession}
        start {str} -- start date for the dp field, yyyy-mm-dd.
        end {str} -- end date for the dp field, yyyy-mm-dd.
    Returns:
        Tuple[DataFrame, List[str]] -- the assembled data and the feature column names
    """
    # Merge features and labels.
    features = get_product_search_features(spark, start, end)
    label = get_product_search_label(spark, start, end)
    # Add the product price, capped at 20000 (the 95th percentile).
    product = spark.table('dim.dim_product_info').select('product_id', F.when(col(
        "price_online") <= 20000, col("price_online")).otherwise(20000).alias('price_online'))
    raw_data = label.join(features, on="request_key")\
        .join(product, on="product_id", how="left")\
        .drop("request_key", 'product_id')
    # Cast strings to float; dp must stay a string, or it would become null.
    data = raw_data.select(*(col(x).cast("float").alias(x) if x != "dp" else col(x)
                             for x in raw_data.columns))
    # Fill missing values.
    data = data.fillna(0.0)
    # Assemble the feature columns into a single vector column.
    input_cols = data.columns
    input_cols.remove("label")  # keep only feature columns
    input_cols.remove("dp")
    df_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    # e.g. Row(features=SparseVector(17, {0: 0.0859, 3: 266.0, 4: 1.0, 5: 1.0,
    #          8: 7.6944, 16: 980.0}), label=0.0, dp='2019-10-01')
    data = df_assembler.transform(data).select("features", "label", "dp")
    return data, input_cols
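# Small hypothetical sketch of what VectorAssembler produces: several numeric
# columns packed into a single "features" vector per row.
# demo_rows = spark.createDataFrame([(0.1, 3.0, 0.0), (0.5, 1.0, 1.0)],
#                                   ["ctr", "sold_cnt", "label"])
# demo_vec = VectorAssembler(inputCols=["ctr", "sold_cnt"],
#                            outputCol="features").transform(demo_rows)
# -> first row: Row(ctr=0.1, sold_cnt=3.0, label=0.0, features=DenseVector([0.1, 3.0]))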
def train_test_split(data: DataFrame, n_test_days=2) -> Tuple[DataFrame, DataFrame]:
    """Split the data by date: the last n_test_days partitions become the test set.
    Arguments:
        data {DataFrame} -- assembled data with a dp partition column
    Keyword Arguments:
        n_test_days {int} -- number of trailing days held out for testing (default: {2})
    Returns:
        Tuple[DataFrame, DataFrame] -- (data_train, data_test)
    """
    dps = [x[0] for x in data.select(data.dp).distinct().collect()]
    print("dps", dps, "\n", "n_test_days", n_test_days, "\n")
    assert len(dps) > n_test_days, "Need more than %d days of data to split!" % n_test_days
    split = sorted(dps)[-n_test_days]
    data_train = data.filter(data.dp < split).drop("dp")
    data_test = data.filter(data.dp >= split).drop("dp")
    return data_train, data_test
n_days = 5
spark = SparkSession.builder.appName("search_statistics").getOrCreate()
time_fmt = "%Y-%m-%d"  # date range boundaries
end = datetime.now().strftime(time_fmt)
start = (datetime.now() - timedelta(days=n_days)).strftime(time_fmt)
# Merge features and labels; the steps are identical to merge_features_label above.
data, input_cols = merge_features_label(spark, start, end)
# Split into train and test sets by dp.
data_train, data_test = train_test_split(data, n_test_days=1)
scaler = None
# Normalize the features.
if is_scale:
    data_train, data_test, scaler = scale_data(data_train, data_test)
data_train.cache()
data_test.cache()
# # Train the model
# clf = train_model(data_train)
# # Evaluate the model
# evaluate_model(clf, data_train, data_test, input_cols, scaler)
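# train_model and evaluate_model are not defined in this paragraph. Below is a
# minimal hypothetical sketch of what they could look like, assuming a
# logistic-regression baseline and AUC evaluation; it is not the author's
# actual implementation.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def train_model(data_train: DataFrame):
    # Fit a logistic-regression baseline on the assembled features/label columns.
    lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=50)
    return lr.fit(data_train)

def evaluate_model(clf, data_train: DataFrame, data_test: DataFrame, input_cols, scaler):
    # Report areaUnderROC on both splits to spot overfitting.
    evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
    print("train AUC:", evaluator.evaluate(clf.transform(data_train)))
    print("test AUC:", evaluator.evaluate(clf.transform(data_test)))
    # Pair each raw input column with its learned weight; scaler is unused in
    # this sketch but kept so the signature matches the commented-out call above.
    for name, weight in zip(input_cols, clf.coefficients.toArray()):
        print(name, weight)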