LR Model: Product Search in Practice

Every request carries a unique identifier field. With it we can recover which products were exposed in that request, which of them were clicked, and the feature values at the moment the request was made.

%spark_recommend.pyspark
import pandas as pd
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.linalg import SparseVector, Vectors, VectorUDT, DenseVector
from datetime import datetime, timedelta
from typing import Tuple
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, lit, json_tuple, count, first, col, asc, desc, sum as spark_sum  # pylint: disable=no-name-in-module
from pyspark.sql.types import BooleanType, LongType
from itertools import zip_longest

DWD_MD_APP_LOG = 'dwd.dwd_md_app_log_nocheat'
is_scale = True

def _compare_version(version):
    """Yield the integer components of a dotted version string, e.g. "7.29.0" -> 7, 29, 0."""
    i = 0
    for j, c in enumerate(version):
        if c == ".":  # use ==, not `is`: identity comparison of strings is unreliable
            yield int(version[i:j])
            i = j + 1
    yield int(version[i:])

def compare_version2(version1: str, version2: str):
    """Return True if version1 >= version2, comparing component by component."""
    nums1 = _compare_version(version1)
    nums2 = _compare_version(version2)
    for a, b in zip_longest(nums1, nums2, fillvalue=0):
        if a != b:
            return a > b
    return True

compare_version = udf(compare_version2, BooleanType())
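
# A quick plain-Python sanity check of the version comparison; the version
# strings below are made-up examples, not values taken from the logs.
assert compare_version2("7.30.1", "7.29.0")      # newer than the minimum -> kept
assert compare_version2("7.29.0", "7.29.0")      # equal to the minimum   -> kept
assert not compare_version2("7.28.9", "7.29.0")  # older than the minimum -> filtered out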

def scale_data(data_train: DataFrame, data_test: DataFrame):
    """Min-max scale the feature vectors; the scaler is fit on the training set only to avoid leakage."""
    data_train = data_train.withColumnRenamed("features", "_features")
    data_test = data_test.withColumnRenamed("features", "_features")
    # Normalize the raw feature vectors into [0, 1]
    scaler = MinMaxScaler(inputCol="_features",
                          outputCol="features").fit(data_train)
    data_train = scaler.transform(data_train).select("features", "label")
    data_test = scaler.transform(data_test).select("features", "label")
    return data_train, data_test, scaler
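
# A minimal sketch showing that scale_data fits the min-max ranges on the training
# set only and reuses them for the test set; the toy vectors are hypothetical, and
# this assumes the Zeppelin-provided `spark` session is already available here.
_toy_train = spark.createDataFrame(
    [(Vectors.dense([0.0, 10.0]), 0.0), (Vectors.dense([1.0, 20.0]), 1.0)],
    ["features", "label"])
_toy_test = spark.createDataFrame(
    [(Vectors.dense([0.5, 15.0]), 0.0)], ["features", "label"])
_scaled_train, _scaled_test, _fitted_scaler = scale_data(_toy_train, _toy_test)
_scaled_test.show(truncate=False)  # the test row is scaled with the train min/max -> [0.5, 0.5]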
    
def _get_product_search_label(spark: SparkSession, start: str, end: str,
                              name: str, min_ver="7.29.0") -> DataFrame:
    """拼接搜索页面商品的曝光、点击。

    Arguments:
        spark {SparkSession}
        start {str} -- dp字段开始日期,yyyy-mm-dd。
        end {str} -- dp字段结束日期,yyyy-mm-dd。
        name {str} -- "exposure"代表曝光, "click"代表点击。

    Keyword Arguments:
        min_ver {str} -- 最小版本号 (default: {"7.29.0"})

    Returns:
        DataFrame -- ["request_key", "dp", "product_id"]
    """
    assert name in ("exposure", "click")
    if name == "exposure":
        from_action_id = 959
    if name == "click":
        from_action_id = 958
    data = spark.table(DWD_MD_APP_LOG)
    data = data.withColumn("min_ver", lit(min_ver))
    data = data.filter(data.dp >= start)\
        .filter(data.dp < end)\
        .filter(data.type == 1)\
        .filter(data.curr_page_id == 51)\
        .filter(data.from_action_id == from_action_id)\
        .filter(compare_version(data.ver, data.min_ver))\
        .selectExpr("from_action_ext['exposure_ext'] as exposure_ext",
                    "from_action_ext['product_id'] as product_id",
                    "dp")\
        .select(json_tuple("exposure_ext", "request_key").alias("request_key"), "product_id", "dp")\
        .filter(col("request_key").isNotNull())\
        .distinct()

    return data
    
def get_product_search_label(spark: SparkSession, start: str, end: str) -> DataFrame:
    """拼接搜索页面商品的曝光、点击。

    Arguments:
        spark {SparkSession}
        start {str} -- dp字段开始日期,yyyy-mm-dd。
        end {str} -- dp字段结束日期,yyyy-mm-dd。

    Returns:
        DataFrame -- ["request_key", "label", "dp", "product_id"]
    """
    exposure = _get_product_search_label(spark, start, end, "exposure")
    click = _get_product_search_label(spark, start, end, "click")\
        .select("request_key").distinct()\
        .withColumn("label", lit(1))
    # Exposures with no matching click get a NULL label, which fillna turns into 0.
    ret = exposure.join(click, on="request_key", how="left")\
        .select("request_key", "label", "dp", "product_id")\
        .fillna(0)

    return ret
    
def get_product_search_features(spark: SparkSession, start: str, end: str) -> DataFrame:
    """搜索页面商品的日志。

    Arguments:
        spark {SparkSession}
        start {str} -- dp字段开始日期,yyyy-mm-dd。
        end {str} -- dp字段结束日期,yyyy-mm-dd。

    Returns:
        DataFrame -- 见变量col_nm
    """
    col_nm = ["ctr", "real_on_sale_yn", "cvr_min_est", "sold_cnt", "district_2_matched",
              "district_1_matched", "request_key", "explain"]
    data = spark.table("ods_log.ods_log_feed_elastic_search_feature_log")
    data = data.filter(data.dp >= start)\
        .filter(data.dp < end)\
        .select(json_tuple(*(["value"] + col_nm)).alias(*col_nm))\
        .filter(col("explain") != "{}")
    score_col_nm = ["title_import", "titleimport", "all_string", "hospital_name",
                    "title_bad", "doctor_name", "district_name", "item_all",
                    "menu1_name", "menu2_name"]
    data = data.select(
        *col_nm, json_tuple(*(["explain"] + score_col_nm)).alias(*score_col_nm))
    data = data.drop("explain")

    return data
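
# A minimal illustration of how json_tuple flattens the JSON "value" column above;
# the JSON string is a made-up example, and this assumes the Zeppelin `spark`
# session is available at this point in the notebook.
_demo = spark.createDataFrame([('{"ctr": "0.08", "sold_cnt": "12"}',)], ["value"])
_demo.select(json_tuple("value", "ctr", "sold_cnt").alias("ctr", "sold_cnt")).show()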
    
def merge_features_label(spark, start: str, end: str) -> Tuple[DataFrame, list]:
    """Join the product-search features with the labels.

    Arguments:
        spark {SparkSession}
        start {str} -- start date for the dp partition field, yyyy-mm-dd.
        end {str} -- end date for the dp partition field, yyyy-mm-dd.

    Returns:
        Tuple[DataFrame, list] -- the assembled (features, label, dp) DataFrame and the feature column names
    """
    # Merge features and labels
    features = get_product_search_features(spark, start, end)
    label = get_product_search_label(spark, start, end)
    # Add the product price, capped at 20000 (the 95th percentile)
    product = spark.table('dim.dim_product_info').select('product_id', F.when(col(
        "price_online") <= 20000, col("price_online")).otherwise(20000).alias('price_online'))

    raw_data = label.join(features, on="request_key")\
        .join(product, on="product_id", how="left")\
        .drop("request_key", 'product_id')
    # Cast string columns to float; dp must not be cast or it would become NULL.
    data = raw_data.select(*(col(x).cast("float").alias(x) if x != "dp" else col(x)
                             for x in raw_data.columns))
    # Fill missing values
    data = data.fillna(0.0)
    # Assemble the feature columns into a single vector
    input_cols = data.columns
    input_cols.remove("label")
    input_cols.remove("dp")
    df_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data = df_assembler.transform(data).select("features", "label", "dp")
    return data, input_cols
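
# Example usage (merge_features_label wraps the same steps that are also run
# inline in the main flow further below; the dates here are placeholder examples):
#   data, input_cols = merge_features_label(spark, "2019-10-01", "2019-10-06")
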
def train_test_split(data: DataFrame, n_test_days=2) -> Tuple[DataFrame, DataFrame]:
    """[summary]

    Arguments:
        data {DataFrame} -- [description]

    Keyword Arguments:
        n_test_days {int} -- [description] (default: {2})

    Returns:
        Tuple[DataFrame, DataFrame] -- [description]
    """
    dps = [x[0] for x in data.select(data.dp).distinct().collect()]
    print("dps", dps, "\n", "n_test_days", n_test_days, "\n")
    assert len(dps) > n_test_days, "Need more than %d days of data!" % n_test_days
    split = sorted(dps)[-n_test_days]
    data_train = data.filter(data.dp < split).drop("dp")
    data_test = data.filter(data.dp >= split).drop("dp")

    return data_train, data_test

n_days = 5
spark = SparkSession.builder.appName("search_statistics").getOrCreate()
time_fmt = "%Y-%m-%d"  # format for the start/end dates below
end = datetime.now().strftime(time_fmt)
start = (datetime.now() - timedelta(days=n_days)).strftime(time_fmt)
# Merge features and labels
features = get_product_search_features(spark, start, end)
label = get_product_search_label(spark, start, end)
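# Optional sanity check (not in the original flow): search CTR data is usually
# heavily skewed toward label 0, so it is worth eyeballing the class balance.
label.groupBy("label").count().show()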
# Add the product price, capped at 20000 (the 95th percentile)
product = spark.table('dim.dim_product_info').select('product_id', F.when(col("price_online") <= 20000, col("price_online")).otherwise(20000).alias('price_online'))
raw_data = label.join(features, on="request_key")\
    .join(product, on="product_id", how="left")\
    .drop("request_key", 'product_id')
# Cast string columns to float; dp must not be cast or it would become NULL.
data = raw_data.select(*(col(x).cast("float").alias(x) if x != "dp" else col(x)
                         for x in raw_data.columns))
# Fill missing values
data = data.fillna(0.0)
# Assemble the feature columns into a single vector
input_cols = data.columns
input_cols.remove("label")  # keep only feature columns
input_cols.remove("dp")
df_assembler = VectorAssembler(inputCols=input_cols, outputCol='features')  # combine into one vector column
data = df_assembler.transform(data).select("features", "label", "dp")
# e.g. Row(features=SparseVector(17, {0: 0.0859, 3: 266.0, 4: 1.0, 5: 1.0, 8: 7.6944, 16: 980.0}), label=0.0, dp='2019-10-01')



# Split into training and test sets
data_train, data_test = train_test_split(data, n_test_days=1)  # split by the dp partition
scaler = None
# Min-max scale the features
if is_scale:
    data_train, data_test, scaler = scale_data(data_train, data_test)
data_train.cache()
data_test.cache()
# # Train the model
# clf = train_model(data_train)
# # Evaluate the model
# evaluate_model(clf, data_train, data_test, input_cols, scaler)
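
# train_model and evaluate_model are not defined in this paragraph. Below is a
# minimal sketch of what they could look like for the LR model in the title
# (LogisticRegression plus AUC from BinaryClassificationEvaluator); the
# hyperparameters are illustrative assumptions, not values from the original post.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def train_model(data_train: DataFrame):
    """Fit a logistic regression CTR model on the assembled feature vectors."""
    lr = LogisticRegression(featuresCol="features", labelCol="label",
                            maxIter=100, regParam=0.01)  # assumed hyperparameters
    return lr.fit(data_train)

def evaluate_model(clf, data_train: DataFrame, data_test: DataFrame,
                   input_cols, scaler=None):
    """Print train/test AUC and the learned weight for each input column."""
    # scaler is accepted to match the call above; it is not used in this sketch.
    evaluator = BinaryClassificationEvaluator(labelCol="label",
                                              metricName="areaUnderROC")
    for split_name, df in (("train", data_train), ("test", data_test)):
        print("%s AUC: %.4f" % (split_name, evaluator.evaluate(clf.transform(df))))
    # Coefficients follow the order of input_cols (after optional min-max scaling).
    for col_name, weight in zip(input_cols, clf.coefficients.toArray()):
        print(col_name, weight)

# With these definitions in place, the two commented-out calls above can be enabled.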

 
