京东用户下单预测

最新推荐文章于 2023-02-18 02:02:47 发布

weixin_43961606

最新推荐文章于 2023-02-18 02:02:47 发布

阅读量1k

点赞数 1

文章标签：机器学习 python 数据分析

本文链接：https://blog.csdn.net/weixin_43961606/article/details/107466439

版权

该博客介绍了京东用户下单预测的解决方案，通过构建二分类模型来判断用户是否会在商品上点击或下单。采用User_Model对用户进行建模，数据预处理、特征提取和模型训练是关键步骤。代码结构清晰，包括工具函数、数据预处理和模型训练部分。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

背景与方案说明

用户对商品点击或者下单，体现了极强的用户兴趣倾向，对它的刻画可以用于最终推荐结果展示与搜索结果的排序。
在这种场景下的推荐搜索个性化排序，解决方法是构建二分类模型，判断一个user是否会在一个item上发生点击/下单（训练集根据实际用户行为将标签设为1或者0）。在新的场景下，以[user_feature, item_feature]作为输入，二分类模型输出概率p，再根据概率大小进行推荐和排序。

典型的建模方式是：

		第一种，以【用户特征向量，商品特征向量】的拼接为输入x，以用户是否在该商品上发生点击/下单为标签y，构建二分类模型
		第二种，因为用户×商品的组合数量非常大，我们也可以单独构建用户模型、商品模型，即在上一种建模方式中只使用用户特征或者商品特征，再考虑聚合。

这里实现的方式，是构建User_Model，对用户进行建模

数据说明
在这里插入图片描述

代码说明：

utils.py：工具函数
gen_features.py:数据预处理，提取特征，产生训练集和测试集
user_modelling.py:代码运行入口，训练模型，产生结果

代码结构图：
在这里插入图片描述

代码

import pandas as pd
import pickle, os
from datetime import datetime, timedelta

# Raw action logs, one CSV per file (the file names suggest Feb, Mar and Apr 2016).
action_1_path = "../data/JData_Action_201602.csv"
action_2_path = "../data/JData_Action_201603.csv"
action_3_path = "../data/JData_Action_201604.csv"
# Pickle cache of the cate == 8 subset; written by extract_cate8(), read by gen_action().
action_cate8_path = '../cache/actions_cate8.pkl'
# Comment, product and user tables (all paths are relative to the working directory).
comment_path = "../data/JData_Comment.csv"
product_path = "../data/JData_Product.csv"
user_path = "../data/JData_User.csv"

def extract_cate8():
    """Build and cache the ``cate == 8`` subset of the raw action logs.

    Reads the three monthly action CSVs, concatenates them in file order,
    keeps only the rows whose ``cate`` column equals 8, removes that column,
    and pickles the result to ``action_cate8_path``.

    Returns:
        The filtered DataFrame, without the ``cate`` column.
    """
    action_1 = pd.read_csv(action_1_path)
    action_2 = pd.read_csv(action_2_path)
    action_3 = pd.read_csv(action_3_path)
    actions = pd.concat([action_1, action_2, action_3])
    actions = actions[actions['cate'] == 8]
    # Every remaining row has cate == 8, so the column carries no information.
    del actions['cate']
    # Use a context manager so the cache file is flushed and closed
    # deterministically instead of whenever the handle is garbage-collected.
    with open(action_cate8_path, 'wb') as cache_file:
        pickle.dump(actions, cache_file)

    return actions

def gen_action(start_date, end_date):
    """Return the cate-8 actions whose ``time`` lies in ``[start_date, end_date)``.

    The cate-8 subset is loaded from the pickle cache at ``action_cate8_path``
    when that file exists; otherwise it is rebuilt with ``extract_cate8()``.

    Args:
        start_date: Inclusive lower bound, compared directly against the
            ``time`` column (presumably a date string in the same format as
            the column — verify against the CSV).
        end_date: Exclusive upper bound, compared the same way.

    Returns:
        The rows of the cate-8 action table inside the window.
    """
    if os.path.exists(action_cate8_path):
        # NOTE: pickle.load executes arbitrary code from the file; only safe
        # because this cache is expected to be produced by extract_cate8().
        with open(action_cate8_path, 'rb') as cache_file:
            actions = pickle.load(cache_file)
    else:
        actions = extract_cate8()
    actions = actions[(actions.time >= start_date) & (actions.time < end_date)]
    return actions

def gen_all_cate_action(start_date, end_date):
    """Return actions of every category with ``time`` in ``[start_date, end_date)``.

    Unlike ``gen_action`` there is no cache: the three monthly CSVs are read
    and concatenated (in file order) on every call before filtering.
    """
    monthly_frames = [
        pd.read_csv(csv_path)
        for csv_path in (action_1_path, action_2_path, action_3_path)
    ]
    combined = pd.concat(monthly_frames)
    not_before_start = combined.time >= start_date
    before_end = combined.time < end_date
    return combined[not_before_start & before_end]

def gen_labels(act_start_date, act_end_date, span=5):
    """Return the users who purchased within ``span`` days of ``act_start_date``.

    Args:
        act_start_date: Window start as a ``'%Y-%m-%d'`` string (inclusive).
        act_end_date: Ignored. It is overwritten with
            ``act_start_date + span`` days before use, so the window is always
            ``[act_start_date, act_start_date + span)``.
            NOTE(review): confirm that ignoring the caller's value is intended.
        span: Window length in days.

    Returns:
        A DataFrame with one row per distinct purchasing ``user_id`` and a
        constant ``label`` column equal to 1. If a pickle already exists at
        the cache path it is returned as-is instead.
    """
    # The caller-supplied end date is replaced by start + span (see docstring).
    act_end_date = datetime.strptime(act_start_date, '%Y-%m-%d') + timedelta(days=span)
    act_end_date = act_end_date.strftime('%Y-%m-%d')
    dump_path = '../cache/labels/labels_%s_%s.pkl' % (act_start_date, act_end_date)
    if os.path.exists(dump_path):
        # Close the handle deterministically rather than leaking it.
        with open(dump_path, 'rb') as cache_file:
            labels = pickle.load(cache_file)
    else:
        actions = gen_action(act_start_date, act_end_date)
        # type == 4 is the action type this file treats as a purchase.
        purchases = actions[actions['type'] == 4]
        labels = purchases[['user_id']].drop_duplicates().assign(label=1)
        # Report distinct buyers, matching the message text; the previous
        # code printed the number of purchase rows instead.
        print('buy user num is:', labels.shape[0])
        # NOTE: writing the result back to dump_path is disabled (it was
        # commented out), so this function never creates the cache itself.
    return labels

def gen_sample(end_date, span=5):
    """Return the distinct users with any cate-8 action in the ``span`` days before ``end_date``.

    Args:
        end_date: Window end as a ``'%Y-%m-%d'`` string (exclusive).
        span: Window length in days; the window is
            ``[end_date - span, end_date)``.

    Returns:
        A single-column DataFrame of unique ``user_id`` values. If a pickle
        already exists at the cache path it is returned as-is instead.
    """
    start_date = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=span)
    start_date = start_date.strftime('%Y-%m-%d')
    dump_path = '../cache/samples/samples_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        # Close the handle deterministically rather than leaking it.
        with open(dump_path, 'rb') as cache_file:
            samples = pickle.load(cache_file)
    else:
        actions = gen_action(start_date, end_date)
        samples = actions[['user_id']].drop_duplicates()
        print('samples num is:', samples.shape[0])
        # NOTE: writing the result back to dump_path is disabled (it was
        # commented out), so this function never creates the cache itself.
    return samples

def gen_truth(act_start_date, act_end_date):
    """Return the ``user_id`` of every purchase action in ``[act_start_date, act_end_date)``.

    The result is cached as a pickle keyed by the two dates; later calls
    with the same dates load the cache instead of recomputing.

    Args:
        act_start_date: Inclusive window start, forwarded to ``gen_action``.
        act_end_date: Exclusive window end, forwarded to ``gen_action``.

    Returns:
        A single-column DataFrame of ``user_id``, one row per purchase action
        (duplicates are NOT removed).
    """
    dump_path = '../cache/labels/truth_%s_%s.pkl' % (act_start_date, act_end_date)
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as cache_file:
            actions = pickle.load(cache_file)
    else:
        actions = gen_action(act_start_date, act_end_date)
        # type == 4 is the action type this file treats as a purchase.
        actions = actions[actions['type'] == 4]
        actions = actions[['user_id']]
        # Context manager ensures the cache is fully written and closed.
        with open(dump_path, 'wb') as cache_file:
            pickle.dump(actions, cache_file)
    return actions

def gen_submission(res, top):
    """Write and return the ``top`` highest-probability users as a submission table.

    Args:
        res: DataFrame with at least ``user_id`` and ``prob`` columns.
        top: Number of rows to keep after sorting by ``prob`` descending.

    Returns:
        A new DataFrame with columns ``user_id`` (cast to int) and ``sku_id``
        (always -1). The same table is written, without the index, to
        ``../cache/pred_user.csv``. The input frame is not modified.
    """
    ranked = res.sort_values(by='prob', ascending=False).head(top)
    # Take an explicit copy: assigning columns on a column-subset of a slice
    # is the chained-assignment pattern that raises SettingWithCopyWarning.
    submission = ranked[['user_id']].copy()
    submission['sku_id'] = -1  # no item is predicted; fill every sku_id with -1
    submission['user_id'] = submission['user_id'].astype(int)
    # index=False already suppresses the index column, so no index_label is needed.
    submission.to_csv('../cache/pred_user.csv', index=False)
    print('res num:', submission.shape[0])
    return submission

def user_score(index, pre_Y, truth, threshold):
    """Print, and return, the F11 score of the predicted users against ``truth``.

    F11 is computed as ``6 * P * R / (5 * R + P)`` where ``P`` and ``R`` are
    precision and recall over distinct user ids.

    NOTE(review): the previous body called
    ``gen_submission(index, pre_Y, threshold)``, but ``gen_submission`` in
    this file takes ``(res, top)``, so that call always raised ``TypeError``.
    The adaptation below is a best guess and must be confirmed against the
    caller:
      * assumes ``index`` is a DataFrame with a ``user_id`` column whose rows
        line up positionally with ``pre_Y`` — TODO confirm;
      * ``threshold`` is forwarded unchanged as ``gen_submission``'s ``top``
        (a row count) — TODO confirm it is not meant as a probability cutoff.

    Args:
        index: Frame identifying the scored users (see note above).
        pre_Y: Predicted probabilities, one per row of ``index``.
        truth: DataFrame with a ``user_id`` column of actual purchasers.
        threshold: Forwarded to ``gen_submission`` (see note above).

    Returns:
        The F11 score as a float, or 0.0 when no predicted user is in
        ``truth``. (Previously nothing was returned; the printed output is
        unchanged.)
    """
    candidates = index[['user_id']].copy()
    # list() forces positional assignment even if pre_Y carries its own index.
    candidates['prob'] = list(pre_Y)
    pred = gen_submission(candidates, threshold)

    # A set gives O(1) membership tests; the NumPy array used before was
    # scanned in full for every predicted user.
    truth_users = set(truth['user_id'].unique())
    pred_users = pred['user_id'].unique()
    pos = sum(1 for user_id in pred_users if user_id in truth_users)
    neg = len(pred_users) - pos
    if pos == 0:
        print(0)
        return 0.0
    print('hits', pos)
    Precise = 1.0 * pos / (pos + neg)
    Recall = 1.0 * pos / len(truth_users)
    F11 = 6.0 * Precise * Recall / (5.0 * Recall + Precise)
    print('F11 score', F11)
    return F11

# Running this file directly is a no-op; it only provides helpers for import.
if __name__ == '__main__':
    pass

import numpy as np
from dateutil.parser import parse
from utils import *


def get_hours(start_date, end_date):
    """Return the number of hours from ``start_date`` to ``end_date`` as an int.

    Both arguments are parsed with ``dateutil.parser.parse``. The hour count
    is built from the timedelta's ``days`` and ``seconds`` fields (its
    microseconds are not used) and truncated by ``int()``.
    """
    elapsed = parse(end_date) - parse(start_date)
    whole_day_hours = elapsed.days * 24
    partial_day_hours = elapsed.seconds / 3600
    return int(whole_day_hours + partial_day_hours)

def gen_basic_user_feat():
    """
    用户基本特征
    """
    dump_path = '../cache/user_feature/basic_user_feat.pkl'
    if os.path.exists(dump_path):
        user = pickle.load(open(dump_path, 'rb'))
    else:
        user = pd.read_csv(user_path, encoding='gbk')
        user['age'] = user['age'].replace({
   '-1':0,
                                           '15岁以下':1,
                                           '16-25岁':2,
                                           '26-35岁':3,
                                           '36-45岁':4,
                                           '46-55岁':5,
                                           '56岁以上':6,
                                           })
        age_df = pd.get_dummies(user["age"], prefix="age")
        sex_df = pd.get_dummies(user["sex"], prefix="sex")
        user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
        user

最低0.47元/天解锁文章