数据说明
1. test_format1.csv 为最终要预测的 用户和商家数据
2. train_format2.csv 为带标签的训练数据
3. user_info_format1.csv 为用户基本信息
4. user_log_format1.csv 为全量5000多万的用户行为数据
5. user_log_format2.csv 为经过删减之后的,少量的400多万的用户行为数据
注意:我电脑比较垃圾跑不动,学习了他人帖子对数据进行了拆分才得以继续。
背景
由于消费者在“双十一”,“双十二”等购物节进行消费时,心仪的物品往往达不到消费券的使用额度,常常会选择一些本来不需要购买的物品, 而这些物品并不会成为消费者再次购买的物品。
商家开展大型促销活动以吸引大量新买家。然而,许多吸引过来的购买者只是一次性交易,这些促销对于转换为长期顾客来说可能收效甚微。为了有针对性地向潜在重复购买用户开展营销活动,商家必须确定哪些人可以转化为重复购买者。通过对这些潜在的忠诚客户进行精细化营销,商家可以大大降低促销成本,提高投资回报率。目前提供的项目数据是双十一及过去6个月内用户的日志信息,预测未来6个月内用户是否在同一商铺中重复购买。
1.数据探索与预处理
1.1
模块与数据导入
import gc
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Analysis / modeling libraries
# Train-validation splitting
from sklearn.model_selection import train_test_split
# Stratified, group-aware CV splitter
# NOTE(review): imported here but the ensembling section below uses a plain
# StratifiedKFold — this import may be unused; confirm before removing.
from sklearn.model_selection import StratifiedGroupKFold
import lightgbm as lgb
import xgboost as xgb
%%time
# Load the raw data.
# User behaviour log; time_stamp is kept as a string so it can be parsed
# explicitly into datetimes later (section 1.4).
user_log = pd.read_csv('user_log_format2.csv', dtype = {'time_stamp':'str'})
# User profile table (age_range / gender are used after the merge below)
user_info = pd.read_csv('user_info_format1.csv')
# Labelled training pairs and the (user, merchant) pairs to predict
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
1.2查看数据
# Quick sanity check: shape and schema of every loaded frame, plus sample rows.
frames = (user_log, user_info, train_data, test_data)
print('---data shape---')
for frame in frames:
    print(frame.shape)
print('---data info ---')
for frame in frames:
    print(frame.info())
display(user_info.head())
display(train_data.head(),test_data.head())
1.3联合数据
all_data = all_data.merge(user_info, on='user_id', how='left')
display(all_data.shape,all_data.head());
del train_data,test_data,user_info
gc.collect();
%%time
display(user_log.info())
%%time
display(user_log.head())
1.4转换数据类型
%%time
# 用户行为数据类型转换
user_log['user_id'] = user_log['user_id'].astype('int32')
user_log['merchant_id'] = user_log['merchant_id'].astype('int32')
user_log['item_id'] = user_log['item_id'].astype('int32')
user_log['cat_id'] = user_log['cat_id'].astype('int32')
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int32')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
user_log['action_type'] = user_log['action_type'].astype('int32')
display(user_log.info(),user_log.head())
display(all_data.isnull().sum())
# 缺失值填充
all_data['age_range'].fillna(0, inplace=True)
all_data['gender'].fillna(2, inplace=True)
all_data.isnull().sum()
all_data['age_range'] = all_data['age_range'].astype('int8')
all_data['gender'] = all_data['gender'].astype('int8')
all_data['label'] = all_data['label'].astype('str')
all_data['user_id'] = all_data['user_id'].astype('int32')
all_data['merchant_id'] = all_data['merchant_id'].astype('int32')
all_data.info()
1.5构建用户的特征工程
%%time
##### 特征处理
##### User特征处理
groups = user_log.groupby(['user_id'])
# 用户交互行为数量 u1
temp = groups.size().reset_index().rename(columns={0:'u1'})
all_data = all_data.merge(temp, on='user_id', how='left')
# 细分
# 使用 agg 基于列的聚合操作,统计唯一值个数 item_id, cat_id, merchant_id, brand_id
# 用户,交互行为:点了多少商品呢?
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')
# 用户,交互行为,具体统计:类目多少
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
all_data = all_data.merge(temp, on='user_id', how='left')
# 购物时间间隔特征 u6 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('B_time', 'max')]).reset_index()
temp['u6'] = (temp['B_time'] - temp['F_time']).dt.seconds/3600
all_data = all_data.merge(temp[['user_id', 'u6']], on='user_id', how='left')
# 统计操作类型为0,1,2,3的个数
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
columns={0:'u7', 1:'u8', 2:'u9', 3:'u10'})
all_data = all_data.merge(temp, on='user_id', how='left')
del temp,groups
gc.collect()
all_data.head()
1.6构建店铺的特征工程
%%time
##### 商家特征处理
groups = user_log.groupby(['merchant_id'])
# 商家被交互行为数量 m1
temp = groups.size().reset_index().rename(columns={0:'m1'})
all_data = all_data.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的 user_id, item_id, cat_id, brand_id 唯一值
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(
columns={
'user_id':'m2',
'item_id':'m3',
'cat_id':'m4',
'brand_id':'m5'})
all_data = all_data.merge(temp, on='merchant_id', how='left')
# 统计商家被交互的 action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
columns={0:'m6', 1:'m7', 2:'m8', 3:'m9'})
all_data = all_data.merge(temp, on='merchant_id', how='left')
del temp
gc.collect()
1.7将用户和店铺特征联合
%%time
##### 用户+商户特征
groups = user_log.groupby(['user_id', 'merchant_id'])
# 用户在不同商家交互统计
temp = groups.size().reset_index().rename(columns={0:'um1'})
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 统计用户在不同商家交互的 item_id, cat_id, brand_id 唯一值
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(
columns={
'item_id':'um2',
'cat_id':'um3',
'brand_id':'um4'})
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 统计用户在不同商家交互的 action_type 唯一值
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
columns={
0:'um5',
1:'um6',
2:'um7',
3:'um8'})
all_data = all_data.merge(temp, on=['user_id', 'merchant_id'], how='left')
# 统计用户在不同商家购物时间间隔特征 um9 按照小时
temp = groups['time_stamp'].agg([('F_time', 'min'), ('B_time', 'max')]).reset_index()
temp['um9'] = (temp['B_time'] - temp['F_time']).dt.seconds/3600
all_data = all_data.merge(temp[['user_id','merchant_id','um9']], on=['user_id', 'merchant_id'], how='left')
del temp,groups
gc.collect()
此时可以构建出37个特征。
1.8点击比
# Purchase-to-click ratios. Based on the column layout above, u7/m6/um5 are
# action_type-0 (click) counts and u9/m8/um7 are action_type-2 counts —
# presumably purchases; TODO confirm against the dataset documentation.
all_data['r1'] = all_data['u9']/all_data['u7']    # user purchase/click ratio
all_data['r2'] = all_data['m8']/all_data['m6']    # merchant purchase/click ratio
all_data['r3'] = all_data['um7']/all_data['um5']  # pair purchase/click ratio
# BUG FIX: a zero click count makes the ratio inf; the later fillna(0) does
# not touch inf and most sklearn estimators reject it, so map inf to NaN
# (which the fill step then turns into 0).
all_data[['r1', 'r2', 'r3']] = all_data[['r1', 'r2', 'r3']].replace([np.inf, -np.inf], np.nan)
display(all_data.head())
1.9 填充空值
# Remaining gaps come from users/merchants absent from the log; zero them out
# and confirm nothing is left missing.
all_data = all_data.fillna(0)
all_data.isnull().sum()
1.10修改年龄字段,并将性别字段转换后合并
%%time
# 修改age_range字段名称为 age_0, age_1, age_2... age_8
# 独立编码
temp = pd.get_dummies(all_data['age_range'], prefix='age')
display(temp.head(10))
all_data = pd.concat([all_data, temp], axis=1)
# 性别转换
temp = pd.get_dummies(all_data['gender'], prefix='g')
all_data = pd.concat([all_data, temp], axis=1) # 列进行合并
# 删除原数据
all_data.drop(['age_range', 'gender'], axis=1, inplace=True)
del temp
gc.collect()
最终得到47个字段
1.11保存数据
%%time
# train_data、test-data
train_data = all_data[all_data['origin'] == 'train'].drop(['origin'], axis=1)
test_data = all_data[all_data['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_data.to_csv('train_data1.csv')
test_data.to_csv('test_data.csv')
2.1算法建模
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
# Features and target. The label column holds strings ('0.0'/'1.0') after the
# earlier astype('str'); convert to float once so both fit() and roc_curve()
# see numeric labels (the original converted y_valid only, after training).
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label'].astype('float')
# Hold out 20% for validation. random_state makes the run reproducible and
# stratify keeps the (heavily imbalanced) label ratio equal in both halves.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X, train_y, test_size=.2, random_state=42, stratify=train_y)
# Gradient-boosted tree baseline with default hyperparameters.
gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)
# Probability of the positive class (repeat buyer).
y_scores = gbm.predict_proba(X_valid)[:, 1]
# ROC curve and AUC on the validation split.
fpr, tpr, thresholds = roc_curve(y_valid, y_scores)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve with the chance diagonal for reference.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
得到可视化图形,看出 AUC 值
2.2 LGB模型
# Features / target and an 80-20 train/validation split.
# BUG FIX: the original performed this exact split twice back to back
# (the second silently discarded the first); once is enough.
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=.2)
import lightgbm as lgb
import matplotlib.pyplot as plt
def lgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    """Train a LightGBM classifier with early stopping on validation AUC,
    plot the per-round validation AUC, and return the fitted model."""
    model_lgb = lgb.LGBMClassifier(
        max_depth=10,
        n_estimators=5000,      # upper bound; early stopping picks the real count
        min_child_weight=100,
        colsample_bytree=0.7,
        subsample=0.9,
        learning_rate=0.1,
        verbose=1 if verbose else -1  # honour the verbose argument (was hard-coded to 1)
    )
    # Stop once validation AUC has not improved for 10 rounds.
    early_stopping = lgb.early_stopping(stopping_rounds=10)
    model_lgb.fit(
        X_train,
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        callbacks=[early_stopping]
    )
    print(model_lgb.best_score_['valid_1']['auc'])
    # Extract and plot the validation AUC recorded at each boosting round.
    evals_result = model_lgb.evals_result_
    valid_auc = evals_result['valid_1']['auc']
    plt.figure(figsize=(10, 6))
    plt.plot(valid_auc, label='Validation AUC')
    plt.xlabel('Number of Trees')
    plt.ylabel('AUC')
    plt.title('LightGBM Training')
    plt.legend()
    plt.grid(True)
    plt.show()
    return model_lgb
model_lgb = lgb_train(X_train.values, y_train, X_valid.values, y_valid, verbose=True)
如果你的python版本不能运行fit方法,可以参考我的代码,这个是可以跑的,在jupyter上。
2.3 xgb模型
import xgboost as xgb
def xgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    """Train an XGBoost classifier with early stopping on validation AUC
    and return the fitted model."""
    # The label column holds '0.0'/'1.0' strings; XGBoost needs integer classes.
    y_train = y_train.replace({'0.0': 0, '1.0': 1})
    y_valid = y_valid.replace({'0.0': 0, '1.0': 1})
    model_xgb = xgb.XGBClassifier(
        max_depth=10,  # raw8
        n_estimators=5000,
        min_child_weight=300,
        colsample_bytree=0.7,
        subsample=0.9,
        # BUG FIX: this was misspelled 'learing_rate' and therefore silently
        # ignored, leaving the model on its default learning rate.
        learning_rate=0.1)
    model_xgb.fit(
        X_train,
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=verbose,
        early_stopping_rounds=10)  # stop if validation AUC stalls for 10 rounds
    print(model_xgb.best_score)
    return model_xgb
model_xgb = xgb_train(X_train, y_train, X_valid, y_valid, verbose=False)
这里没有作图,而是直接得出得分~0.6654
2.4开始复购预测
%%time
prob = model_lgb.predict_proba(test_data.values) # 预测
submission = pd.read_csv('test.csv')
# 复购的概率
submission['prob'] = pd.Series(prob[:,1]) # 预测数据赋值给提交数据
display(submission.head())
submission.to_csv('submission_lgb.csv', index=False)
del submission
gc.collect()
2.5 补充一个早停回调的方法,如果前面的LGB算法跑不出来可以参考这个
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import pandas as pd
# K-fold ensembling: train one LightGBM per fold, predict the test set each
# time, and average the per-fold probabilities for the final submission.
skf = StratifiedKFold(n_splits=100, shuffle=True, random_state=42)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
# Series -> 1-D ndarray so the positional indexing below works.
train_y = train_y.ravel()
pred_lgbms = []  # one column of test-set probabilities per fold
for i, (train_index, valid_index) in enumerate(skf.split(train_X, train_y)):
    # BUG FIX: the progress message hard-coded '/10' although n_splits is 100;
    # report the actual fold count instead.
    print('\n=========LGB training use Data {}/{}===========\n'.format(i + 1, skf.get_n_splits()))
    X_train, X_valid = train_X.iloc[train_index], train_X.iloc[valid_index]
    y_train, y_valid = train_y[train_index], train_y[valid_index]
    model_lgb = lgb.LGBMClassifier(
        max_depth=10,
        n_estimators=5000,      # upper bound; early stopping picks the real count
        min_child_weight=100,
        colsample_bytree=0.7,
        subsample=0.9,
        learning_rate=0.1,
        verbose=1
    )
    # Stop a fold once validation AUC has not improved for 100 rounds.
    early_stopping = lgb.early_stopping(stopping_rounds=100)
    model_lgb.fit(
        X_train,
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        callbacks=[early_stopping]
    )
    print(model_lgb.best_score_['valid_1']['auc'])
    pred = model_lgb.predict_proba(test_data.values)
    pred = pd.DataFrame(pred[:, 1])  # keep only the repeat-purchase probability
    pred_lgbms.append(pred)
# Average the per-fold predictions column-wise into the final score.
pred_lgbms = pd.concat(pred_lgbms, axis=1)
# BUG FIX: the test features were saved as 'test_data.csv' in section 1.11;
# the original read 'test_data1.csv', a file that is never written.
# NOTE(review): 'test.csv' may be the intended submission template — confirm.
submission = pd.read_csv('test_data.csv')
submission['prob'] = pred_lgbms.mean(axis=1)  # mean over all folds
submission.to_csv('submission_KFold_lgb.csv', index=False)
由于设置了很多轮,需要你自己去找出最优效果