用户留存预测挑战赛

用户留存预测挑战赛

爱奇艺AI竞赛官网

第一步:预览比赛数据集

# 导包
import pandas as pd
import numpy as np

1. 导入数据

# Load every competition table from the current working directory.
app_launch_logs = pd.read_csv('app_launch_logs.csv')
user_interaction_data = pd.read_csv('user_interaction_data.csv')
user_playback_data = pd.read_csv('user_playback_data.csv')
user_portrait_data = pd.read_csv('user_portrait_data.csv') # most likely usable as-is, without modification
video_related_data = pd.read_csv('video_related_data.csv') # most likely not needed (selectFeatures below does read it, though)
# sample-a.csv is read with header=None, so its columns are numbered 0, 1, ...
sample_a = pd.read_csv('sample-a.csv', header=None)
test_a = pd.read_csv('test-a.csv')

2. 查看缺失值

app_launch_logs.isnull().sum()
user_id        0
launch_type    0
date           0
dtype: int64
user_interaction_data.isnull().sum()
user_id          0
item_id          0
interact_type    0
date             0
dtype: int64
user_playback_data.isnull().sum()
user_id         0
item_id     60483
playtime        0
date            0
dtype: int64
user_portrait_data.isnull().sum()
user_id                  0
device_type            339
device_ram           37772
device_rom           28434
sex                   6447
age                   7738
education            11003
occupation_status     7983
territory_code       37281
dtype: int64
video_related_data.isnull().sum()
item_id           66
duration      364078
father_id    2820142
tag_list      592337
cast         3775461
dtype: int64
test_a['end_date'].duplicated().sum()
14939

3. 查看待使用数据集大小

app_launch_logs.shape
(8493878, 3)
user_interaction_data.shape
(198608, 4)
user_playback_data.shape
(71046035, 4)
user_portrait_data.shape
(596906, 9)
test_a.shape
(15001, 2)

第二步:提取数据集特征

# Create the (initially empty) feature table: one row per user_id in test_a.
# Given the column listings printed above, the columns are: launch_type, launch_date,
# playback_item_duration, playback_item_father_id, playtime, playback_date,
# the user_portrait_data attributes (device_type ... territory_code), and every
# test_a column after the first (end_date is the one used later).
col = list(app_launch_logs.columns[1:-1]) + ['launch_date']  + ['playback_item_duration','playback_item_father_id'] + list(user_playback_data.columns[2:-1]) + ['playback_date'] + list(user_portrait_data.columns[1:]) + list(test_a.columns[1:])
df = pd.DataFrame(index = list(test_a.user_id), columns=col)

最关键的一个函数,运行时长要几个小时
特征的选取方式有很多种,这个地方还可以将用户留存时长变为开始日期和结束日期

def selectFeatures(tables):
    """Fill the module-level feature table ``df`` with per-user aggregates.

    ``tables`` is consumed by position:

    1. launch logs   -> mean ``launch_type`` and span (max - min) of ``date``
    2. playback logs -> mean ``duration`` and ``father_id`` of the videos the
       user played (looked up in the module-level ``video_related_data``),
       mean ``playtime`` and span (max - min) of ``date``
    3. user portrait -> every column after the first is copied unchanged
    4. test table    -> its ``end_date`` column is copied positionally

    Reads the globals ``df`` and ``video_related_data``, mutates ``df`` in
    place and returns ``None``.
    """
    users = df.index

    # The label column does not depend on the user, so it is written once
    # here instead of being re-assigned on every pass of the user loop.
    if len(tables) > 3:
        df['end_date'] = np.array(tables[3]['end_date']).ravel()

    # Discard rows of users that are not in df before looping, so that each
    # per-user filter scans far less data. The rows selected for any given
    # user are exactly the same as when filtering the full table.
    scoped = [tbl.loc[tbl['user_id'].isin(users)] for tbl in tables[:3]]

    for user in users:
        for position, tbl in enumerate(scoped, start=1):
            rows = tbl.loc[tbl['user_id'] == user]
            if position == 1:
                df.loc[user, 'launch_type'] = rows['launch_type'].mean()
                # Series.max()/min() yield NaN for a user without any rows,
                # where the built-in max()/min() would raise ValueError.
                df.loc[user, 'launch_date'] = rows['date'].max() - rows['date'].min()
            elif position == 2:
                # Look up all videos this user played in a single pass. The
                # previous per-item loop overwrote the same two cells on every
                # iteration, so only the last played item was kept.
                # dropna() matters: isin() would otherwise match a missing
                # item_id against the videos whose item_id is missing too.
                played = rows['item_id'].dropna()
                videos = video_related_data.loc[
                    video_related_data['item_id'].isin(played)
                ]
                df.loc[user, 'playback_item_duration'] = videos['duration'].mean()
                df.loc[user, 'playback_item_father_id'] = videos['father_id'].mean()
                df.loc[user, 'playtime'] = rows['playtime'].mean()
                df.loc[user, 'playback_date'] = rows['date'].max() - rows['date'].min()
            else:
                values = np.array(rows).ravel()
                # zip() stops at the end of the column list, so only the first
                # matching row is used; a user without a portrait row yields
                # no values and the cells keep their initial NaN.
                for name, value in zip(tbl.columns[1:], values[1:]):
                    df.loc[user, name] = value
    return None
# Fill df from the launch, playback, portrait and test tables. The order matters:
# selectFeatures decides what to do with a table from its position in the list.
selectFeatures([app_launch_logs, user_playback_data, user_portrait_data, test_a])
# Persist the raw feature table.
df.to_csv('features1.csv')

第三步:预处理特征数据集

# Standardize every non-object column to zero mean and unit standard deviation.
# NOTE(review): df was created empty and then filled cell by cell, so most feature
# columns are presumably still object dtype here and therefore skipped by this
# selection and one-hot encoded below instead -- confirm with df.dtypes.
numeric_df = df.dtypes[df.dtypes != 'object'].index
df[numeric_df] = df[numeric_df].apply(lambda x: (x - x.mean()) / (x.std()))
# After standardization every feature has mean 0, so missing values can simply be filled with 0
df[numeric_df] = df[numeric_df].fillna(0)
# Turn discrete values into indicator features (dummy_na=True treats a missing value as a valid category and creates an indicator for it as well)
df = pd.get_dummies(df, dummy_na=True)
df.shape
(15001, 18498)
df.to_csv('features2.csv')

第四步:数据集降维

import pandas as pd
import numpy as np
# 导入数据
data = pd.read_csv('features2.csv', index_col=0)
data.shape
(15001, 18498)
# Choose a suitable number of dimensions
from sklearn.decomposition import PCA
# A float n_components asks PCA to keep as many components as are needed to explain
# that fraction of the variance (0.99 here). Note that `pca` is bound to the
# transformed matrix returned by fit_transform, not to the fitted estimator.
pca = PCA(n_components=0.99).fit_transform(data)
pca.shape
(15001, 12469)
pd.DataFrame(pca).to_csv('pcaFeatures.csv')

第五步:训练模型

# Prepare the dataset (X, y are used as both the training data and the test data)
import numpy as np
import pandas as pd
sample = pd.read_csv('sample-a.csv', header=None)
# Targets: the second column of sample-a.csv
y = np.array(sample.iloc[:,1])
# Features: the PCA output saved earlier as pcaFeatures.csv
data = pd.read_csv('pcaFeatures.csv', index_col=0)
pca = np.array(data)
import joblib
X = pca
# XGBoost regression (the estimator is built with no arguments, so no tuning happens here)
import xgboost as xgb
model_xgboost = xgb.XGBRegressor()
model_xgboost.fit(X, y)
joblib.dump(model_xgboost, 'model_xgboost.m')
# The remaining regressors are each fitted on the same X, y and saved to their own file.
from sklearn.ensemble import RandomForestRegressor
model_randomForestReg = RandomForestRegressor()
model_randomForestReg.fit(X, y)
joblib.dump(model_randomForestReg, 'model_randomForestReg.m')
from sklearn.ensemble import ExtraTreesRegressor
model_extraTreeReg = ExtraTreesRegressor()
model_extraTreeReg.fit(X, y)
joblib.dump(model_extraTreeReg, 'model_extraTreeReg.m')
from sklearn.tree import DecisionTreeRegressor
model_decisionTreeReg = DecisionTreeRegressor()
model_decisionTreeReg.fit(X, y)
joblib.dump(model_decisionTreeReg, 'model_decisionTreeReg.m')
from sklearn.neighbors import KNeighborsRegressor
model_knn = KNeighborsRegressor(weights="uniform")
model_knn.fit(X, y)
joblib.dump(model_knn, 'model_knn.m')

第六步:评价模型的好坏

# Imports needed to reload the PCA features
import pandas as pd
import numpy as np
import joblib
# Load the data: the PCA-reduced feature matrix stored in pcaFeatures.csv
data = pd.read_csv('pcaFeatures.csv', index_col=0)
pca = np.array(data)
# 查看数据维数
pca.shape
(15001, 12469)
# Ground-truth regression labels (second column of sample-a.csv)
sample = pd.read_csv('sample-a.csv', header=None)
y = np.array(sample.iloc[:,1])
# Load a model: XGBoost is the initial choice (RandomForestRegressor being the other candidate)
model = joblib.load('model_xgboost.m')
# Evaluation metric
def gain(real, predict, span=7):
    """Return the competition score ``100 * (1 - mean(|predict - real| / span))``.

    Args:
        real: True target values; any array-like (list, tuple, ndarray).
        predict: Predicted values, same length as ``real``.
        span: Divisor applied to each absolute error; defaults to the 7 used
            by the metric formula.

    Returns:
        The score as a float. It equals 100 when every prediction is exact.
    """
    # Coerce both inputs so that plain lists work as well as arrays.
    errors = np.asarray(predict, dtype=float) - np.asarray(real, dtype=float)
    return 100 * (1 - (np.abs(errors) / span).mean())

$$f(F, A) = 100 \cdot \left(1 - \frac{1}{n} \sum_{t=1}^{n} \left|\frac{F_t - A_t}{7}\right|\right)$$

pred = model.predict(pca)
gain(y, pred)
92.28824197045672
# Post-process: clamp the predictions in place to the range [0, 7]
np.clip(pred, 0, 7, out=pred)
gain(y, pred)
92.28891224955667
from sklearn.metrics import r2_score
print(f'R Squared is: {r2_score(y, pred)}')

R Squared is: 0.8847543294909925

可以看到决定系数很高,但需要注意:这里用于评估的数据就是训练数据本身,所以该结果只能说明模型对训练集拟合得比较好,并不能直接说明模型的泛化(预测)效果。

第七步:处理结果文件

# Write the submission file
# Reuse the layout of the sample file and replace its second column with the
# predictions rounded to two decimals.
result = sample.copy()
result.iloc[:,1] = np.around(pred, decimals=2)
# index=0 and header=None are meant to leave out the index column and the header row.
pd.DataFrame(result).to_csv('submission_xgb.csv', index=0, header=None)
  • 5
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 22
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 22
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

DeeGLMath

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值