文章目录
# Mount Google Drive so the competition dataset stored there is reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Work inside the project directory that holds the churn-prediction data.
os.chdir('/content/drive/MyDrive/chinese task/讯飞-电信用户流失')
Mounted at /content/drive
参考:
- 《2021科大讯飞-车辆贷款违约预测挑战赛 Top1方案》
- 《数据挖掘-租金预测》
- 《WSDM-爱奇艺:用户留存预测挑战赛 线上0.865》
- 《微信大数据挑战赛 亚军方案——如何用baseline上724+》
- 《用户购买预测比赛第十名方案》
!pip install unzip
!unzip '/content/drive/MyDrive/chinese task/讯飞-电信用户流失/电信客户流失预测挑战赛数据集.zip'n
读取数据集:
import pandas as pd
# Load the competition data: labelled training set and unlabelled test set.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
train
客户ID | 地理区域 | 是否双频 | 是否翻新机 | 当前手机价格 | 手机网络功能 | 婚姻状况 | 家庭成人人数 | 信息库匹配 | 预计收入 | ... | 客户生命周期内平均月费用 | 客户生命周期内的平均每月使用分钟数 | 客户整个生命周期内的平均每月通话次数 | 过去三个月的平均每月使用分钟数 | 过去三个月的平均每月通话次数 | 过去三个月的平均月费用 | 过去六个月的平均每月使用分钟数 | 过去六个月的平均每月通话次数 | 过去六个月的平均月费用 | 是否流失 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 7 | 0 | -1 | 181 | 0 | 2 | 0 | 0 | 3 | ... | 24 | 286 | 91 | 351 | 121 | 23 | 303 | 101 | 25 | 0 |
1 | 1 | 13 | 1 | 0 | 1399 | 0 | 3 | 0 | 0 | 0 | ... | 44 | 447 | 190 | 483 | 199 | 40 | 488 | 202 | 44 | 1 |
2 | 2 | 14 | 1 | 0 | 927 | 0 | 2 | 4 | 0 | 6 | ... | 48 | 183 | 79 | 271 | 95 | 71 | 209 | 77 | 54 | 0 |
3 | 3 | 1 | 0 | 0 | 232 | 0 | 3 | -1 | 1 | -1 | ... | 42 | 303 | 166 | 473 | 226 | 72 | 446 | 219 | 65 | 1 |
4 | 4 | 0 | -1 | 0 | 699 | 0 | 1 | 2 | 0 | 3 | ... | 36 | 119 | 24 | 88 | 15 | 35 | 106 | 21 | 37 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
149995 | 149995 | 10 | 1 | 0 | 1350 | 0 | 3 | 0 | 0 | 0 | ... | 156 | 474 | 160 | 239 | 80 | 74 | 346 | 122 | 83 | 1 |
149996 | 149996 | 6 | 1 | 0 | 542 | 0 | 3 | -1 | 1 | -1 | ... | 52 | 968 | 208 | 1158 | 257 | 58 | 1307 | 261 | 57 | 0 |
149997 | 149997 | 15 | 1 | 0 | 1300 | 0 | 1 | 2 | 0 | 6 | ... | 39 | 504 | 205 | 544 | 203 | 45 | 531 | 205 | 47 | 1 |
149998 | 149998 | 12 | 1 | 0 | 1399 | 0 | 4 | 1 | 0 | -1 | ... | 91 | 685 | 249 | 233 | 140 | 94 | 432 | 236 | 97 | 1 |
149999 | 149999 | 10 | 1 | 0 | 1049 | 0 | 3 | -1 | 1 | -1 | ... | 37 | 177 | 80 | 147 | 59 | 35 | 167 | 74 | 34 | 0 |
150000 rows × 69 columns
一、查看各字段中分布情况
# Positive / negative sample counts for the target column.
train['是否流失'].value_counts()
# Summarise missing values across all columns.
missing_counts = train.isnull().sum().to_frame(name='count_null')
missing_counts.describe()
#查看各字段数据类型
# Print each column's dtype and distinct-value count (helps split categorical
# from numeric features). The original f-string was broken across lines by the
# notebook export and is invalid syntax; rewritten on a single line.
for col in train.columns:
    print(f'{col} \t {train.dtypes[col]} {train[col].nunique()}')
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline
import warnings
warnings.simplefilter('ignore', UserWarning)
import gc
gc.enable()
import time
1.2 使用pandas_profiling自动分析数据
参考:
conda install -c conda-forge pandas-profiling
#!pip install -U pandas-profiling[notebook]
#安装之后要重启kernal
# Build an automatic EDA report with pandas-profiling and export it to HTML.
import pandas as pd
import pandas_profiling
data = pd.read_csv('./train.csv')
profile = data.profile_report(title='Pandas Profiling Report')
profile.to_file(output_file="telecom_customers_pandas_profiling.html")
查看Pandas Profiling Report发现:
- 类别特征:‘地理区域’,‘是否双频’,‘是否翻新机’,‘手机网络功能’,‘婚姻状况’,‘家庭成人人数’,‘信息库匹配’,‘信用卡指示器’,‘新手机用户’,‘账户消费限额’
- 分箱特征有:‘预计收入’,
- 异常值特征:‘家庭中唯一订阅者的数量’,‘家庭活跃用户数’,
- 无用(数据不平衡)特征:‘平均呼叫转移呼叫数’,‘平均丢弃数据呼叫数’,[149797,148912]
#区分数值特征和类别特征
# Partition columns into categorical vs numeric features.
features = list(train.columns)
categorical_features = ['地理区域', '是否双频', '是否翻新机', '手机网络功能', '婚姻状况', '预计收入',
                        '家庭成人人数', '信息库匹配', '信用卡指示器', '新手机用户', '账户消费限额']
# Numeric = everything that is neither categorical nor the ID/target columns.
numeric_features = [c for c in features
                    if c not in categorical_features and c not in ('客户ID', '是否流失')]
# Low-cardinality vs higher-cardinality categorical columns.
categorical_features1 = ['是否双频', '是否翻新机', '手机网络功能', '信息库匹配', '信用卡指示器', '新手机用户', '账户消费限额']
categorical_features2 = ['地理区域', '婚姻状况', '预计收入', '家庭成人人数']
#处理几个异常值
#train[train['家庭中唯一订阅者的数量'].values > 13]=14
#通过查看Pandas Profiling Report,发现以下列类别不平衡,打印出来看看情况
#还有一些异常值暂时没处理
# Columns flagged as heavily imbalanced by the profiling report; print their
# value distributions for a closer look (outliers not handled here yet).
cols = ['家庭中唯一订阅者的数量', '家庭活跃用户数', '数据超载的平均费用', '平均漫游呼叫数', '平均丢弃数据呼叫数', '平均占线数据调用次数',
        '未应答数据呼叫的平均次数', '尝试数据调用的平均数', '完成数据调用的平均数', '平均三通电话数', '平均峰值数据调用次数',
        '非高峰数据呼叫的平均数量', '平均呼叫转移呼叫数']
for col in cols:
    print(train[col].value_counts())
- lr=0.2时roc=0.84479;lr=0.3时roc=0.8379;lr=0.15时roc=0.84578
- 'num_leaves'由30改为45时,roc=0.8468
这样调没啥用啊
#以下特征99.5%都是一种数值,可以考虑删掉。[149797,149493,149218,148912]
#lr=0.2时roc=0.84479
# Drop near-constant columns (>=99.5% a single value) from both splits.
null_clos = ['平均呼叫转移呼叫数', '平均占线数据调用次数', '未应答数据呼叫的平均次数', '平均丢弃数据呼叫数']
train.drop(columns=null_clos, inplace=True)
test.drop(columns=null_clos, inplace=True)
train
二、 使用baseline参数训练
- 全部特征跑10931轮,valid_acc=0.84298
- null importance跑5000轮:
- 选取split_feats大于0的特征(43种)可跑14402轮,valid_acc=0.83887
- 选取feats大于0的特征(23种)可跑10946轮,valid_acc=0.8193
- null importance跑1000轮:
- 选取split_feats大于0的特征(66种)可跑11817轮,valid_acc=0.84417
- 选取feats大于0的特征(58种)可跑11725轮,valid_acc=0.84345
from sklearn.model_selection import train_test_split
# Hold out 20% of the training data for validation; drop ID and target from X.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(labels=['客户ID', '是否流失'], axis=1),
    train['是否流失'],
    random_state=10,
    test_size=0.2,
)
# Baseline LightGBM training on the held-out split.
# (The stray, unused `imp_df = pd.DataFrame()` that preceded this cell has
# been removed — it is re-created inside get_feature_importances.)
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False, silent=True)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False,
                       silent=True)
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'min_child_weight': 5,
    'num_leaves': 2 ** 5,
    'lambda_l2': 10,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'learning_rate': 0.2,
    'seed': 2022,
    'n_jobs': -1}
# Train for up to 50000 rounds, reporting AUC every 300 rounds and stopping
# early after 200 rounds without improvement.  (The old comment said 5000
# rounds, which contradicted num_boost_round=50000.)
clf = lgb.train(params=lgb_params, train_set=lgb_train, valid_sets=lgb_eval,
                num_boost_round=50000, verbose_eval=300, early_stopping_rounds=200)
# Predict once and reuse: AUC on probabilities, accuracy on the 0.5 threshold.
pred_proba = clf.predict(X_test)
roc = roc_auc_score(y_test, pred_proba)
y_pred = [1 if p > 0.5 else 0 for p in pred_proba]
acc = accuracy_score(y_test, y_pred)
Training until validation scores don't improve for 200 rounds.
[300] valid_0's auc: 0.733101
[600] valid_0's auc: 0.754127
[900] valid_0's auc: 0.766728
[1200] valid_0's auc: 0.777367
[1500] valid_0's auc: 0.78594
[1800] valid_0's auc: 0.792209
[2100] valid_0's auc: 0.798424
[2400] valid_0's auc: 0.80417
[2700] valid_0's auc: 0.808074
[3000] valid_0's auc: 0.811665
[3300] valid_0's auc: 0.814679
[3600] valid_0's auc: 0.817462
[3900] valid_0's auc: 0.820151
[4200] valid_0's auc: 0.822135
[4500] valid_0's auc: 0.824544
[4800] valid_0's auc: 0.825994
Did not meet early stopping. Best iteration is:
[4994] valid_0's auc: 0.826797
roc,acc
(0.8267972007033084, 0.7533)
三、Null Importances进行特征选择
def get_feature_importances(X_train, X_test, y_train, y_test, shuffle, seed=None):
    """Train one LightGBM model and return its per-feature importances.

    With ``shuffle=True`` the targets are permuted first, producing one
    "null importance" run; with ``shuffle=False`` the actual importances
    against the real target are measured.  Returns a DataFrame with
    gain/split importances and the validation AUC of the fitted model.
    """
    feature_names = list(X_train.columns)
    y_train, y_test = y_train.copy(), y_test.copy()
    if shuffle:
        # Permute labels to break any real feature/target relationship.
        y_train = y_train.copy().sample(frac=1.0)
        y_test = y_test.copy().sample(frac=1.0)
    dtrain = lgb.Dataset(X_train, y_train, free_raw_data=False, silent=True)
    dvalid = lgb.Dataset(X_test, y_test, reference=dtrain, free_raw_data=False, silent=True)
    # Same hyper-parameters as the baseline model above.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 10,
        'learning_rate': 0.2,
        'seed': 2022,
        'n_jobs': -1}
    # Short training run: 500 rounds max, early stop after 30 stale rounds.
    booster = lgb.train(params=params, train_set=dtrain, valid_sets=dvalid,
                        num_boost_round=500, verbose_eval=50, early_stopping_rounds=30)
    # Collect both importance flavours plus the model's validation AUC.
    result = pd.DataFrame({
        'feature': feature_names,
        'importance_gain': booster.feature_importance(importance_type='gain'),
        'importance_split': booster.feature_importance(importance_type='split'),
    })
    result['trn_score'] = roc_auc_score(y_test, booster.predict(X_test))
    return result
np.random.seed(123)
# Measure the actual feature importances, i.e. without shuffling the target.
actual_imp_df = get_feature_importances(X_train, X_test, y_train, y_test, shuffle=False)
actual_imp_df
Training until validation scores don't improve for 20 rounds.
[30] valid_0's auc: 0.695549
[60] valid_0's auc: 0.704629
[90] valid_0's auc: 0.711638
[120] valid_0's auc: 0.715182
[150] valid_0's auc: 0.718961
[180] valid_0's auc: 0.722121
[210] valid_0's auc: 0.725615
[240] valid_0's auc: 0.728251
[270] valid_0's auc: 0.730962
[300] valid_0's auc: 0.733101
[330] valid_0's auc: 0.73578
[360] valid_0's auc: 0.73886
[390] valid_0's auc: 0.741238
[420] valid_0's auc: 0.742486
[450] valid_0's auc: 0.744295
[480] valid_0's auc: 0.746555
Did not meet early stopping. Best iteration is:
[495] valid_0's auc: 0.747792
feature | importance_gain | importance_split | trn_score | |
---|---|---|---|---|
0 | 地理区域 | 1956.600422 | 313 | 0.747792 |
1 | 是否双频 | 442.401141 | 62 | 0.747792 |
2 | 是否翻新机 | 269.466828 | 26 | 0.747792 |
3 | 当前手机价格 | 3838.696197 | 365 | 0.747792 |
4 | 手机网络功能 | 750.396258 | 51 | 0.747792 |
... | ... | ... | ... | ... |
62 | 过去三个月的平均每月通话次数 | 2540.721027 | 325 | 0.747792 |
63 | 过去三个月的平均月费用 | 2098.813867 | 304 | 0.747792 |
64 | 过去六个月的平均每月使用分钟数 | 2375.925741 | 337 | 0.747792 |
65 | 过去六个月的平均每月通话次数 | 2541.735172 | 346 | 0.747792 |
66 | 过去六个月的平均月费用 | 2103.062207 | 313 | 0.747792 |
67 rows × 4 columns
<svg xmlns=“http://www.w3.org/2000/svg” height="24px"viewBox=“0 0 24 24”
width=“24px”>
# Accumulate null-importance distributions over several shuffled-target runs.
null_imp_df = pd.DataFrame()
nb_runs = 10
import time
start = time.time()
dsp = ''
for run_idx in range(nb_runs):
    # One run: feature importances measured against a shuffled target.
    run_df = get_feature_importances(X_train, X_test, y_train, y_test, shuffle=True)
    run_df['run'] = run_idx + 1
    null_imp_df = pd.concat([null_imp_df, run_df], axis=0)
    # Erase the previous status line with backspaces...
    print('\b' * len(dsp), end='', flush=True)
    # ...then display current run number and elapsed minutes.
    spent = (time.time() - start) / 60
    dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (run_idx + 1, nb_runs, spent)
    print(dsp, end='', flush=True)
null_imp_df
feature | importance_gain | importance_split | trn_score | run | |
---|---|---|---|---|---|
0 | 地理区域 | 38.622730 | 5 | 0.505320 | 1 |
1 | 是否双频 | 0.000000 | 0 | 0.505320 | 1 |
2 | 是否翻新机 | 0.000000 | 0 | 0.505320 | 1 |
3 | 当前手机价格 | 30.980300 | 4 | 0.505320 | 1 |
4 | 手机网络功能 | 0.000000 | 0 | 0.505320 | 1 |
... | ... | ... | ... | ... | ... |
62 | 过去三个月的平均每月通话次数 | 109.945481 | 14 | 0.503911 | 10 |
63 | 过去三个月的平均月费用 | 35.344621 | 4 | 0.503911 | 10 |
64 | 过去六个月的平均每月使用分钟数 | 55.200380 | 7 | 0.503911 | 10 |
65 | 过去六个月的平均每月通话次数 | 53.439080 | 6 | 0.503911 | 10 |
66 | 过去六个月的平均月费用 | 47.455200 | 6 | 0.503911 | 10 |
670 rows × 5 columns
<svg xmlns=“http://www.w3.org/2000/svg” height="24px"viewBox=“0 0 24 24”
width=“24px”>
def display_distributions(actual_imp_df_, null_imp_df_, feature_):
plt.figure(figsize=