方案
该开源方案在初赛A榜的成绩为0.948,初赛B榜的成绩为0.940。本方案仅供大家开阔思路,希望能对大家有所帮助。
任务描述
防羊毛党评分模型旨在从普通用户中区分出羊毛党用户号码。
开源代码
这里以初赛A榜的处理为例
第一步是处理一下数据格式问题
import pandas as pd
import numpy as np
# Step 1: load the raw inputs.
# data_a.csv holds the per-phone, per-month records; train_label.csv and
# to_pred_a.csv are presumably the training labels and the A-board phones to
# score (inferred from the file names -- confirm against the data description).
data = pd.read_csv("data/data_a.csv")
label = pd.read_csv("data/train_label.csv")
to_predict_a = pd.read_csv("data/to_pred_a.csv")

# Order the records by phone and month so each phone's months are adjacent.
data = data.sort_values(by=["phone", "month"]).reset_index(drop=True)

# The raw file uses the literal string '\N' for missing values; turn it into NaN.
data = data.replace(r"\N", np.nan)

# These columns are cast to float in one call once the '\N' markers are gone.
float_cols = [
    "if_family",
    "if_group",
    "gprs_fee",
    "overrun_flux_fee",
    "out_actvcall_dur",
    "actvcall_fee",
    "out_activcall_fee",
    "monfix_fee",
    "gift_acct_amt",
    "call_cnt",
    "up_flux",
    "down_flux",
    "sms_inpkg_ind",
    "p2psms_up_cnt",
    "p2psms_cmnct_fee",
    "p2psms_pkg_fee",
]
data = data.astype(dict.fromkeys(float_cols, float))
第二步的思想是将训练集1、2月份的数据横向拼接起来,将测试集3、4月份的数据也横向拼接起来,然后对列名重新命名
# Step 2: put each phone's two consecutive months side by side in one row.
# Months 202001/202002 form one pair and 202003/202004 the other; within a
# pair the earlier month's columns get the suffix "_b" and the later month's
# columns get "_f".


def _month_frame(frame, month, suffix):
    """Return the rows of one month with every feature column suffixed.

    Columns are renamed by name rather than by position, so the result is
    correct regardless of where "phone" and "month" sit in the input.

    Args:
        frame: table containing at least the "phone" and "month" columns.
        month: month value to keep (e.g. 202001).
        suffix: text appended to every column except "phone" and "month".

    Returns:
        A tuple ``(subset, feature_cols)``: the filtered, renamed frame and
        the list of suffixed feature column names in their original order.
    """
    key_cols = ["phone", "month"]
    subset = frame[frame["month"].isin([month])].reset_index(drop=True)
    renamed = {col: col + suffix for col in subset.columns if col not in key_cols}
    subset = subset.rename(columns=renamed)
    return subset, list(renamed.values())


data_1, col1 = _month_frame(data, 202001, "_b")
data_2, col2 = _month_frame(data, 202002, "_f")
data_3, col3 = _month_frame(data, 202003, "_b")
data_4, col4 = _month_frame(data, 202004, "_f")

# Left joins: only phones present in the earlier month of a pair are kept;
# a phone missing from the later month gets NaN in all "_f" columns.
data_1_2 = (
    data_1[["phone"] + col1]
    .merge(data_2[["phone"] + col2], on="phone", how="left")
    .reset_index(drop=True)
)
data_3_4 = (
    data_3[["phone"] + col3]
    .merge(data_4[["phone"] + col4], on="phone", how="left")
    .reset_index(drop=True)
)

# Stack both month pairs, attach the labels (phones without a label keep NaN)
# and persist the result for the modelling step.
data = pd.concat([data_1_2, data_3_4], axis=0).reset_index(drop=True)
data = data.merge(label, on="phone", how="left")
data.to_csv("data/merge_data.csv", index=False)
第三步是建模过程,主要特征包括:缺失率、目标编码、连续特征在不同月份之间的差值与变化率,以及离散特征和连续特征在两个月之间是否发生变化
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
# Step 3a: feature engineering on the merged table written by step 2.
data_1 = pd.read_csv("data/merge_data.csv")

# Fraction of missing cells per row. Both shape[1] and count() are evaluated
# before the new column is attached, so 'missing_rate' itself is not counted.
data_1['missing_rate'] = (data_1.shape[1] - data_1.count(axis=1)) / data_1.shape[1]

# Columns treated as categorical flags; everything else except the id and the
# target is treated as dense. dense_cols is fixed here, so it contains
# 'missing_rate' but none of the features derived below.
cate_cols = ['if_group_b', 'if_family_b', 'sms_inpkg_ind_b',
             'if_group_f', 'if_family_f', 'sms_inpkg_ind_f']
dense_cols = [col for col in data_1.columns if col not in cate_cols + ["phone", "label"]]

# Base names of the numeric features for which month-over-month change is built.
dense = ['chrg_cnt', 'chrg_amt', 'gprs_fee', 'overrun_flux_fee', 'out_actvcall_dur', 'actvcall_fee',
         'out_activcall_fee', 'monfix_fee', 'gift_acct_amt', 'call_cnt', 'up_flux', 'down_flux',
         'p2psms_up_cnt', 'p2psms_cmnct_fee', 'p2psms_pkg_fee']

for col in dense:
    delta = data_1[col + "_f"] - data_1[col + "_b"]
    data_1[col + "_diff"] = delta
    # 1e-7 keeps the denominator away from an exact zero.
    data_1[col + "_ratio"] = delta / (data_1[col + "_b"] + 1e-7)

# NOTE: despite the name, '<col>_ifchange' is 1 when both months hold the SAME
# value and 0 otherwise. NaN never compares equal, so rows where either month
# is missing get 0. The values are kept as-is so existing results stay
# reproducible. (The original ran this loop twice; once is enough.)
for col in ['if_group', 'if_family', 'sms_inpkg_ind'] + dense:
    data_1[col + "_ifchange"] = (data_1[col + "_b"] == data_1[col + "_f"]).astype(int)

# Rows without a label are the ones to predict; labelled rows are for training.
has_label = data_1["label"].notnull()
test_df = data_1[~has_label].copy().reset_index(drop=True)
train_df = data_1[has_label].copy().reset_index(drop=True)
from sklearn.model_selection import StratifiedKFold
# Step 3b: out-of-fold target (mean-label) encoding.
# For every column in enc_list, each training row is encoded with the mean
# label of its value computed on the OTHER folds; test rows receive the
# average of the per-fold encodings.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
enc_list = cate_cols + dense_cols

# Fallback for values not seen in a fold's training part (including NaN,
# which groupby drops). Loop-invariant, so it is computed once.
global_mean = train_df['label'].mean()

# With an integer random_state every split() call yields the same folds, so
# they are materialised once instead of being recomputed for each column.
fold_indices = list(skf.split(train_df, train_df['label']))

for f in tqdm(enc_list):
    enc_name = f + '_target_enc'
    # Float initialisation: the encodings are fractional, and writing floats
    # into an integer column via .loc is a deprecated implicit upcast.
    train_df[enc_name] = 0.0
    test_df[enc_name] = 0.0
    for trn_idx, val_idx in fold_indices:
        # Mean label per distinct value, from the training part of the fold.
        # (Passing a dict to SeriesGroupBy.agg to rename the result, as the
        # original did, raises SpecificationError on pandas >= 1.0.)
        fold_means = train_df[[f, 'label']].iloc[trn_idx].groupby(f)['label'].mean()

        val_enc = train_df[f].iloc[val_idx].map(fold_means).fillna(global_mean)
        test_enc = test_df[f].map(fold_means).fillna(global_mean)

        # train_df has a default RangeIndex, so positional fold indices are
        # also valid labels for .loc.
        train_df.loc[val_idx, enc_name] = val_enc.values
        test_df[enc_name] += test_enc.values / skf.n_splits

# Notebook-style display of the resulting shapes (no effect in a plain script).
train_df.shape, test_df.shape
import lightgbm as lgb
from tensorflow import keras
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import entropy
import tensorflow as tf
import gc
import os
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau,LambdaCallback
from sklearn.metrics import roc_auc_score
# Step 3c: choose the model inputs and build the train/test matrices.

# The identifier and the target are never used as features.
non_feature_cols = ["phone", "label"]

# Discard features with fewer than two distinct non-null values in the
# training rows.
remove_col = [
    col
    for col in train_df.columns
    if col not in non_feature_cols and train_df[col].nunique() < 2
]
feature_name = [
    col for col in train_df.columns if col not in non_feature_cols + remove_col
]
print(len(feature_name))

X_train = train_df[feature_name].reset_index(drop=True)
X_test = test_df[feature_name].reset_index(drop=True)
y = train_df['label'].reset_index(drop=True).astype(int)

# Accumulators for the seed-averaged out-of-fold and test predictions.
oof = np.zeros(X_train.shape[0])
prediction = np.zeros(X_test.shape[0])
# Step 3d: CatBoost with stratified K-fold CV, bagged over several seeds.
# For each seed the folds are reshuffled; out-of-fold and test probabilities
# are averaged over folds and then over seeds into `oof` and `prediction`.
seeds = [5, 4, 2, 1015, 2020, 2048, 1024]
num_model_seed = len(seeds)
n_folds = 5  # single source of truth for both the splitter and the averaging
cate_fea = []  # no columns are declared categorical to CatBoost

print("Catboost训练")
for seed in seeds:
    print('--' * 20 + str(seed) + '--' * 20)
    oof_cat = np.zeros(X_train.shape[0])
    prediction_cat = np.zeros(X_test.shape[0])
    skf = StratifiedKFold(n_splits=n_folds, random_state=seed, shuffle=True)
    for index, (train_index, valid_index) in enumerate(skf.split(X_train, y)):
        print(index)
        train_x, train_y = X_train.iloc[train_index], y.iloc[train_index]
        valid_x, valid_y = X_train.iloc[valid_index], y.iloc[valid_index]

        # task_type='GPU' presumably requires a GPU-enabled CatBoost install
        # -- confirm for the target environment.
        clf = CatBoostClassifier(
            iterations=10000,
            depth=8,
            learning_rate=0.1,
            loss_function='Logloss',
            cat_features=cate_fea,
            verbose=True,
            eval_metric='AUC',
            counter_calc_method='Full',
            task_type='GPU',
            metric_period=50,
        )
        # The held-out fold is used both for early stopping and for the
        # out-of-fold predictions below.
        clf.fit(
            train_x, train_y,
            eval_set=[(valid_x, valid_y)],
            early_stopping_rounds=200,
            verbose=True,
            use_best_model=True,
        )
        oof_cat[valid_index] = clf.predict_proba(valid_x)[:, 1]
        # Average the test predictions over the folds (was a hard-coded 5).
        prediction_cat += clf.predict_proba(X_test)[:, 1] / n_folds
        del clf
    oof += oof_cat / num_model_seed
    prediction += prediction_cat / num_model_seed
写在最后
主要的上分点:
- 特征筛选
- CatBoost比LightGBM要好
- 连续特征的不同月份的变化率特征
关于初赛B榜的处理,我个人的做法是将初赛B榜的数据与初赛A榜的数据合并到一起,其余处理与上述程序一致,只是在最后读取预测概率时,要注意取的是初赛B榜数据对应的结果。
欢迎关注公众号ChallengeHub!!!欢迎加入ChallengeHub学习交流群,微信群扫描下方管理员微信进入哦,qq群扫描下方qq二维码进入!!!