思路分享
该比赛的数据主要围绕着用户和广告商的不同特征维度进行展开,特征工程的难点在于数据清洗部分,比如osv,version等字段,需要通过正则手段取出关键信息。除此之外,时间维度的展开存在多个共线性,需要斟酌拿取适合的时间信息。最后,target encoding是个万能的提分手段,分组聚合不同类别特征的label信息,总结出不同特征的欺诈分布。
模型
- catboost
- xgboost
- lightgbm(后面发现LGB的效果并没有XGB好,所以没采用)
- K折交叉验证(代码中 NUM_FLODS = 6)+ 模型平均融合
代码展示
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gc
# Read the train and test sets.
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/百度反欺诈预测/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/百度反欺诈预测/test1.csv')
# Drop the leading unnamed index column written into the CSVs.
train_data = train_data.iloc[:,1:]
test_data = test_data.iloc[:,1:]
# Inspect the frame (notebook display expression): there are quite a few
# nulls, and some columns hold a single unique value.
train_data
# Concatenate train and test so all feature processing below is done once
# on both; test rows have NaN in `label`.
data = pd.concat([train_data,test_data],axis = 0)
# Print dtypes: several object columns need cleaning.
data.info()
# Missing-value counts per column: `lan` and `osv` have many nulls.
data.isnull().sum()
# Clean the `osv` (OS version) field.
def osv_trans(x):
    """Normalise a raw OS-version value to a float.

    Strips 'Android_'/'Android '/'W' prefixes, then collapses multi-part
    versions like '4.4.4' (optionally followed by ' build' or '-suffix')
    into a single float such as 4.44.  Returns 0 for unparseable values
    (the raw value is printed for later inspection).
    """
    x = str(x).replace('Android_', '').replace('Android ', '').replace('W', '')
    if x.find('.') > 0:
        dot = x.find('.')
        # End of the numeric part: first space if any, else end of string;
        # a '-' (e.g. '9.0-beta') overrides either.
        if x.find(' ') > 0:
            end = x.find(' ')
        else:
            end = len(x)
        if x.find('-') > 0:
            end = x.find('-')
        # Keep the major part, then glue the remaining digits after a single
        # dot ('4.4.4' -> '4.44').
        result = x[0:dot] + '.' + x[dot + 1:end].replace('.', '')
        try:
            return float(result)
        except ValueError:  # narrowed from a bare except: float(str) only raises ValueError
            print(x + '########')
            return 0
    try:
        return float(x)
    except ValueError:
        print(x + '########')
        return 0
# Fill missing osv with the most frequent value, then normalise.
# NOTE: assign the result back instead of calling fillna(inplace=True) on the
# column slice — in recent pandas that can silently operate on a copy.
data['osv'] = data['osv'].fillna(data['osv'].mode()[0]).apply(osv_trans)
# Clean the `version` field.
def version_trans(x):
    """Extract the first run of digits from a raw version value.

    Returns the digits as a string so the caller can cast the whole column
    with ``astype(int)``.  Returns '0' when the value contains no digit
    (the previous ``re.findall(...)[0]`` raised IndexError in that case).
    """
    import re
    m = re.search(r'\d+', str(x))  # raw string: '\d' in a plain literal is a DeprecationWarning
    return m.group(0) if m else '0'
# Apply version cleaning, then cast the whole column to int.
data['version'] = data['version'].apply(version_trans).astype(int)
# Use the string lengths of fea_hash / fea1_hash as new features.
# str() guards against non-string dtypes (previously fea_hash used bare
# len(x) while fea1_hash used len(str(x)) — inconsistent, and len(x) would
# raise TypeError on a numeric column).
data['fea_hash_len'] = data['fea_hash'].apply(lambda x: len(str(x)))
data['fea1_hash_len'] = data['fea1_hash'].apply(lambda x: len(str(x)))
# Treat hashes longer than 16 characters as outliers -> 0, otherwise int.
data['fea_hash'] = data['fea_hash'].apply(lambda x: 0 if len(str(x)) > 16 else int(x))
data['fea1_hash'] = data['fea1_hash'].apply(lambda x: 0 if len(str(x)) > 16 else int(x))
# Clean up the timestamp field.
from datetime import datetime
# Expand the timestamp into multiple granularities; weekday/hour/minute were
# found to hurt the score slightly, so they are left disabled below.
data['timestamp'] = data['timestamp'].apply(lambda x:datetime.fromtimestamp(x/1000))  # ms epoch -> datetime (local tz)
data['day'] = data['timestamp'].dt.day
# data['week_day'] = data['timestamp'].dt.weekday
# data['hour'] = data['timestamp'].dt.hour
# data['minute'] = data['timestamp'].dt.minute
# Time-diff from the earliest record also did not help, so it is disabled.
# start = data['timestamp'].min()
# data['time_diff'] = data['timestamp'] - start
# data['time_diff'] = data['time_diff'].dt.days + data['time_diff'].dt.seconds/3600/24
# Derived device features built from the raw hardware fields.
data['dev_area'] = data['dev_height'] * data['dev_width']
# NOTE(review): pixel diagonal scaled by /2.54/1000 — units look arbitrary
# (a physical size would divide by dev_ppi); kept as-is, it is only a feature.
data['size'] = (np.sqrt(data['dev_height']**2 + data['dev_width'] ** 2) / 2.54) / 1000
data['ratio'] = data['dev_height'] / data['dev_width']
data['px'] = data['dev_ppi'] * data['size']
# Target encoding: per-category count / sum / mean of the label, computed on
# the labelled (train) rows only.  Not every categorical column benefits, so
# this list was hand-picked.
# NOTE(review): statistics use the whole train set, so each CV fold's
# validation rows leak their own labels into these features — TODO: consider
# out-of-fold encoding.
cols = ['package','location','cus_type','ntt','carrier','osv','apptype']
for col in tqdm(cols):
    # Named aggregation: the dict-of-renames form ({'new_name': 'func'}) on a
    # SeriesGroupBy was removed in pandas 1.0 (raises SpecificationError).
    tmp = data[data['label'].notnull()].groupby(col, as_index=False)['label'].agg(
        **{col + '_label_count': 'count',
           col + '_label_sum': 'sum',
           col + '_label_mean': 'mean'})
    data = data.merge(tmp, how='left', on=col)
# Final feature list: drop identifiers, the raw timestamp, and unused columns.
all_cols = [i for i in data.columns if i not in ['timestamp','year','month','sid','os','label']]
# Categorical feature names handed to CatBoost for its native category
# handling / feature combinations (same set is label-encoded below).
cat_cols = [
    'android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
    'dev_width', 'lan', 'media_id', 'ntt', 'osv', 'package',
    'version', 'fea_hash', 'location', 'fea1_hash', 'cus_type',
    'fea_hash_len', 'fea1_hash_len', 'day', 'dev_area', 'size',
    'ratio', 'px',
]
# Label-encode every categorical column so the trees consume integer codes;
# encoding the string form (astype(str)) also covers NaN values.
from sklearn.preprocessing import LabelEncoder
for feat in cat_cols:
    data[feat] = LabelEncoder().fit_transform(data[feat].astype(str))
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb
from sklearn.model_selection import KFold
# Number of CV folds.  NOTE(review): value is 6 although the write-up and the
# output filename say "five-fold"; the name is also a typo for NUM_FOLDS.
NUM_FLODS = 6
# NOTE(review): reduce_mem_usage is not defined in this file — presumably a
# dtype-downcasting helper from another notebook cell; confirm before running.
data = reduce_mem_usage(data)
# Split the concatenated frame back into train / test by row count.
train_data = data.iloc[:train_data.shape[0],:]
test_data = data.iloc[train_data.shape[0]:,:]
# K-fold CV: one row of test-set predictions per fold, per model.
final_res_list_clf_2 = np.zeros((NUM_FLODS,test_data.shape[0]))
final_res_list_clf_3 = np.zeros((NUM_FLODS,test_data.shape[0]))
kf = KFold(n_splits=NUM_FLODS)
for i, (train_index, val_index) in enumerate(kf.split(train_data)):
    # Fold split: features restricted to all_cols; the label is cast back to
    # int (it became float when train/test were concatenated with NaN labels).
    X_train, y_train = train_data[all_cols].iloc[train_index], train_data['label'].iloc[train_index].astype(int)
    X_val, y_val = train_data[all_cols].iloc[val_index], train_data['label'].iloc[val_index].astype(int)
    # (Removed dead locals res_list / res — they were assigned but never used.)
    # XGBoost classifier (GPU histogram algorithm).
    clf_2 = xgb.XGBClassifier(
        max_depth=15,                  # tree depth
        learning_rate=0.005,           # learning rate
        n_estimators=2000,             # number of boosting rounds
        objective='binary:logistic',   # objective
        tree_method='gpu_hist',        # GPU mode
        subsample=0.8,                 # row sampling
        colsample_bytree=0.8,          # column sampling
        random_state=2021,             # random seed
        min_child_samples=3,           # NOTE(review): LightGBM name; XGBoost expects min_child_weight — likely ignored, confirm
        reg_lambda=0.5,                # L2 regularisation
        gpu_hist='gpu',                # NOTE(review): not an XGBoost parameter; tree_method above already selects GPU
        eval_metric='logloss',
    )
    # CatBoost with native categorical handling (GPU).
    clf_3 = ctb.CatBoostClassifier(iterations=1100, learning_rate=0.05, max_depth=11,
                                   l2_leaf_reg=1, task_type='GPU',
                                   cat_features=cat_cols, random_seed=2021,
                                   eval_metric='Logloss')
    clf_2.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=500, early_stopping_rounds=10)
    clf_3.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=500, early_stopping_rounds=10)
    # Predict the positive-class (fraud) probability on the test set.
    y_pred_test_clf_2 = clf_2.predict_proba(test_data[all_cols])[:, 1]
    y_pred_test_clf_3 = clf_3.predict_proba(test_data[all_cols])[:, 1]
    # Store each model's prediction for this fold.
    final_res_list_clf_2[i] = y_pred_test_clf_2
    final_res_list_clf_3[i] = y_pred_test_clf_3
    # Free fold-local objects to keep host/GPU memory in check.
    del X_train, X_val, y_train, y_val, clf_2, clf_3
    gc.collect()
    print('=' * 100)
# Average each model's predictions over all folds.
final_res_clf_2 = final_res_list_clf_2.mean(axis = 0)
final_res_clf_3 = final_res_list_clf_3.mean(axis = 0)
# Blend the two models with equal weight.
final_res = (final_res_clf_2 + final_res_clf_3)/2
# .copy() so the column assignment below writes to a real frame, not a view
# of test_data (avoids SettingWithCopyWarning / silent no-op).
result = test_data[['sid']].copy()
# Assign the 1-D probability vector directly (the old reshape(-1,1) was a
# no-op for column assignment), then binarise at 0.5.
result['label'] = final_res
result['label'] = (result['label'] > 0.5).astype(int)
result.to_csv('/content/drive/MyDrive/Colab Notebooks/百度反欺诈预测/五折+模型融合.csv',index = False)