When we are still beginners, a competition often feels impossible to get started on, and whenever someone shares a baseline our reaction is just: wow, that looks amazing! So the question is: how do you write a baseline of your own?
Filling missing values with the mean, median or mode
# fill missing data with the column's mean, median or mode
def impute_NA_with_avg(data, strategy='mean', NA_col=[]):
    """
    Replace the NAs with the mean/median/most frequent value of that variable.
    Note it should only be fitted on the training set and then propagated to the test set.
    """
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum() > 0:
            if strategy == 'mean':
                data_copy[i + '_impute_mean'] = data_copy[i].fillna(data[i].mean())
            elif strategy == 'median':
                data_copy[i + '_impute_median'] = data_copy[i].fillna(data[i].median())
            elif strategy == 'mode':
                data_copy[i + '_impute_mode'] = data_copy[i].fillna(data[i].mode()[0])
        else:
            print("Column %s has no missing values" % i)
    return data_copy
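A minimal usage sketch, assuming train_data / test_public DataFrames and illustrative column names: the statistics are computed on the training set only and then reused for the test set, as the docstring requires.

na_cols = ['monthly_income', 'debt_loan_ratio']   # illustrative column names

# adds <col>_impute_mean columns on the training set
train_data = impute_NA_with_avg(train_data, strategy='mean', NA_col=na_cols)

# propagate the training-set means to the test set
for col in na_cols:
    test_public[col + '_impute_mean'] = test_public[col].fillna(train_data[col].mean())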
Extracting the number from a string that contains digits
import re

def workYearDIc(x):
    if str(x) == 'nan':
        return 0
    x = x.replace('< 1', '0')
    return int(re.search(r'(\d+)', x).group())

train_data['work_year'] = train_data['work_year'].map(workYearDIc)
e.g. train_data['work_year'] before the map:
    1          NaN
    2     < 1 year
    3     10 years
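With raw values like these, the mapped column becomes 0, 0 and 10 respectively (NaN and '< 1 year' both map to 0, '10 years' to 10).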
Converting letters to numbers, with a dict or a Series
class_dict = {
    'A': 1,
    'B': 2,
    'C': 3,
    'D': 4,
    'E': 5,
    'F': 6,
    'G': 7,
}
dict_ = df_train.groupby(col)[target_col].mean()   # a Series indexed by category also works with .map()
train_data['class'] = train_data['class'].map(class_dict)
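A hedged sketch of using the Series variant as a simple target encoder: learn the mapping on the training set only, then map both train and test with it (the column names below are illustrative).

col, target_col = 'employer_type', 'isDefault'   # illustrative names
target_mean = train_data.groupby(col)[target_col].mean()

train_data[col + '_te'] = train_data[col].map(target_mean)
test_public[col + '_te'] = test_public[col].map(target_mean)
# categories unseen in training become NaN; fall back to the global target mean
test_public[col + '_te'] = test_public[col + '_te'].fillna(train_data[target_col].mean())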
Label encoding: converting categorical strings to numbers
cat_cols = ['employer_type', 'industry']

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

for col in cat_cols:
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])  # here train_data is the concatenation of train_data and train_inte
    test_public[col] = lbl.transform(test_public[col])
    train_inte[col] = lbl.transform(train_inte[col])
def encode_LE(col, train=train, test=test, verbose=True):
    # label-encode a single column over train and test together
    data = pd.concat([train, test], sort=False)
    lbl = LabelEncoder()
    lbl.fit(list(data[col].values))
    data[col] = lbl.transform(list(data[col].values))
    train = data[0:len(train)]
    test = data[len(train):]
    return train, test
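Usage sketch, encoding a couple of categorical columns in place (column names illustrative):

for c in ['employer_type', 'industry']:
    train, test = encode_LE(c, train=train, test=test)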
Getting the cluster labels from KMeans
from sklearn.cluster import KMeans

clf = KMeans(n_clusters=5, random_state=2021)
pre = clf.fit(mmsModel)   # mmsModel: the (scaled) feature matrix to cluster
test = pre.labels_        # cluster label of each sample
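mmsModel is not defined in the snippet above; presumably it is the min-max scaled numeric feature matrix. A sketch of how it might be built, with illustrative column names and assuming those columns have no missing values:

from sklearn.preprocessing import MinMaxScaler

num_cols = ['total_loan', 'interest', 'monthly_payment']   # illustrative feature names
mmsModel = MinMaxScaler().fit_transform(train[num_cols])   # scale each feature to [0, 1]

The resulting labels_ array can then be attached back to train as a new categorical feature, e.g. train['kmeans_label'] = pre.labels_.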
For strong features whose distributions differ between train and test, drop the mismatched part of the training samples
# Around 5 to 6.5 the training set's lmt shows peaks that do not appear in the test set; keeping the positive/negative ratio in mind, drop the corresponding lmt range from part of the negative samples so the lmt distributions stay consistent
train1 = train.loc[train.target==0]
train2 = train.loc[train.target==1]
a = train1['lmt'].between(5,5.2)
train1 = train1[~a]
a = train1['lmt'].between(6,6.2)
train1 = train1[~a]
train = pd.concat([train1,train2],axis=0,ignore_index=True)
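A quick way to spot this kind of mismatch in the first place is to overlay the train and test densities of the feature; a sketch with seaborn, assuming the test frame is test_public as used earlier:

import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(train['lmt'], label='train')
sns.kdeplot(test_public['lmt'], label='test')
plt.legend()
plt.show()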
Computing PSI (Population Stability Index) to check feature stability, as commonly done in risk control
import numpy as np
import pandas as pd

def calculate_psi(base_list, test_list, bins=20, min_sample=10):
    try:
        base_df = pd.DataFrame(base_list, columns=['score'])
        test_df = pd.DataFrame(test_list, columns=['score'])

        # 1. sample counts of the two distributions after dropping missing values
        base_notnull_cnt = len(list(base_df['score'].dropna()))
        test_notnull_cnt = len(list(test_df['score'].dropna()))
        # missing-value bucket
        base_null_cnt = len(base_df) - base_notnull_cnt
        test_null_cnt = len(test_df) - test_notnull_cnt

        # 2. build bin edges (quantile bins, respecting the minimum samples per bin)
        q_list = []
        if type(bins) == int:
            bin_num = min(bins, int(base_notnull_cnt / min_sample))
            q_list = [x / bin_num for x in range(1, bin_num)]
            break_list = []
            for q in q_list:
                bk = base_df['score'].quantile(q)
                break_list.append(bk)
            break_list = sorted(list(set(break_list)))  # deduplicate, then sort
            score_bin_list = [-np.inf] + break_list + [np.inf]
        else:
            score_bin_list = bins

        # 3. count samples in each bin
        base_cnt_list = [base_null_cnt]
        test_cnt_list = [test_null_cnt]
        bucket_list = ["MISSING"]
        for i in range(len(score_bin_list) - 1):
            left = round(score_bin_list[i + 0], 4)
            right = round(score_bin_list[i + 1], 4)
            bucket_list.append("(" + str(left) + ',' + str(right) + ']')

            base_cnt = base_df[(base_df.score > left) & (base_df.score <= right)].shape[0]
            base_cnt_list.append(base_cnt)

            test_cnt = test_df[(test_df.score > left) & (test_df.score <= right)].shape[0]
            test_cnt_list.append(test_cnt)

        # 4. aggregate the counts into distributions
        stat_df = pd.DataFrame({"bucket": bucket_list, "base_cnt": base_cnt_list, "test_cnt": test_cnt_list})
        stat_df['base_dist'] = stat_df['base_cnt'] / len(base_df)
        stat_df['test_dist'] = stat_df['test_cnt'] / len(test_df)

        def sub_psi(row):
            # 5. PSI contribution of a single bin
            base_dist = row['base_dist']
            test_dist = row['test_dist']
            # handle bins with zero samples
            if base_dist == 0 and test_dist == 0:
                return 0
            elif base_dist == 0 and test_dist > 0:
                base_dist = 1 / base_notnull_cnt
            elif base_dist > 0 and test_dist == 0:
                test_dist = 1 / test_notnull_cnt
            return (test_dist - base_dist) * np.log(test_dist / base_dist)

        stat_df['psi'] = stat_df.apply(lambda row: sub_psi(row), axis=1)
        stat_df = stat_df[['bucket', 'base_cnt', 'base_dist', 'test_cnt', 'test_dist', 'psi']]
        psi = stat_df['psi'].sum()
    except Exception:
        print('error!!!')
        psi = np.nan
        stat_df = None
    return psi, stat_df
# flag features whose distribution shifts between train and test
for i in train.columns:
    psi, stat_df = calculate_psi(base_list=list(train[i]),
                                 test_list=list(x_test[i]),
                                 bins=5, min_sample=50)
    if psi > 0.1:
        print(i + ' : ' + str(psi))
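As a rough rule of thumb in risk modeling, PSI below 0.1 is usually read as stable, 0.1 to 0.25 as needing attention, and above 0.25 as a significant shift; columns flagged by the loop above are candidates to drop, re-bin, or transform.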
Manual feature binning
cutoff = [float('-inf'), 0, 1, 4, 10, 15, float('inf')]
dt['A_cut'] = pd.cut(dt['A'], bins=cutoff, right=False)

e.g. A_cut:
    0    [-inf, 0)
    1    [0, 1)
Then apply a categorical encoding to turn the bins into numeric values, as sketched below.
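Since pd.cut returns an ordered Categorical, one simple option (a sketch reusing the dt / A_cut names above) is to take the category codes directly:

dt['A_cut_code'] = dt['A_cut'].cat.codes   # integer code per bin, in bin order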
K-fold cross-validation: returns out-of-fold predictions on the training set, predictions on the test set, and feature importances
import gc
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
def train_model(data_, test_, y_, folds_):
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault']]

    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]

        clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.03,
            num_leaves=2**5,
            colsample_bytree=.65,   # 0.65
            subsample=.9,
            max_depth=5,
            # max_bin=250,
            reg_alpha=1,            # 1.3
            reg_lambda=.6,          # 0.3 -> 89425, 0.5 -> 89423
            min_split_gain=.02,     # 0.05 and values around 2.3 were also tried
            min_child_weight=1.9,   # 2.2-2.4, 2.3
            silent=-1,
            verbose=-1,
        )
        print('Fold', n_fold + 1)
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric='auc', verbose=800, early_stopping_rounds=30  # 30, 400
                )

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds))

    test_['y'] = sub_preds
    return oof_preds, test_[['loan_id', 'y']], feature_importance_df
y = train_second['isDefault']
folds = KFold(n_splits=5, shuffle=True, random_state=546789)
oof_preds, test_preds, importances = train_model(train_second, test_second, y, folds)
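From here the test predictions can be written out as a submission file; the format below (loan_id plus an isDefault probability) is an assumption, so check the competition's sample submission:

sub = test_preds.rename(columns={'y': 'isDefault'})
sub[['loan_id', 'isDefault']].to_csv('baseline_submission.csv', index=False)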
Feature importance plotting function
import seaborn as sns
import matplotlib.pyplot as plt

def display_importances(feature_importance_df_):
    # plot the top-50 features by mean importance across folds
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean() \
        .sort_values(by="importance", ascending=False)[:50].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')
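Called on the importances DataFrame returned by train_model above:

display_importances(importances)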