目录
❄️导入模块❄️
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
warnings.filterwarnings("ignore")
🌜数据处理🌛
1️⃣加载数据集
%%time
# Load the historical (training) and target (scoring) customer tables.
train, test = (pd.read_csv(path)
               for path in ("historic customer behavior.csv",
                            "target customer.csv"))
print("train shape : ", train.shape)
print("test shape : ", test.shape)
2️⃣字符变量编码
# Show which columns hold string ("object") data and thus need encoding.
object_columns = [name for name in train.columns
                  if train[name].dtype == "object"]
for name in object_columns:
    print(name)
# Label-encode every string column. Each encoder is fit on the UNION of
# train and test values so categories that appear only in the test set
# still get a valid integer code at transform time.
for column in tqdm_notebook(train.columns):
    if train[column].dtype != "object":
        continue
    combined_values = list(train[column].values) + list(test[column].values)
    le = LabelEncoder()
    le.fit(combined_values)
    train[column] = le.transform(list(train[column].values))
    test[column] = le.transform(list(test[column].values))
3️⃣分离数据集和标签
# Separate the label column from the feature matrix.
y = train['Risk_Flag']
X = train.drop(columns=['Risk_Flag'])
4️⃣超参数设置
# LightGBM hyper-parameters for the binary risk-classification task.
params = {
    'learning_rate': 0.05,     # step size; default 0.1, usually tuned in 0.05-0.1
    'boosting_type': 'gbdt',   # alternatives: 'dart', 'goss', 'rf'
    'objective': 'binary',     # binary classification task
    'metric': 'auc',           # evaluation metric
    'max_depth': 8,            # default -1 means unlimited depth
    'num_leaves': 63,          # max leaves per tree (< 2**max_depth)
    'lambda_l1': 1,            # L1 regularization
    'lambda_l2': 1,            # L2 regularization
    'min_child_samples': 100,  # min rows required in a leaf
    'min_child_weight': 1,     # min sum of hessian in a leaf
    'feature_fraction': 1,     # fraction of features sampled per tree
    'bagging_fraction': 1,     # fraction of rows sampled per iteration
    'bagging_freq': 2,         # perform bagging every 2 iterations
    "verbose": -1,             # suppress LightGBM log output
    'seed': 66,                # random seed for reproducibility
    # BUG FIX: was the Python string 'True'; LightGBM documents this as a
    # boolean flag that reweights classes for the imbalanced target.
    'is_unbalance': True,
}
5️⃣保存特征重要性
# Table that will accumulate one importance column per CV fold.
feature_importances = pd.DataFrame({'features': X.columns})
🔞交叉验证训练模型🔞
# 5-fold cross-validated LightGBM training; the model from the fold with
# the highest validation AUC is kept as `best_model`.
folds = KFold(n_splits=5, shuffle=True, random_state=10)
best_auc = 0
best_model = None
for fold, (tr_idx, va_idx) in enumerate(folds.split(X, y)):
    print('fold ', fold + 1)
    # Slice out this fold's train/validation features and labels.
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_va, label=y_va)
    lgb_model = lgb.train(params=params,
                          train_set=dtrain,
                          num_boost_round=1000,      # upper bound on boosting rounds
                          valid_sets=dvalid,
                          valid_names='validation',
                          early_stopping_rounds=200,  # stop if no gain for 200 rounds
                          verbose_eval=False)
    # Store this fold's feature importances in its own column.
    feature_importances[f'fold_{fold+1}'] = lgb_model.feature_importance()
    # Score the held-out fold.
    y_val_pred = lgb_model.predict(X_va)
    roc_auc = roc_auc_score(y_va, y_val_pred)
    print(f" Fold {fold + 1} | AUC_ROC : { roc_auc * 100}%")
    print(f"{'-'*50}\n")
    # Keep the model whose validation AUC is best so far.
    if roc_auc > best_auc:
        best_auc = roc_auc
        best_model = lgb_model
1️⃣可视化特征重要性
# Average each feature's importance across ALL CV folds.
# BUG FIX: the original used range(folds.n_splits - 1), which silently
# dropped the last fold's column from the average.
fold_columns = [f'fold_{i+1}' for i in range(folds.n_splits)]
feature_importances['average'] = feature_importances[fold_columns].mean(axis=1)
# Bar chart of features ranked by average importance.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False),
            x='average',
            y='features')
plt.title("top features importance over {} folds average.".format(folds.n_splits))
2️⃣模型预测
# BUG FIX: the original printed `roc_auc` (the LAST fold's score) under the
# label "best"; report the tracked best validation AUC instead.
print("The best roc_auc : ", best_auc)
# Predict risk probabilities for the target customers with the best model.
pred = best_model.predict(test)
from collections import Counter
# Threshold the probabilities at 0.5 and tally the predicted classes.
labels = np.round(pred)  # np.round rounds to the nearest integer (0 or 1)
Counter(labels)