金融风控入门赛05

最新推荐文章于 2024-04-07 17:52:53 发布

尼尔-冯-哈尔滨

最新推荐文章于 2024-04-07 17:52:53 发布

阅读量113

点赞数

分类专栏：我的博客文章标签：机器学习 python

本文链接：https://blog.csdn.net/m0_37671786/article/details/108834865

版权

我的博客专栏收录该内容

41 篇文章 2 订阅

订阅专栏

金融风控入门赛05

这次的风控入门赛终于进入到最后一次打卡了，这是我第二次参加DataWhale组队学习了。其实关于风控的相关内容自己以前也多多少少地学习过，不过这次的组队学习感觉像一次练兵，补足了我往常学习中的不足之处。好了，废话少说，下面进入正题：

1.模型融合

在机器学习建模比赛中，常常会遇到单一模型的预测指标存在上限的情况。在这种情况下，除了特征工程和参数调节之外，提升模型准确度的可用方法就是模型融合了！
模型融合顾名思义，就是将多个模型预测的结果汇总得到最终的预测结果。其实，机器学习中有些模型本身就运用了模型融合的思想，如Boosting就是模型融合的一种：XGBoost就是利用提升树将多棵决策树的结果汇总，得到最终预测结果的一种方法。不过这次所提到的模型融合是指不同类型模型之间的融合。在金融风控场景中，线性模型和树模型都可以使用，因此可采用多种不同的机器学习模型进行预测。

模型融合的常用方法

平均：

简单平均法
加权平均法

投票：

简单投票法
加权投票法

综合：

排序融合
log融合

stacking:

构建多层模型，并利用预测结果再拟合预测。

blending：

选取部分数据训练模型并得到预测结果，将预测结果作为新特征，代入剩下的数据中进行预测。

2.代码示例

数据读取

import pandas as pd
import numpy as np
import warnings
import os
import seaborn as sns
import matplotlib.pyplot as plt
"""
sns 相关设置
@return:
"""
# Seaborn / Matplotlib display configuration.
# Apply the whole theme in ONE call. sns.set() resets every theme parameter
# that is not passed explicitly back to its default, so calling it after
# set_style()/set_context() would discard the "whitegrid" style and the
# "talk" context chosen here.
#   style:   one of darkgrid, whitegrid, dark, white, ticks
#   context: one of paper, notebook, talk, poster (ordered small to large)
#   font:    SimHei, so that Chinese text renders in Seaborn plots
sns.set(style="whitegrid", context="talk", font='SimHei')
# Matplotlib font for Chinese text (SimHei). Set after sns.set() so the theme
# call does not overwrite it.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Render the minus sign '-' correctly instead of as a square box in saved figures.
plt.rcParams['axes.unicode_minus'] = False

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and print the savings.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range strictly contains the column's min and max. Every other non-object
    column is handled as floating point and cast to float16 or float32 by the
    same rule, falling back to float64. Object columns become ``category``.

    Args:
        df: pandas DataFrame to shrink. Its columns are reassigned in place.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the "MB" labels are accurate.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Pick the narrowest integer type that holds the value range.
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                # NOTE: float16 keeps only about 3 significant decimal digits,
                # so this trades precision for memory.
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero starting size cannot produce a division error.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# Load the training features, test features and training labels.
# The first CSV column is used as the index.
X_train = pd.read_csv('../data/x_train.csv',index_col=0)
X_test = pd.read_csv('../data/x_test.csv',index_col=0)
y_train = pd.read_csv('../data/y_train.csv',index_col=0)
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

# Clean up malformed values in the test set.
# Collect only the columns that actually contain at least one missing value.
short_list = X_test.columns[X_test.isnull().any()].tolist()
if short_list:
    # Fill missing values with each column's mode. mode() returns a DataFrame
    # (one row per tied mode), so take its first row: fillna() then maps one
    # fill value to each column. Passing the DataFrame itself would align on
    # the row index and fill only the rows whose labels match.
    column_modes = X_test[short_list].mode()
    if not column_modes.empty:
        X_test[short_list] = X_test[short_list].fillna(column_modes.iloc[0])
# Drop any rows that still contain missing values after the fill.
X_test=X_test.dropna(axis=0)
# Replace infinite values with 0.
# NOTE(review): np.isinf assumes every column is numeric - confirm that
# X_test has no category columns at this point.
train_inf = np.isinf(X_test)
X_test[train_inf] = 0

这里需要注意的是x_test数据中含有空值和inf值，如果要进行预测需处理后再使用。

投票法

# Simple voting: combine logistic regression, random forest and XGBoost in a
# VotingClassifier left at its default voting mode (no `voting` argument).
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Base models.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    min_child_weight=2,
    subsample=0.7,
    objective='binary:logistic',
)

# Fit the ensemble on the training data and print its test-set predictions.
named_estimators = [('lr', clf1), ('rf', clf2), ('xgb', clf3)]
vclf = VotingClassifier(estimators=named_estimators)
vclf = vclf.fit(X_train, y_train)
test_predictions = vclf.predict(X_test)
print(test_predictions)

加权投票

# Weighted voting: pass voting='soft' and a `weights` list to VotingClassifier;
# `weights` adjusts the weight of each base model. The weights used here are
# [1, 1, 2], in the order lr, rf, xgb.
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Base models.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    min_child_weight=2,
    subsample=0.7,
    objective='binary:logistic',
)

# Fit the weighted ensemble and print its test-set predictions.
named_estimators = [('lr', clf1), ('rf', clf2), ('xgb', clf3)]
vclf = VotingClassifier(
    estimators=named_estimators,
    voting='soft',
    weights=[1, 1, 2],
)
vclf = vclf.fit(X_train, y_train)
test_predictions = vclf.predict(X_test)
print(test_predictions)

Stacking

# Stacking: KNN, random forest and Gaussian naive Bayes as first-level
# classifiers, with logistic regression as the meta-classifier.
import warnings
warnings.filterwarnings('ignore')
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions


# Work on the first 30000 rows only.
X, y = X_train[:30000],y_train[:30000]


clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)


label = ['KNN', 'Random Forest', 'Naive Bayes', 'Stacking Classifier']
clf_list = [clf1, clf2, clf3, sclf]


clf_cv_mean = []
clf_cv_std = []
# Iterate over the classifiers and their display names only. No plotting grid
# is defined in this script (decision-region plotting is disabled), so the
# loop must not reference one. A distinct loop name also keeps the `label`
# list from being overwritten by its own elements.
for clf, clf_name in zip(clf_list, label):
    # 5-fold cross-validated accuracy for this classifier.
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    mean_score = scores.mean()
    std_score = scores.std()
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(mean_score, std_score, clf_name))
    clf_cv_mean.append(mean_score)
    clf_cv_std.append(std_score)

    # Fit the classifier on the whole subset once it has been scored.
    clf.fit(X, y)

blending

# Blending
# The data used here is the first 10000 rows of X_train / y_train.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

data = X_train[:10000]
target = y_train[:10000]

# Base learners whose predictions become the features of the blender.
clfs = [
    LogisticRegression(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
]

# Hold out 30% of the data as the evaluation set.
X, X_predict, y, y_predict = train_test_split(
    data, target, test_size=0.3, random_state=914
)

# Split the remaining data in half: d1 trains the base learners,
# d2 supplies the features the blender is trained on.
X_d1, X_d2, y_d1, y_d2 = train_test_split(
    X, y, test_size=0.5, random_state=914
)
n_base_models = len(clfs)
dataset_d1 = np.zeros((X_d2.shape[0], n_base_models))
dataset_d2 = np.zeros((X_predict.shape[0], n_base_models))

for j, clf in enumerate(clfs):
    # Train each base model on d1.
    clf.fit(X_d1, y_d1)
    # Second predict_proba column on d2 (presumably the positive-class
    # probability) becomes feature j for the blender.
    y_submission = clf.predict_proba(X_d2)[:, 1]
    dataset_d1[:, j] = y_submission
    # For the hold-out set, this model's prediction is used directly as the
    # new feature j.
    holdout_proba = clf.predict_proba(X_predict)[:, 1]
    dataset_d2[:, j] = holdout_proba
    print("val auc Score: %f" % roc_auc_score(y_predict, holdout_proba))


# Blender: a gradient boosting model trained on the base models' d2
# predictions and evaluated on the hold-out set.
clf = GradientBoostingClassifier()
clf.fit(dataset_d1, y_d2)
y_submission = clf.predict_proba(dataset_d2)[:, 1]
blend_auc = roc_auc_score(y_predict, y_submission)
print("Val auc Score of Blending: %f" % (blend_auc))

尼尔-冯-哈尔滨

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
金融风控入门赛05

金融风控入门赛05这次的风控入门赛终于进入到最后一次打卡了，这是我第二次参加DataWhale组队学习了。其实关于风控的相关内容自己以前也多多少少的学习过，不过这次的组队学习感觉像一次练兵，补足了我往常学习中的不足之处。好了，废话少说，下面进入正题：...
复制链接

扫一扫