xgboost训练、评估与模型的保存、加载及使用

本文介绍了一种利用XGBoost进行信贷风险预测的方法。通过处理数据集中的缺失值并应用XGBoost训练模型,实现了对借款人未来两年内是否可能发生严重拖欠的预测。文中详细展示了模型训练过程、性能评估及如何保存和加载训练好的模型。
摘要由CSDN通过智能技术生成

记录一下xgboost训练与模型的保存、加载及使用

数据集和代码见文末

导入相关包

import numpy as np 
from sklearn.model_selection import train_test_split
import math
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve
import sklearn.metrics as metrics
import xgboost as xgb

读取数据

# Load the training CSV; the file's first column is used as the row index.
df_data = pd.read_csv(
    '/home/mw/input/data5700/cs-training.csv',
    index_col=0,
    encoding='GB18030',
)
# Preview the first rows (rendered as the notebook cell's output).
df_data.head()
	SeriousDlqin2yrs	RevolvingUtilizationOfUnsecuredLines	age	NumberOfTime30-59DaysPastDueNotWorse	DebtRatio	MonthlyIncome	NumberOfOpenCreditLinesAndLoans	NumberOfTimes90DaysLate	NumberRealEstateLoansOrLines	NumberOfTime60-89DaysPastDueNotWorse	NumberOfDependents
1	1	0.76613	45	2	0.80298	9120.00000	13	0	6	0	2.00000
2	0	0.95715	40	0	0.12188	2600.00000	4	0	0	0	1.00000
3	0	0.65818	38	1	0.08511	3042.00000	2	1	0	0	0.00000
4	0	0.23381	30	0	0.03605	3300.00000	5	0	0	0	0.00000
5	0	0.90724	49	1	0.02493	63588.00000	7	0	1	0	0.00000

缺失值检查和处理

# Count missing values per column and show each count next to its share of all rows.
null_val_sums = df_data.isnull().sum()
row_count = len(df_data)
pd.DataFrame(
    {
        "特征名称": null_val_sums.index,
        "缺失值数目": null_val_sums.values,
        "缺失值占比": null_val_sums.values / row_count,
    }
)
特征名称	缺失值数目	缺失值占比
0	SeriousDlqin2yrs	0	0.00000
1	RevolvingUtilizationOfUnsecuredLines	0	0.00000
2	age	0	0.00000
3	NumberOfTime30-59DaysPastDueNotWorse	0	0.00000
4	DebtRatio	0	0.00000
5	MonthlyIncome	29731	0.19821
6	NumberOfOpenCreditLinesAndLoans	0	0.00000
7	NumberOfTimes90DaysLate	0	0.00000
8	NumberRealEstateLoansOrLines	0	0.00000
9	NumberOfTime60-89DaysPastDueNotWorse	0	0.00000
10	NumberOfDependents	3924	0.02616
# Replace every missing value with the median of its own column.
# NOTE(review): the medians are computed on the full table, before the
# train/test split further down, so held-out rows influence the imputed
# value -- confirm this is acceptable for the evaluation.
column_medians = df_data.median()
df_data = df_data.fillna(column_medians)

# Re-count missing values per column to confirm none remain.
df_data.isnull().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

划分数据集

# Feature columns are all columns except the first one, which is the
# SeriousDlqin2yrs label.
feature_list = list(df_data.columns[1:])
feature_list

单击部分隐藏输出,双击全隐藏
['RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfDependents']
# Hold out 20% of the rows as the test set, stratified on the label.
label = df_data['SeriousDlqin2yrs']
X_train_T, X_test, y_train_T, y_test = train_test_split(
    df_data[feature_list],
    label,
    test_size=0.2,
    random_state=42,
    stratify=label,
)
# Split the remaining rows again (80/20, stratified) into training and
# validation sets; the validation set is used for early stopping.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_T[feature_list],
    y_train_T,
    test_size=0.2,
    random_state=42,
    stratify=y_train_T,
)

# The mean of the label column is reported as the share of "bad" samples.
print('坏样本占比为: ',y_train.mean())

坏样本占比为: 0.06684375

模型训练

# Validation split monitored during training for early stopping.
eval_set = [(X_valid, y_valid)]
# `eval_metric` and `early_stopping_rounds` are estimator parameters: passing
# them to fit() was deprecated in XGBoost 1.6 and is rejected with a TypeError
# from 2.0 onwards, so they are set on the constructor instead.
# NOTE: this form requires XGBoost >= 1.6; the constructor does not accept
# these parameters before that release.
model_xgb = xgb.XGBClassifier(
    random_state=2022,
    n_estimators=800,
    eval_metric="auc",
    early_stopping_rounds=100,
)
# Train for up to 800 boosting rounds; training stops early once the
# validation AUC has not improved for 100 consecutive rounds.
model_xgb.fit(X_train, y_train, eval_set=eval_set)

模型评估

# ROC curve and AUC on the training split (column 1 of predict_proba is the
# probability of the positive class).
y_pred_train = model_xgb.predict_proba(X_train)[:, 1]
fpr_xgb_train, tpr_xgb_train, _ = metrics.roc_curve(y_train, y_pred_train)
roc_auc_train = metrics.auc(fpr_xgb_train, tpr_xgb_train)

# Same computation on the held-out test split.
y_pred = model_xgb.predict_proba(X_test)[:, 1]
fpr_xgb_evl, tpr_xgb_evl, _ = metrics.roc_curve(y_test, y_pred)
roc_auc_evl = metrics.auc(fpr_xgb_evl, tpr_xgb_evl)

# Legend entries carry the AUC rounded to two decimals.
train_label = 'Train_AUC = %0.2f' % roc_auc_train
evl_label = 'evl_AUC = %0.2f' % roc_auc_evl

# Axes setup.
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0, 1])
plt.ylim([0, 1])

# Train curve in blue, test curve in yellow, chance diagonal dashed red.
plt.plot(fpr_xgb_train, tpr_xgb_train, 'b', label=train_label)
plt.plot(fpr_xgb_evl, tpr_xgb_evl, 'y', label=evl_label)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.show()

(图:训练集与测试集的 ROC 曲线)

模型保存

import pickle  # standard-library object serialization

# Serialize the fitted classifier to a file in the current working directory.
with open('model_xgb.pickle', 'wb') as model_file:
    pickle.dump(model_xgb, model_file)

模型加载

# Load the saved model.
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickle files that come from a trusted source.
import pickle

with open('/home/mw/project/model_xgb.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

数据加载

# Re-read the raw CSV. Without index_col, column 0 holds the row id and
# column 1 the SeriousDlqin2yrs label, so the ten model features start at
# column 2.
data = pd.read_csv('/home/mw/input/data5700/cs-training.csv')
data_features = data.iloc[:, 2:]
# The raw file has missing values in MonthlyIncome and NumberOfDependents.
# The model was trained on median-imputed data, so apply the same per-column
# median imputation here to keep scoring consistent with training.
data_features = data_features.fillna(data_features.median())

预测函数编写

# Column order of the one-row frame handed to the classifier.
_FEATURE_COLUMNS = (
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
)


def pre_data(f1,f2,f3,f4,f5,f6,f7,f8,f9,f10, *, estimator=None):
    """Classify one applicant and return the verdict as a display string.

    Args:
        f1: RevolvingUtilizationOfUnsecuredLines.
        f2: age.
        f3: NumberOfTime30-59DaysPastDueNotWorse.
        f4: DebtRatio.
        f5: MonthlyIncome.
        f6: NumberOfOpenCreditLinesAndLoans.
        f7: NumberOfTimes90DaysLate.
        f8: NumberRealEstateLoansOrLines.
        f9: NumberOfTime60-89DaysPastDueNotWorse.
        f10: NumberOfDependents.
        estimator: Optional object with a ``predict(DataFrame)`` method. When
            omitted, the module-level ``model`` is used.

    Returns:
        '不存在逾期' when the predicted class is 0, otherwise '存在逾期'.

    Raises:
        NameError: If ``estimator`` is omitted and no global ``model`` exists.
    """
    # Build the one-row input frame with columns in the fixed feature order.
    values = (f1, f2, f3, f4, f5, f6, f7, f8, f9, f10)
    p_data = pd.DataFrame([values], columns=list(_FEATURE_COLUMNS))

    # Fall back to the globally loaded model only when none was passed in.
    clf = model if estimator is None else estimator

    # predict() returns one label per row; the frame has exactly one row.
    r = clf.predict(p_data)[0]

    # Translate the class label into the user-facing message.
    return '不存在逾期' if r == 0 else '存在逾期'

单条预测

# Score one hand-written applicant; values are given in the f1..f10 order.
sample = (0.02566, 38, 0, 0.47584, 3000.00000, 7, 0, 1, 0, 2.00000)
print(pre_data(*sample))

不存在逾期

批量预测

# Score the first 500 rows and print one verdict per row. head() simply
# yields fewer rows when the table is shorter, where indexing a fixed
# range(500) with iloc would raise IndexError.
for row in data_features.head(500).itertuples(index=False, name=None):
    # Each row is a plain tuple in column order; the first ten values map
    # onto f1..f10.
    print(pre_data(*row[:10]))

(图:批量预测的输出结果)

数据集和代码

数据集

点击下载数据

代码

点击下载代码

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

爱挠静香的下巴

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值