记录一下 XGBoost 模型的训练、保存、加载及使用
数据集和代码见文末
导入相关包
import numpy as np
from sklearn.model_selection import train_test_split
import math
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve
import sklearn.metrics as metrics
import xgboost as xgb
读取数据
# Load the training CSV; the file's first column becomes the row index.
df_data = pd.read_csv(
    '/home/mw/input/data5700/cs-training.csv',
    encoding='GB18030',
    index_col=0,
)
# Notebook display: preview the first rows.
df_data.head()
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
1 1 0.76613 45 2 0.80298 9120.00000 13 0 6 0 2.00000
2 0 0.95715 40 0 0.12188 2600.00000 4 0 0 0 1.00000
3 0 0.65818 38 1 0.08511 3042.00000 2 1 0 0 0.00000
4 0 0.23381 30 0 0.03605 3300.00000 5 0 0 0 0.00000
5 0 0.90724 49 1 0.02493 63588.00000 7 0 1 0 0.00000
缺失值检查和处理
# Count missing values per column, then tabulate the count and its share of
# all rows (notebook display of the resulting frame).
null_val_sums = df_data.isnull().sum()
pd.DataFrame(
    {
        "特征名称": null_val_sums.index,
        "缺失值数目": null_val_sums.values,
        "缺失值占比": null_val_sums.values / len(df_data),
    }
)
特征名称 缺失值数目 缺失值占比
0 SeriousDlqin2yrs 0 0.00000
1 RevolvingUtilizationOfUnsecuredLines 0 0.00000
2 age 0 0.00000
3 NumberOfTime30-59DaysPastDueNotWorse 0 0.00000
4 DebtRatio 0 0.00000
5 MonthlyIncome 29731 0.19821
6 NumberOfOpenCreditLinesAndLoans 0 0.00000
7 NumberOfTimes90DaysLate 0 0.00000
8 NumberRealEstateLoansOrLines 0 0.00000
9 NumberOfTime60-89DaysPastDueNotWorse 0 0.00000
10 NumberOfDependents 3924 0.02616
# Replace the missing entries of every column with that column's median.
for i in null_val_sums.index:
    column_median = df_data[i].median()
    df_data[i] = df_data[i].fillna(column_median)
# Notebook display: confirm that no missing values remain.
df_data.isnull().sum()
SeriousDlqin2yrs 0
RevolvingUtilizationOfUnsecuredLines 0
age 0
NumberOfTime30-59DaysPastDueNotWorse 0
DebtRatio 0
MonthlyIncome 0
NumberOfOpenCreditLinesAndLoans 0
NumberOfTimes90DaysLate 0
NumberRealEstateLoansOrLines 0
NumberOfTime60-89DaysPastDueNotWorse 0
NumberOfDependents 0
dtype: int64
划分数据集
# Every column except the first one (the SeriousDlqin2yrs label) is a feature.
feature_list = list(df_data)[1:]
feature_list
单击部分隐藏输出,双击全隐藏
['RevolvingUtilizationOfUnsecuredLines',
'age',
'NumberOfTime30-59DaysPastDueNotWorse',
'DebtRatio',
'MonthlyIncome',
'NumberOfOpenCreditLinesAndLoans',
'NumberOfTimes90DaysLate',
'NumberRealEstateLoansOrLines',
'NumberOfTime60-89DaysPastDueNotWorse',
'NumberOfDependents']
# First hold out 20% of the rows as the test set, stratified on the label.
X_train_T, X_test, y_train_T, y_test = train_test_split(
    df_data[feature_list],
    df_data['SeriousDlqin2yrs'],
    test_size=0.2,
    random_state=42,
    stratify=df_data['SeriousDlqin2yrs'],
)
# Then split 20% of the remaining rows off as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_T[feature_list],
    y_train_T,
    test_size=0.2,
    random_state=42,
    stratify=y_train_T,
)
# Mean of the label in the final training split.
print('坏样本占比为: ', y_train.mean())
坏样本占比为: 0.06684375
模型训练
# The validation split is the data monitored for early stopping.
eval_set = [(X_valid, y_valid)]

# NOTE(review): newer XGBoost releases expect early_stopping_rounds and
# eval_metric on the XGBClassifier constructor rather than on fit() --
# confirm against the installed xgboost version before upgrading.
model_xgb = xgb.XGBClassifier(random_state=2022, n_estimators=800)
model_xgb = model_xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=100,
    eval_metric="auc",
    eval_set=eval_set,
)
模型评估
# ROC curve and AUC on the training split (positive-class probability).
y_pred_train = model_xgb.predict_proba(X_train)[:, 1]
fpr_xgb_train, tpr_xgb_train, _ = roc_curve(y_train, y_pred_train)
roc_auc_train = metrics.auc(fpr_xgb_train, tpr_xgb_train)

# Same computation on the held-out test split.
y_pred = model_xgb.predict_proba(X_test)[:, 1]
fpr_xgb_evl, tpr_xgb_evl, _ = roc_curve(y_test, y_pred)
roc_auc_evl = metrics.auc(fpr_xgb_evl, tpr_xgb_evl)

# Draw both curves against the diagonal reference line.
plt.title('Receiver Operating Characteristic')
roc_curves = (
    (fpr_xgb_train, tpr_xgb_train, 'b', 'Train_AUC = %0.2f' % roc_auc_train),
    (fpr_xgb_evl, tpr_xgb_evl, 'y', 'evl_AUC = %0.2f' % roc_auc_evl),
)
for curve_fpr, curve_tpr, curve_fmt, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, curve_fmt, label=curve_label)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
模型保存
import pickle  # pickle module

# Serialize the fitted classifier to model_xgb.pickle (relative path, so it
# lands in the current working directory).
with open('model_xgb.pickle', 'wb') as model_file:
    pickle.dump(model_xgb, model_file)
模型加载
## Read the model back from disk
import pickle

# NOTE(review): the save step writes to the relative path 'model_xgb.pickle';
# this absolute path presumably is the notebook's working directory -- confirm.
# pickle.load can execute arbitrary code, so only load files you created.
with open('/home/mw/project/model_xgb.pickle', 'rb') as model_file:
    model = pickle.load(model_file)
数据加载
# Re-read the raw CSV for inference. Without index_col the row id is column 0
# and the SeriousDlqin2yrs label is column 1, so columns 2 onward are the ten
# model features.
data = pd.read_csv('/home/mw/input/data5700/cs-training.csv')
data_features = data.iloc[:, 2:]
# Apply the same per-column median imputation that was applied before
# training; otherwise the model is fed NaNs it never saw while fitting.
data_features = data_features.fillna(data_features.median())
预测函数编写
def pre_data(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10):
    """Classify a single record with the loaded model and describe the result.

    The ten arguments are the feature values, in the same order as the keys
    of the ``features`` mapping built below.

    Relies on the module-level ``model`` object loaded earlier in the file.

    Returns:
        '不存在逾期' when the predicted class is 0, otherwise '存在逾期'.
    """
    # Map each positional value to its feature name.
    features = {
        'RevolvingUtilizationOfUnsecuredLines': f1,
        'age': f2,
        'NumberOfTime30-59DaysPastDueNotWorse': f3,
        'DebtRatio': f4,
        'MonthlyIncome': f5,
        'NumberOfOpenCreditLinesAndLoans': f6,
        'NumberOfTimes90DaysLate': f7,
        'NumberRealEstateLoansOrLines': f8,
        'NumberOfTime60-89DaysPastDueNotWorse': f9,
        'NumberOfDependents': f10,
    }
    # One-row frame; the column order is pinned to the mapping's key order
    # instead of repeating the ten names a second time.
    p_data = pd.DataFrame([features], columns=list(features))

    # predict() returns one class label per row; there is exactly one row.
    r = model.predict(p_data)[0]

    # Translate the class label into the user-facing message.
    if r == 0:
        return '不存在逾期'
    return '存在逾期'
单条预测
# One hand-written record; the values follow pre_data's f1..f10 order.
sample_record = (0.02566, 38, 0, 0.47584, 3000.00000, 7, 0, 1, 0, 2.00000)
print(pre_data(*sample_record))
不存在逾期
批量预测
# Predict the first 500 records one at a time. head(500) simply yields fewer
# rows when the frame is shorter, instead of raising IndexError as a
# hard-coded range(500) would.
for _, feature_row in data_features.head(500).iterrows():
    # The row's values are already in pre_data's f1..f10 order.
    print(pre_data(*feature_row))