背景
本文的目的是预测客户下个月是否会违约。原始数据为 CSV 格式,共 25 列。
数据和源代码在这里
字段描述
1. 数据概览
# 所有需要用到的包
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
%matplotlib inline
# Print NumPy arrays with two decimal places.
np.set_printoptions(precision=2)

# 1. Data overview
# Load the training file, rename the label column so that it has no space in
# its name, and drop the ID column, which is not used as a feature.
df = (
    pd.read_csv('D:/data/data/training.csv')
    .rename(columns={'payment flag': 'payment_flag'})
    .drop(columns=['ID'])
)

# Report the shape, the per-column dtypes and the summary statistics.
print(f'数据大小:{df.shape}')
print('字段类型:\n', df.dtypes)
print('字段描述\n', df.describe())
2. 数据预处理
2.1 是否存在缺失值?
# Count missing cells over the whole frame; when there are any, drop every
# row that contains one and report the resulting shape.
missing_cells = df.isnull().sum().sum()
if missing_cells <= 0:
    print('无缺失值')
else:
    df = df.dropna()
    print('已删除缺失值!')
    print(f'删除后数据大小{df.shape}')
2.2 删除重复记录
# Count fully duplicated rows; when there are any, keep only the first
# occurrence of each and report the resulting shape.
duplicate_count = df.duplicated().sum()
if duplicate_count <= 0:
    print('无重复记录!')
else:
    df = df.drop_duplicates()
    print('已删除重复记录!')
    print(f'删除后数据大小{df.shape}')
2.3 删除不符合逻辑的数据记录
# Found by comparing each field's basic statistics with the supplied
# data_describe file.

# 2.3.1 EDUCATION contains values that are not in the field description,
# e.g. EDUCATION=0 or 6.
print(df['EDUCATION'].value_counts())
# Treat 0, 4, 5 and 6 as "other" (taken here to mean below high school) and
# recode them all to 0.
# The result is assigned back to the column: calling replace(inplace=True) on
# the `df['EDUCATION']` selection modifies a temporary Series and leaves `df`
# unchanged when pandas Copy-on-Write is active.
df['EDUCATION'] = df['EDUCATION'].replace([4, 5, 6], 0)
print(df['EDUCATION'].value_counts())

# 2.3.2 MARRIAGE contains 0.
print(df['MARRIAGE'].value_counts())
# Treat 0 and 3 as "other" and recode them both to 0.
df['MARRIAGE'] = df['MARRIAGE'].replace(3, 0)
print(df['MARRIAGE'].value_counts())

# 2.3.3 PAY_0 and PAY_2..PAY_6 all contain -2; PAY_0 is shown as an example.
print((df['PAY_0'].value_counts()))
# Treat -2, -1 and 0 as "debt paid off" and recode them all to 0.
# NOTE(review): the columns are selected by position; this assumes positions
# 5-10 (after ID was dropped) are PAY_0 and PAY_2..PAY_6 -- confirm against
# the CSV header.
pay_status_positions = [5, 6, 7, 8, 9, 10]
df.iloc[:, pay_status_positions] = df.iloc[:, pay_status_positions].replace([-2, -1], 0)
print(df.iloc[:, pay_status_positions].describe())
2.4 查看不同标签比例
# Count how many rows carry each label value (1 = default, 0 = no default).
label_column = df['payment_flag']
flag = label_column.value_counts()
print('下个月是否违约人数比(违约:1;未违约:0)\n', flag)

# Marks the end of the preprocessing section in the output.
print('---------------数据预处理完成!---------------')
3. 数据探索
def _plot_flag_distributions(column, title, xlabel):
    """Overlay the distribution of ``column`` for the two label groups.

    Rows with payment_flag == 0 are drawn in blue and rows with
    payment_flag == 1 in red, each as a 200-bin histogram with a KDE curve.
    Returns the two row subsets as ``(flag_0_rows, flag_1_rows)``.
    """
    flag_0_rows = df[df['payment_flag'] == 0]
    flag_1_rows = df[df['payment_flag'] == 1]
    # SimHei is configured as the sans-serif font so the Chinese titles render.
    sns.set(rc={
        'figure.figsize': (10, 5),
        'font.size': 15,
        'font.sans-serif': ['SimHei'],
        'axes.titlesize': 15,
        'axes.labelsize': 15,
    })
    plt.title(title)
    sns.set_color_codes('pastel')
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it
    # is kept so the figures stay exactly as before.
    for subset, colour in ((flag_0_rows, 'blue'), (flag_1_rows, 'red')):
        sns.distplot(subset[column], kde=True, bins=200, color=colour)
    plt.xlabel(xlabel)
    plt.show()
    return flag_0_rows, flag_1_rows


# Credit limit, split by label.
flag_0, flag_1 = _plot_flag_distributions('LIMIT_BAL', '不同信用额度违约情况图', '信用额度')
print('从整体来看,大部分的用户信用额度集中在10万以内,并且低额度的用户违约概率更高,高额度的用户违约概率较低。临界点在11万左右。')

# Age, split by label.
flag_0, flag_1 = _plot_flag_distributions('AGE', '不同年龄段违约情况图', '年龄')
print('从年龄分布上来看,用户主要集中在20-40岁之间,且26-45岁的用户违约率较低,其余年龄段违约概率相比较高')
# Two side-by-side count plots (sex and marital status), each split by label.
fig = plt.subplots(1, 2, figsize=(16, 12))
sns.set(rc={
    'figure.figsize': (10, 5),
    'font.size': 15,
    'font.sans-serif': ['SimHei'],
    'axes.titlesize': 15,
    'axes.labelsize': 15,
})

# One entry per panel:
# (subplot position, column, title, x tick labels, x axis label).
count_panels = (
    (1, 'SEX', '不同性别违约情况图', ['男', '女'], "性别"),
    (2, 'MARRIAGE', '不同婚姻状况违约情况图', ['其他', '已婚', '单身'], "婚姻状况"),
)
for position, column, title, tick_labels, axis_label in count_panels:
    plt.subplot(1, 2, position)
    plt.title(title)
    panel = sns.countplot(x=column, hue='payment_flag', data=df, palette='magma')
    # Replace the numeric category codes on the x axis with readable names.
    panel.set_xticklabels(tick_labels, fontsize=12)
    plt.xlabel(axis_label)
plt.show()

print('虽然女性用户高于男性用户,但在违约数量上并没有明显高于男性用户,说明男性用户的违约概率要大一些。')
print('同理,已婚用户要比未婚用户的违约率大一些。')
# Count plot of education level, split by label.
# axes.labelsize is 15, matching every other plot in this file; the previous
# value of 1 rendered the x axis label at an unreadable one-point size.
sns.set(rc={
    'figure.figsize': (10, 5),
    'font.size': 15,
    'font.sans-serif': ['SimHei'],
    'axes.titlesize': 15,
    'axes.labelsize': 15,
})
plt.title('不同教育程度违约情况图')
# NOTE(review): the tick labels are applied to the categories in plotting
# order (presumably ascending: 0, 1, 2, 3). If this is the UCI "default of
# credit card clients" data, the documented coding is 1 = graduate school,
# 2 = university, 3 = high school, which would make these labels reversed --
# TODO confirm against the field description.
education_plot = sns.countplot(x='EDUCATION', hue='payment_flag', data=df, palette='magma')
education_plot.set_xticklabels(['其他', '高中', '大学', '研究生及以上'], fontsize=12)
plt.xlabel("教育程度")
plt.show()
print('从该数据源可以暂时推断大学及以上的用户是信用卡的目标用户。并且大学及以上的用户违约概率会略小于高中用户群体。')
print('---------------数据探索完成!---------------')
4. 模型比较
4.1封装的函数
4.1.1 重采样
from imblearn.over_sampling import SMOTE


def smo(df_x, y, columns, random_state=None):
    """Oversample the data with SMOTE and print label counts before and after.

    Args:
        df_x: Feature table to resample.
        y: Labels; must support ``y['payment_flag']`` (e.g. a one-column
            DataFrame), because the counts are computed through that key.
        columns: Not used by this function; kept so existing calls still work.
        random_state: Optional seed forwarded to ``SMOTE`` so the resampling
            can be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns:
        A tuple ``(df_res, df_x_res, y_res)``: the resampled features and
        labels concatenated column-wise, the resampled features, and the
        resampled labels.
    """
    print('采样前')
    print('payment flag=0:' + str(y[y['payment_flag'] < 1].shape))
    print('payment flag=1:' + str(y[y['payment_flag'] == 1].shape))

    # A distinct local name avoids shadowing this function's own name.
    sampler = SMOTE(random_state=random_state)
    df_x_res, y_res = sampler.fit_resample(df_x, y)
    df_res = pd.concat([pd.DataFrame(df_x_res), pd.DataFrame(y_res)], axis=1)

    print('采样后')
    print('payment flag=0:' + str(y_res[y_res['payment_flag'] < 1].shape))
    print('payment flag=1 :' + str(y_res[y_res['payment_flag'] == 1].shape))
    print('------------重采样完成!------------')
    return df_res, df_x_res, y_res
4.1.2 模型评估
def evaluate(model_name, model, x_test, y_test):
    """Print a fitted model's test score and its classification report.

    Args:
        model_name: Label used as the prefix of the printed score line.
        model: Fitted estimator exposing ``score`` and ``predict``.
        x_test: Test features.
        y_test: True labels for ``x_test``.
    """
    accuracy = model.score(x_test, y_test)
    print(model_name + '的准确率:' + str(accuracy))

    # Predict once; the earlier version ran predict twice and discarded the
    # first result.
    predictions = model.predict(x_test)
    print(classification_report(y_test, predictions))
4.2 建模
哑变量处理
如果你的模型对无序的分类变量比较敏感就要做这一步的操作,这里我没做。
# Optional dummy encoding of the unordered categorical columns (left disabled):
# new_colunms=['SEX','MARRIAGE']
# df=pd.get_dummies(df,columns=new_colunms)

# Separate the features from the label; the label is kept as a one-column
# DataFrame.
df_x = df.drop(columns=['payment_flag'])
y = df['payment_flag'].to_frame()
columns = list(df.columns)

# Hold out 30% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    y,
    test_size=0.3,
    random_state=1,
)
# Optional SMOTE oversampling of the training split (left disabled):
# df_res, x_train, y_train=smo(x_train,y_train, columns)

from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply them
# to both splits.
stdScale = StandardScaler()
x_train = stdScale.fit_transform(x_train)
x_test = stdScale.transform(x_test)
# XGBClassifier is used below but is not imported anywhere else in this file,
# so without this import the XGBoost step raises NameError.
from xgboost import XGBClassifier

# The training label is a one-column DataFrame here; flatten it to 1-D, which
# is the label shape the scikit-learn estimators expect for fit.
y_train_1d = np.ravel(y_train)

# Baseline models, all with default hyper-parameters.
LR = LogisticRegression()
RF = RandomForestClassifier()
GBC = GradientBoostingClassifier()
AdB = AdaBoostClassifier()
XGB = XGBClassifier()

# Fit each model on the training split, then print its test metrics, in the
# same order as before.
baseline_models = (
    ('逻辑回归', LR),
    ('随机森林', RF),
    ('GBDT', GBC),
    ('Adboost', AdB),
    ('xgboost', XGB),
)
for model_name, model in baseline_models:
    model.fit(x_train, y_train_1d)
    evaluate(model_name, model, x_test, y_test)
5. 选择GBDT调参
我查到的几篇GBDT调参教程都是逐步调参。我也试过把下面几个参数放在同一次网格搜索里一起调,但速度非常慢,大概跑了一个晚上。这是因为联合搜索的组合数是各参数取值个数的乘积,逐步调参也许正是出于这个原因?我不太确定,欢迎指教。
5.1 首先选择较高的步长,大概0.1,加快收敛速度。
5.1.1 对基学习器的数目进行调参
from sklearn.model_selection import GridSearchCV

# Search the number of boosting stages (10, 20, ..., 140) at a learning rate
# of 0.1, scored by 5-fold cross-validated accuracy.
param_test1 = {'n_estimators': range(10, 150, 10)}
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=0.1, random_state=10),
    param_grid=param_test1,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)
gsearch1.fit(x_train, y_train)

# Print the mean CV score of every candidate, then the best one.
means = gsearch1.cv_results_['mean_test_score']
params = gsearch1.cv_results_['params']
for position, candidate in enumerate(params):
    print(f'{means[position]:f} {candidate!r}')
print(gsearch1.best_params_, gsearch1.best_score_)
5.1.2 对树的高度,内部节点划分最小样本数进行调参
# Search tree depth (2-5) together with the minimum number of samples needed
# to split an internal node (800-2000, step 200), with n_estimators fixed
# at 50.
param_test2 = {
    'max_depth': range(2, 6, 1),
    'min_samples_split': range(800, 2001, 200),
}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=50,
        random_state=10,
    ),
    param_grid=param_test2,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)
gsearch2.fit(x_train, y_train)

# Print the mean CV score of every candidate.
means = gsearch2.cv_results_['mean_test_score']
params = gsearch2.cv_results_['params']
for position, candidate in enumerate(params):
    print(f'{means[position]:f} {candidate!r}')
# Bare expression: shown as the cell result when run in a notebook.
gsearch2.best_params_, gsearch2.best_score_
5.1.3对内部节点再划分所需最小样本数和叶子结点最少样本数一起调参
# Search the minimum samples to split a node (800-1800, step 200) together
# with the minimum samples per leaf (10-90, step 20), with n_estimators=50
# and max_depth=3 fixed.
param_test3 = {
    'min_samples_split': range(800, 2000, 200),
    'min_samples_leaf': range(10, 101, 20),
}
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=50,
        max_depth=3,
        random_state=10,
    ),
    param_grid=param_test3,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)
gsearch3.fit(x_train, y_train)

# Print the mean CV score of every candidate.
means = gsearch3.cv_results_['mean_test_score']
params = gsearch3.cv_results_['params']
for position, candidate in enumerate(params):
    print(f'{means[position]:f} {candidate!r}')
# Bare expression: shown as the cell result when run in a notebook.
gsearch3.best_params_, gsearch3.best_score_
# 5.1.4 Model with the tuned parameters
tuned_params_0 = dict(
    learning_rate=0.1,
    n_estimators=50,
    max_depth=3,
    min_samples_split=1600,
    min_samples_leaf=30,
    random_state=10,
)
gbm_tuned_0 = GradientBoostingClassifier(**tuned_params_0)
gbm_tuned_0.fit(x_train, y_train)
evaluate('gbdt', gbm_tuned_0, x_test, y_test)
5.2 对步长进行调参
5.2.1 减小步长,增大迭代次数,泛化模型
# Smaller learning rate (0.05) with more boosting stages (100); the tree
# parameters are the same as in the previous tuned model.
tuned_params_1 = dict(
    learning_rate=0.05,
    n_estimators=100,
    max_depth=3,
    min_samples_split=1600,
    min_samples_leaf=30,
    random_state=10,
)
gbm_tuned_1 = GradientBoostingClassifier(**tuned_params_1)
gbm_tuned_1.fit(x_train, y_train)
evaluate('gbdt', gbm_tuned_1, x_test, y_test)
5.2.2 继续减小步长,增大迭代次数,泛化模型
# Smaller learning rate again (0.01) with 500 boosting stages; the tree
# parameters are unchanged.
tuned_params_2 = dict(
    learning_rate=0.01,
    n_estimators=500,
    max_depth=3,
    min_samples_split=1600,
    min_samples_leaf=30,
    random_state=10,
)
gbm_tuned_2 = GradientBoostingClassifier(**tuned_params_2)
gbm_tuned_2.fit(x_train, y_train)
evaluate('gbdt', gbm_tuned_2, x_test, y_test)
6. 输出结果
# Load the file to score and drop the unused ID column, as for training.
test_data = pd.read_csv('D:/data/data/testing.csv')
test_data.drop(['ID'], inplace=True, axis=1)

# Apply the same value recoding that the training data received before the
# scaler and the model were fitted; without it the model is scored on raw
# codes it never saw during training.
test_data['EDUCATION'] = test_data['EDUCATION'].replace([4, 5, 6], 0)
test_data['MARRIAGE'] = test_data['MARRIAGE'].replace(3, 0)
# NOTE(review): assumes the file has the same column order as the training
# features, so that positions 5-10 are PAY_0 and PAY_2..PAY_6 -- confirm
# against the CSV header.
pay_status_positions = [5, 6, 7, 8, 9, 10]
test_data.iloc[:, pay_status_positions] = (
    test_data.iloc[:, pay_status_positions].replace([-2, -1], 0)
)

columns = test_data.columns.values.tolist()
# Scale with the parameters learned from the training split.
test_data = stdScale.transform(test_data)
y_pre = gbm_tuned_2.predict(test_data)

# Rebuild a labelled frame of the scaled features plus the prediction.
test_data = pd.DataFrame(test_data, columns=columns)
test_data['payment_flag'] = y_pre

# Only the prediction column is written; the row index is included.
y_pre = pd.DataFrame(y_pre, columns=['payment flag'])
y_pre.to_csv('D:/data/data/python_task_csv.csv')