1.在网格搜索部分其实会过拟合,因为网格搜索优化参数的过程中已经看过了整个训练集的数据然后挑选出来最优参数,接着再用最优参数去拟合训练数据集(相当于建模之前已经偷看了)
2.可以尝试把数据分成三份:训练集、验证集、测试集——在训练集上拟合模型,用验证集挑选最优超参数,最后只在测试集上评估一次,避免信息泄露。
导入各种包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,roc_curve,auc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
导入数据
# Load the raw dataset: first CSV column becomes the index, file is GBK-encoded.
data = pd.read_csv('./data.csv', encoding='gbk', index_col=0)
数据理解
#单独提取出y列标签,和其余的88列标记为x
# Separate the target column ('status') from the feature matrix:
# X holds the remaining 88 feature columns, y the binary label.
X = data.drop(columns='status')
y = data['status']

# Quick look at the feature-matrix dimensions and the label distribution.
print('X.shape:', X.shape)
print('y的分布:', y.value_counts())
X.shape: (4754, 88)
y的分布: 0 3561
1 1193
Name: status, dtype: int64
数据准备
#首先剔除一些明显无用的特征,如id_name,custid,trade_no,bank_card_no
# Drop identifier-like columns that carry no predictive signal.
# Reassignment is preferred over `inplace=True`: pandas discourages
# the inplace form and it offers no performance benefit here.
X = X.drop(columns=['id_name', 'custid', 'trade_no', 'bank_card_no'])
print(X.shape)
# Keep only the numeric feature columns; `.copy()` makes X_num an
# independent frame so later modifications cannot trigger
# SettingWithCopy warnings against X.
X_num = X.select_dtypes('number').copy()
print(X_num