在前面的特征工程章节(https://blog.csdn.net/u010569893/article/details/93380700 )中,讲到了特征工程的概念和常见的特征工程方法,今天将介绍具体的数据挖掘流程。
以下示例代码中包含了大量的特征工程所涉及的步骤,包括:
缺失样本的过滤、空值的填充、异常值处理、计算特征的IV值、特征选择、特征WOE编码、训练集和测试集划分、模型训练、模型评估等一系列数据挖掘的过程,欢迎大家学习和拍砖……
代码如下:
from pandas import DataFrame, Series
from functools import partial
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
import numpy as np
import math
from numpy import loadtxt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score,classification_report,log_loss
def binning(x, nbins=5, strategy='quantile', bin_stat='mean'):
    """Discretize a one-dimensional feature into ordinal bin indices.

    Args:
        x: One-dimensional array-like of numeric values (NumPy array, list,
            pandas Series, ...). It is coerced with ``np.asarray`` and
            reshaped into a single column before fitting.
        nbins: Number of bins passed to ``KBinsDiscretizer``.
        strategy: Binning strategy, either ``'uniform'`` or ``'quantile'``.
        bin_stat: One of ``'mean'``, ``'max'`` or ``'min'``. The value is
            validated but not otherwise used by this function.

    Returns:
        A tuple ``(codes, discretizer)`` where ``codes`` is the output of
        ``fit_transform`` on the single-column input (ordinal encoding) and
        ``discretizer`` is the fitted ``KBinsDiscretizer``.

    Raises:
        ValueError: If ``strategy`` or ``bin_stat`` is not a supported value.
    """
    # Explicit exceptions instead of ``assert`` so the checks survive ``-O``.
    if strategy not in ('uniform', 'quantile'):
        raise ValueError(
            f"strategy must be 'uniform' or 'quantile', got {strategy!r}"
        )
    if bin_stat not in ('mean', 'max', 'min'):
        raise ValueError(
            f"bin_stat must be 'mean', 'max' or 'min', got {bin_stat!r}"
        )

    _discretizer = KBinsDiscretizer(
        n_bins=nbins, strategy=strategy, encode='ordinal'
    )
    # Coerce first: lists and pandas Series do not provide ndarray.reshape.
    column = np.asarray(x).reshape((-1, 1))
    _result = _discretizer.fit_transform(column)
    return _result, _discretizer
# Lower and upper limits named for WOE (weight of evidence) values.
# NOTE(review): their use lies outside this section — presumably clamp
# bounds applied when a bin's WOE would otherwise be unbounded; confirm.
_WOE_MIN, _WOE_MAX = -10, 10
def woe_single_x(x, y, event=1):
nbins=5
if np.unique(x).size <= nbins:
#离散型特征,使用等宽分桶
strategy = 'uniform'
else:
#连续型特征,使用等频分桶
strategy = 'quantile'
x, _discretizer = binning(x, nbins=nbins, strategy=strategy)
event_total, non_event_tota