# Data preprocessing
def split_data(all_data, label):
    """Split a DataFrame into shuffled feature and label arrays.

    Args:
        all_data: DataFrame holding the feature columns and the label column.
        label: Name of the label column; every other column is a feature.

    Returns:
        A pair ``(X, Y)``: ``X`` is built from all non-label columns and
        ``Y`` from the single label column (selected as a one-column frame,
        so it is two-dimensional). Both are passed through ``shuffle``.
    """
    # Every column except the label is treated as a feature.
    feature_cols = [col for col in all_data.columns if col != label]
    # TODO: the original author flagged zero-filling of missing values as
    # needing a fix. fillna() without inplace returns a new frame, so the
    # caller's DataFrame is no longer modified as a side effect.
    filled = all_data.fillna(0)
    feature_data = np.array(filled[feature_cols])
    label_data = np.array(filled[[label]])
    # NOTE(review): `shuffle` is imported outside this block; presumably
    # sklearn.utils.shuffle, which permutes both arrays consistently - confirm.
    X, Y = shuffle(feature_data, label_data)
    return X, Y
# Preprocess data: scale the features and optionally save the fitted
# transformer so that a deployed model can reuse it.
# Usage: X, x_test = pro_data(X, x_test=x_test, save=True)
def pro_data(X, x_test=None, fit_func=None, save=False):
    """Fit a scaler on ``X`` and apply it to ``X`` and, optionally, ``x_test``.

    Args:
        X: Training features the transformer is fitted on.
        x_test: Optional test features, transformed with the transformer
            fitted on ``X`` (never re-fitted on the test data).
        fit_func: Transformer object exposing ``fit`` and ``transform``.
            When ``None`` a fresh ``StandardScaler()`` is created, so no
            fitted state is shared between calls.
        save: When true, the fitted transformer is handed to
            ``save_modelf`` under the name ``'standar'``.

    Returns:
        The transformed ``X`` when ``x_test`` is ``None``; otherwise the
        tuple ``(X, x_test)`` of transformed arrays.
    """
    # Previously `fit_func` was accepted but ignored; honour it so callers
    # can choose a different kind of scaling.
    normalizer = StandardScaler() if fit_func is None else fit_func
    normalizer.fit(X)
    X = normalizer.transform(X)
    if save:
        save_modelf('standar', normalizer)
    if x_test is None:
        return X
    return X, normalizer.transform(x_test)
# Handle imbalanced data.
def smote_sample(X, y):
    """Resample ``X`` and ``y`` with a default-configured ``SMOTE`` sampler.

    Returns:
        The resampled ``(X, y)`` pair produced by ``fit_resample``.
    """
    resampler = SMOTE()
    features, labels = resampler.fit_resample(X, y)
    return features, labels
# WOE feature data processing
# Encode the train and test frames with CattoWoe (presumably a categorical ->
# WOE encoder, going by its name); 'classification' is the label column.
Cw = CattoWoe('classification')
wclf = Cw.fit(train)
train = wclf.transform()
print(train)
# NOTE(review): the encoder is fitted again on the test frame, so the test
# encoding does not reuse what was learned from the training data. Confirm
# whether CattoWoe can transform new data with an already fitted encoder.
wclf = Cw.fit(test)
test = wclf.transform()
print('*' * 20)
print(test)
# Derive the training arrays from the encoded training frame. The visible
# code used X and Y below without ever building them from `train`.
X, Y = split_data(train, label='classification')
x_test, y_test = split_data(test, label='classification')
# Scale with statistics fitted on the training features only.
X, x_test = pro_data(X, x_test=x_test, save=False)
# Oversample the training set only; the test set is left as is.
X, Y = smote_sample(X, Y)
# Feature selection
def lasso_func(X, Y, cols, alp, positive=False, line=0, save=False):
    """Score features by how often Lasso keeps them over 100 random alphas.

    Args:
        X: Feature matrix; ``X.shape[1]`` is the number of features.
        Y: Label array; entries equal to 0 are recoded to -1 on a copy.
        cols: Feature names, placed next to the scores in the result.
        alp: Pair ``(low, high)``; each round draws alpha uniformly from it.
        positive: Forwarded to ``Lasso``; when true coefficients are
            constrained to be positive.
        line: Only features whose selection rate exceeds this are kept.
        save: When true the result is written to 'lasso_select_temp.xlsx'.

    Returns:
        DataFrame with columns ``percent`` (fraction of rounds in which the
        coefficient was non-zero) and ``col``, sorted by ``percent``
        in descending order.
    """
    # Work on a copy so the caller's labels are not modified.
    targets = Y.copy()
    targets[targets == 0] = -1
    # Per-feature count of rounds with a non-zero coefficient.
    hits = np.zeros([X.shape[1], ])
    for _ in range(100):
        # random.uniform draws a real number within [alp[0], alp[1]].
        alpha = random.uniform(alp[0], alp[1])
        model = Lasso(alpha=alpha, positive=positive)
        model.fit(X, targets)
        hits = hits + (model.coef_ != 0)
    # Attach the column names to the selection rates.
    selection = pd.DataFrame(hits / 100)
    print(selection.shape)
    selection = pd.concat(
        [selection, pd.DataFrame(cols)], ignore_index=True, axis=1)
    selection.columns = ['percent', 'col']
    above_line = selection[selection['percent'] > line]
    # ascending=False gives descending order.
    ranked = above_line.sort_values(by=['percent'], ascending=False)
    if save:
        ranked.to_excel('lasso_select_temp.xlsx')
    return ranked
import random
from sklearn.linear_model import Lasso
# (low, high) range from which lasso_func draws a random alpha each round.
alp = (0, 0.2)
# NOTE(review): `cols` is not assigned anywhere in the visible code; it
# presumably holds the feature column names matching X - confirm.
# save=True makes lasso_func write its result to 'lasso_select_temp.xlsx'.
df_lasso = lasso_func(X, Y, cols, alp,save=True)
# Random forest feature importances.
def rf_func(X, Y, cols, n_estimators=200, line=0):
    """Rank features by ``RandomForestClassifier`` importance.

    Args:
        X: Feature matrix.
        Y: Label array.
        cols: Feature names, placed next to the importances in the result.
        n_estimators: Number of trees passed to the classifier.
        line: Only features whose importance exceeds this are kept.

    Returns:
        DataFrame with columns ``percent`` (the fitted model's
        ``feature_importances_``) and ``col``, sorted by ``percent`` in
        descending order.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators)
    X, Y = shuffle(X, Y)
    forest.fit(X, Y)
    # Put each importance next to its column name.
    importances = pd.concat(
        [pd.DataFrame(forest.feature_importances_), pd.DataFrame(cols)],
        ignore_index=True, axis=1)
    importances.columns = ['percent', 'col']
    kept = importances[importances['percent'] > line]
    return kept.sort_values(by=['percent'], ascending=False)
# XGBoost feature importances.
def xgb_func(X, Y, cols, num_boost_round=200, line=0):
    """Rank features by the f-score of a trained XGBoost booster.

    Args:
        X: Feature matrix used to build the ``DMatrix``.
        Y: Label array.
        cols: Feature names, placed next to the scores in the result.
        num_boost_round: Number of boosting rounds passed to ``xgb.train``.
        line: Only features whose f-score exceeds this are kept.

    Returns:
        DataFrame with columns ``feature``, ``fscore`` and ``col``, sorted
        by ``fscore`` in descending order.
    """
    dtrain = xgb.DMatrix(X, label=Y)
    # Training parameters.
    # NOTE(review): 'silent' has been superseded by 'verbosity' in newer
    # xgboost releases - confirm against the installed version.
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'seed': 7,
              'nthread': 4,
              'silent': 1}
    watchlist = [(dtrain, 'train')]
    bst = xgb.train(
        params, dtrain, num_boost_round=num_boost_round, evals=watchlist)
    # Collect the per-feature scores reported by the booster.
    importance = sorted(bst.get_fscore().items(), key=operator.itemgetter(1))
    df_xgb = pd.DataFrame(importance, columns=['feature', 'fscore'])
    # Strip the one-character prefix from the feature name to get a number
    # (presumably xgboost's default 'f<N>' names - confirm).
    df_xgb['feature'] = df_xgb['feature'].apply(lambda x: int(x[1:]))
    # Index by that number so the concat below lines each score up with the
    # entry of `cols` at the same position.
    df_xgb.index = df_xgb['feature']
    df_xgb = pd.concat([df_xgb, pd.DataFrame(cols)], ignore_index=True, axis=1)
    df_xgb.columns = ['feature', 'fscore', 'col']
    # The original called df_xgb.drop(['feature'], axis=1) and discarded the
    # result, so the column was never removed. The dead call is gone and the
    # column is kept so the returned frame has the same shape as before.
    # Rows without a score hold NaN after the concat and fail this comparison.
    df_xgb = df_xgb[df_xgb['fscore'] > line]
    # Sort into a new frame rather than in place on the filtered selection.
    return df_xgb.sort_values(by=['fscore'], ascending=False)