数据预处理和特征选择

最新推荐文章于 2022-11-01 16:54:11 发布

置顶 Kyrie_Irving

最新推荐文章于 2022-11-01 16:54:11 发布

阅读量445

点赞数

本文链接：https://blog.csdn.net/Kyrie_Irving/article/details/102507666

版权

数据预处理

def split_data(all_data, label):
    """Split a DataFrame into shuffled feature and label arrays.

    Missing values are replaced with 0 on a filled copy, so the caller's
    frame is no longer modified as a side effect of the split.

    Args:
        all_data: DataFrame holding the feature columns and the label column.
        label: Name of the label column; every other column is a feature.

    Returns:
        Tuple ``(X, Y)`` produced by ``shuffle`` from the feature array and
        the label array. The label array keeps a single column because it is
        selected with a one-element column list.
    """
    # Every column except the label is treated as a feature.
    feature_cols = [col for col in all_data.columns if col != label]
    # TODO: filling with 0 is a placeholder imputation (marked "need fix" by
    # the original author). fillna without inplace returns a new frame.
    filled = all_data.fillna(0)
    feature_data = np.array(filled[feature_cols])
    label_data = np.array(filled[[label]])
    X, Y = shuffle(feature_data, label_data)
    return X, Y

# Scale the data and optionally persist the fitted transformer so the same
# transform can be reused when the model is deployed.
# Usage: X, x_test = pro_data(X, x_test=x_test, save=True)
def pro_data(X, x_test=None, fit_func=None, save=False):
    """Fit a scaler on X and apply it to X and, optionally, to x_test.

    Args:
        X: Training data the scaler is fitted on.
        x_test: Optional data transformed with the scaler fitted on X.
        fit_func: Transformer exposing ``fit`` and ``transform``. When omitted
            a fresh ``StandardScaler()`` is created for this call, so no
            fitted state is shared between calls.
        save: When true, hand the fitted transformer to ``save_modelf`` under
            the name 'standar'.

    Returns:
        The transformed X when x_test is None, otherwise the tuple
        ``(transformed X, transformed x_test)``.
    """
    # Honor the caller-supplied transformer; fall back to standardization.
    normalizer = StandardScaler() if fit_func is None else fit_func
    normalizer.fit(X)
    X = normalizer.transform(X)
    if save:
        save_modelf('standar', normalizer)
    if x_test is None:
        return X
    # The test data reuses the statistics learned from X; it is never re-fitted.
    return X, normalizer.transform(x_test)

# Handle imbalanced data by resampling.
def smote_sample(X, y):
    """Resample ``(X, y)`` with a default-configured ``SMOTE`` instance.

    Args:
        X: Feature data passed to ``SMOTE.fit_resample``.
        y: Labels passed to ``SMOTE.fit_resample``.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    balanced_X, balanced_y = SMOTE().fit_resample(X, y)
    return balanced_X, balanced_y

WOE（Weight of Evidence，证据权重）特征数据处理

# Transform the train and test frames with CattoWoe, using 'classification'
# as the label column (presumably a categorical-to-WOE encoder — confirm
# against the CattoWoe implementation).
Cw = CattoWoe('classification')
wclf = Cw.fit(train)
train = wclf.transform()
print(train)
# NOTE(review): the encoder is fitted again on the test set here; presumably
# the encoding learned from the training set should be reused instead to
# avoid leakage — confirm against the CattoWoe API before changing.
wclf = Cw.fit(test)
test = wclf.transform()
print('*' * 20)
print(test)

# The training split was missing: X and Y were used below without ever being
# derived from the transformed training frame.
X, Y = split_data(train, label='classification')
x_test, y_test = split_data(test, label='classification')
# Scale both sets with a scaler fitted on the training features only.
X, x_test = pro_data(X, x_test=x_test, save=False)
# Oversample the training set only (this line previously had a stray leading
# space, which is an IndentationError at module level).
X, Y = smote_sample(X, Y)

特征选择

def lasso_func(X, Y, cols, alp, positive=False, line=0, save=False, n_iter=100):
    """Rank features by how often Lasso gives them a non-zero coefficient.

    The model is fitted ``n_iter`` times, each time with an alpha drawn at
    random from ``alp``; a feature's score is the fraction of fits in which
    its coefficient was non-zero.

    Args:
        X: Feature matrix; ``X.shape[1]`` is taken as the number of features.
        Y: Labels. A copy is made and labels equal to 0 are recoded to -1.
        cols: Feature names, expected to match the columns of X in order
            (they are paired with the scores by position).
        alp: Pair ``(low, high)``; each fit uses
            ``random.uniform(low, high)`` as alpha.
        positive: Forwarded to ``Lasso``; when true the coefficients are
            constrained to be positive.
        line: Only features whose score is strictly greater than this
            threshold are kept.
        save: When true, write the result to 'lasso_select_temp.xlsx'.
        n_iter: Number of randomized fits. Defaults to 100, the value that
            was previously hard-coded in both the loop and the divisor.

    Returns:
        DataFrame with columns 'percent' (selection frequency) and 'col'
        (feature name), sorted by 'percent' in descending order.

    Raises:
        ValueError: If ``n_iter`` is not positive.
    """
    if n_iter <= 0:
        raise ValueError("n_iter must be a positive integer")

    Y1 = Y.copy()
    Y1[Y1 == 0] = -1
    # Per-feature count of fits that kept the feature.
    hits = np.zeros([X.shape[1], ])
    for _ in range(n_iter):
        alpha = random.uniform(alp[0], alp[1])
        clf = Lasso(alpha=alpha, positive=positive)
        clf.fit(X, Y1)
        # Count non-zero coefficients without overwriting the fitted coef_.
        hits = hits + (clf.coef_ != 0)

    # Attach the column names to the selection frequencies by position.
    df_lasso = pd.DataFrame(hits / n_iter)
    print(df_lasso.shape)

    df_lasso = pd.concat([df_lasso, pd.DataFrame(cols)],
                         ignore_index=True, axis=1)
    df_lasso.columns = ['percent', 'col']
    df_lasso = df_lasso[df_lasso['percent'] > line]
    # ascending=False sorts from most to least frequently selected.
    df_lasso = df_lasso.sort_values(by=['percent'], ascending=False)

    if save:
        df_lasso.to_excel('lasso_select_temp.xlsx')
    return df_lasso

import random
from sklearn.linear_model import Lasso
# (low, high) range from which lasso_func draws a random alpha for each fit.
alp = (0, 0.2)
# Rank the features by Lasso selection frequency; save=True also writes the
# ranking to an Excel file.
# NOTE(review): `cols` is not defined in the visible module-level code — it
# must hold the feature names in the same order as the columns of X; confirm
# where it is set.
df_lasso = lasso_func(X, Y, cols, alp,save=True)

在这里插入图片描述

# Random-forest feature ranking.
def rf_func(X, Y, cols, n_estimators=200, line=0):
    """Rank features by random-forest importance.

    Args:
        X: Feature matrix.
        Y: Labels.
        cols: Feature names, paired with the importances by position.
        n_estimators: Number of trees passed to ``RandomForestClassifier``.
        line: Only features whose importance is strictly greater than this
            threshold are kept.

    Returns:
        DataFrame with columns 'percent' (importance) and 'col' (feature
        name), sorted by 'percent' in descending order.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators)
    # Shuffle samples and labels together before fitting.
    shuffled_X, shuffled_Y = shuffle(X, Y)
    forest.fit(shuffled_X, shuffled_Y)

    importances = pd.DataFrame(forest.feature_importances_)
    names = pd.DataFrame(cols)
    ranked = pd.concat([importances, names], ignore_index=True, axis=1)
    ranked.columns = ['percent', 'col']
    above_threshold = ranked[ranked['percent'] > line]
    return above_threshold.sort_values(by=['percent'], ascending=False)
# xgboost效果


def xgb_func(X, Y, cols, num_boost_round=200, line=0):
    """Rank features by the f-score of a trained XGBoost binary classifier.

    Args:
        X: Feature matrix used to build the ``DMatrix``.
        Y: Binary labels.
        cols: Feature names, expected to match the columns of X in order.
        num_boost_round: Number of boosting rounds.
        line: Only features whose f-score is strictly greater than this
            threshold are kept.

    Returns:
        DataFrame with columns 'feature' (feature position), 'fscore' and
        'col' (feature name), sorted by 'fscore' in descending order. The
        'feature' column is retained: the original ``drop`` call discarded
        its result, so callers have always received this column.
    """
    dtrain = xgb.DMatrix(X, label=Y)
    # Training parameters.
    # NOTE(review): 'silent' looks deprecated in newer xgboost releases in
    # favour of 'verbosity' — confirm against the installed version.
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'seed': 7,
              'nthread': 4,
              'silent': 1}
    watchlist = [(dtrain, 'train')]
    bst = xgb.train(
        params, dtrain, num_boost_round=num_boost_round, evals=watchlist)

    # Per-feature scores reported by the trained booster.
    importance = sorted(bst.get_fscore().items(), key=operator.itemgetter(1))
    df_xgb = pd.DataFrame(importance, columns=['feature', 'fscore'])
    # Strip the one-character prefix to get the feature position; assumes
    # names of the form 'f<N>' (presumably the defaults when X has no
    # feature names — TODO confirm).
    df_xgb['feature'] = df_xgb['feature'].apply(lambda x: int(x[1:]))
    # Index by feature position so the concat below lines each score up with
    # the matching entry of cols.
    df_xgb.index = df_xgb['feature']
    df_xgb = pd.concat([df_xgb, pd.DataFrame(cols)], ignore_index=True, axis=1)
    df_xgb.columns = ['feature', 'fscore', 'col']
    # Features absent from the score table have NaN scores and fail this
    # comparison, so they are dropped here as well.
    df_xgb = df_xgb[df_xgb['fscore'] > line]
    # Sort into a new frame rather than in place on a filtered slice.
    return df_xgb.sort_values(by=['fscore'], ascending=False)