Feature Engineering

A recent round of tricks for a competition produced almost no measurable improvement. Even though they did not help in this competition, they may be useful in others, so here is a record of the feature engineering I tried. This follows on from the blood glucose post (see the blood glucose prediction post).

1. Remove columns with a single unique value

# Drop columns that contain only one unique value (ignoring NaN)
exclude_unique = []
for c in data.columns:
    num_uniques = len(data[c].unique())
    if data[c].isnull().sum() != 0:
        num_uniques -= 1  # unique() counts NaN as a value, so discount it
    if num_uniques == 1:
        exclude_unique.append(c)
data = data.drop(exclude_unique, axis=1)
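For reference, nearly the same filter can be written as a pandas one-liner, since nunique() ignores NaN by default (an equivalent sketch, not from the original post):

# nunique() ignores NaN, so this drops the single-value columns
# (and, unlike the loop above, also drops all-NaN columns)
data = data.loc[:, data.nunique() > 1]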

2. Remove zero-variance columns

# Drop columns whose standard deviation is 0
remove = []
for c in data.columns:
    if data[c].std() == 0:
        remove.append(c)
data = data.drop(remove, axis=1)
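scikit-learn ships the same filter as VarianceThreshold; a minimal sketch, assuming the frame has no remaining NaNs at this point (VarianceThreshold rejects missing values):

from sklearn.feature_selection import VarianceThreshold

# threshold=0.0 (the default) drops exactly the constant, zero-variance columns
selector = VarianceThreshold(threshold=0.0)
data_reduced = selector.fit_transform(data)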

3. Expand the feature space with polynomial features

from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial expansion (the variable name poly4 is just historical)
poly4 = PolynomialFeatures(degree=3)
x_train_poly4 = poly4.fit_transform(train_feat)
x_test_poly4 = poly4.transform(test_feat)
print(x_train_poly4.shape)
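The expansion grows combinatorially: with n input features and degree d, PolynomialFeatures (with the default include_bias=True) outputs C(n + d, d) columns. A quick sanity check, assuming Python 3.8+ for math.comb:

from math import comb

n = train_feat.shape[1]  # number of input features
d = 3                    # the degree used above
print(comb(n + d, d))    # should equal x_train_poly4.shape[1]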

4. Expand features with a function transform

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# FunctionTransformer is stateless, so a single instance can be reused
log_tf = FunctionTransformer(np.log1p)
x_1 = log_tf.fit_transform(x1)
x_2 = log_tf.transform(x2)
x_3 = log_tf.transform(test_feat[predictors])
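One caveat: np.log1p is only defined for values greater than -1, so it is worth checking the feature range first (a hypothetical guard, not in the original code):

import numpy as np

# log1p(x) requires x > -1; verify before transforming
assert (np.asarray(x1) > -1).all(), 'log1p needs values > -1'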

5. Reduce dimensionality with PCA

from sklearn.decomposition import PCA

pca = PCA(n_components=90)
train_poly4_pca = pca.fit_transform(x_train_poly4)
test_poly4_pca = pca.transform(x_test_poly4)
label = list(y_label)  # the regression target, re-attached after the projection
train_poly4_pca = np.c_[train_poly4_pca, label]
print(test_poly4_pca.shape)
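Rather than hand-picking n_components=90, the cumulative explained variance can guide the choice; a minimal sketch, where the 95% target is an arbitrary example rather than a value from the original post:

import numpy as np
from sklearn.decomposition import PCA

# Fit a full PCA, then count the components needed to keep 95% of the variance
pca_full = PCA().fit(x_train_poly4)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
print(np.argmax(cum_var >= 0.95) + 1)

Alternatively, passing a float such as PCA(n_components=0.95) makes scikit-learn pick that component count directly.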

6. Reduce dimensionality with the Pearson correlation

# Keep only the columns whose Pearson correlation with the target
# is at least 0.05 in absolute value
new_col = []
for i in range(0, x_train_poly4.shape[1]):
    tmp_corr = np.corrcoef(x_train_poly4[:, i], y1)[0][1]
    print("column", i, ":", tmp_corr)
    if not np.isnan(tmp_corr):
        if abs(float(tmp_corr)) >= 0.05:
            new_col.append(i)
print("number of selected columns:", len(new_col))

The Pearson coefficient here comes from numpy (np.corrcoef). Here is another dimensionality-reduction snippet, from the Industrial AI competition:

new_col = []
for i in train.columns:
    miss_val = train[i].isnull().sum()
    if miss_val < 200:  # skip columns with too many missing values
        if train[i].dtypes != 'object':
            train_values = train[i].dropna().unique()
            if (np.std(train_values) != 0) and (len(train_values) > 1):
                if min(train_values) < 9000:  # data-specific filter on the value range
                    tmp_corr = np.corrcoef(train[i].values, train['Y'].values)[0][1]
                    if not np.isnan(tmp_corr):
                        if abs(float(tmp_corr)) >= 0.05:
                            new_col.append(i)
        else:
            new_col.append(i)

This snippet also includes a bit of data preprocessing; thanks to the original author for sharing the code.

 

7. Built-in feature selection with SelectKBest

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression  # use f_classif for classification targets

# Keep the 80 features with the highest F-scores against the target
selectK = SelectKBest(f_regression, k=80)
x_1 = selectK.fit_transform(x_train_poly4, y1)
x_2 = selectK.transform(x_test_poly4)
x_3 = selectK.transform(x_x3_poly4)
print(x_1.shape)
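To inspect which of the expanded columns survived, SelectKBest exposes the chosen indices through get_support (a small usage sketch):

# Indices of the 80 selected columns in the polynomial feature matrix
selected_idx = selectK.get_support(indices=True)
print(selected_idx)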

8. Oversampling to address class imbalance

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Binarizer

# Binarize blood glucose at a threshold of 10 to get a two-class label
features = train[features_columns]  # features_columns: feature names defined earlier
train['血糖'] = Binarizer(threshold=10).fit_transform(train['血糖'].values.reshape(-1, 1)).ravel()
labels = train['血糖']
print(features.dtypes)
# print(labels.head(3))

features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

oversampler = SMOTE(random_state=0)
# note: older imblearn versions call this method fit_sample
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)

# verify the new data set is balanced
print(len(os_labels[os_labels == 1]))

clf = RandomForestClassifier(random_state=0)
clf.fit(os_features, os_labels)

# perform predictions on the test set
actual = labels_test
predictions = clf.predict(features_test)


SMOTE was used here, but this kind of oversampling only applies when the label is binary or multiclass; on anything else it raises an error. The next step, motivated by the sample imbalance, is to classify the samples first and then run regression within each class.
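A minimal sketch of that classify-then-regress idea, reusing the threshold of 10 from the Binarizer above; the variable names (X_train, y_train_reg, X_test) and model choices are hypothetical, not from the original post:

import numpy as np
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier

# Stage 1: classify samples as high (>10) vs. normal blood glucose
y_cls = (y_train_reg > 10).astype(int)  # y_train_reg: the continuous target
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_cls)

# Stage 2: fit one regressor per class, each on that class's samples only
regressors = {}
for cls in (0, 1):
    mask = y_cls == cls
    reg = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.01)
    regressors[cls] = reg.fit(X_train[mask], y_train_reg[mask])

# Predict: route each test sample to the regressor of its predicted class
cls_pred = clf.predict(X_test)
y_pred = np.where(cls_pred == 1,
                  regressors[1].predict(X_test),
                  regressors[0].predict(X_test))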

Finally, here is the full dump of all the messy, ultimately ineffective code:

# coding: utf-8
import time
import datetime
import numpy as np
from pandas import DataFrame
import pandas as pd
import lightgbm as lgb
from dateutil.parser import parse
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import norm, skew
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from imblearn.over_sampling import SMOTE


data_path = 'data/'
train = pd.read_csv(data_path+'d_train_20180102.csv')
test = pd.read_csv(data_path+'d_test_A_20180102.csv')

# train['血糖'] = np.log1p(train['血糖'])
# handle the blood glucose target: drop extreme outliers above 30

train = train.drop(train[(train['血糖'] > 30)].index)
y_label = train['血糖']

HBV = ['乙肝表面抗原',
        '乙肝表面抗体',
        '乙肝e抗原',
        '乙肝e抗体',
        '乙肝核心抗体']
lipid = ['甘油三酯',
         '总胆固醇',
         '高密度脂蛋白胆固醇',
         '低密度脂蛋白胆固醇']
important_feature = [
    '*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶', '*r-谷氨酰基转换酶', '*总蛋白', '尿酸',
    '红细胞体积分布宽度', '红细胞平均体积']


def make_feat(train, test):
    train_id = train.id.values.copy()
    test_id = test.id.values.copy()
    data = pd.concat([train, test])
    # data = data.drop(['血糖'], axis=1)

    # encode gender as 1/0
    data['性别'] = data['性别'].map({'男':1, '女': 0})

    '''
    # convert the checkup date into days since the earliest date
    date = data['体检日期'].min()
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - parse(date)).dt.days
    '''
    data = data.drop(['体检日期'], axis=1)

    # the HBV columns have too many missing values, so drop them
    for hbv in HBV:
        # data[hbv] = data[hbv].fillna(-1)
        data = data.drop([hbv], axis=1)

    # the lipid columns correlate well with the target; fill NaNs with the median
    for lip in lipid:
        data[lip] = data[lip].fillna(data[lip].median())

    # remove zero-variance columns
    remove = []
    for c in data.columns:
        if data[c].std() == 0:
            remove.append(c)
    data = data.drop(remove, axis=1)

    # remove columns with a single unique value (ignoring NaN)
    exclude_unique = []
    for c in data.columns:
        num_uniques = len(data[c].unique())
        if data[c].isnull().sum() != 0:
            num_uniques -= 1
        if num_uniques == 1:
            exclude_unique.append(c)
    data = data.drop(exclude_unique, axis=1)

    data = data.fillna(data.median(axis=0))

    '''
    # standardization (kept commented out)
    data = data.drop(['体检日期'], axis=1)
    scaler = StandardScaler()
    scalar = scaler.fit(train)
    scalar.transform(train)
    scalar.transform(test)
    data['体检日期'] = train['体检日期']
    '''

    train_feat = data[data.id.isin(train_id)]
    test_feat = data[data.id.isin(test_id)]

    '''
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    print(missing_data.head(20))
    '''
    return train_feat, test_feat


train_feat, test_feat = make_feat(train, test)

# oversampling (SMOTE requires a class label; applying it to the continuous
# 血糖 target, as here, raises an error -- see the note in section 8)
smote = SMOTE(random_state=0)
train_feature = train_feat.drop(['血糖'], axis=1)
y_train = train_feat['血糖']
train_new, y_new = smote.fit_resample(train_feature, y_train)  # fit_sample in older imblearn
train_new = pd.DataFrame(train_new, columns=train_feature.columns)
train_new['血糖'] = y_new
print(train_new.head(3))


'''
# polynomial expansion
poly4 = PolynomialFeatures(degree=3)
x_train_poly4 = poly4.fit_transform(train_feat)
x_test_poly4 = poly4.transform(test_feat)
print(x_train_poly4.shape)

# PCA dimensionality reduction
pca = PCA(n_components=90)
train_poly4_pca = pca.fit_transform(x_train_poly4)
test_poly4_pca = pca.transform(x_test_poly4)
label = list(y_label)
train_poly4_pca = np.c_[train_poly4_pca, label]
print(test_poly4_pca.shape)

# convert the ndarrays back into DataFrames
data1 = np.array(train_poly4_pca)
train_feat = DataFrame(data1, index=range(0, 5641, 1), columns=range(0, 91, 1))
data2 = np.array(test_poly4_pca)
test_feat = DataFrame(data2, index=range(0, 1000, 1), columns=range(0, 90, 1))
'''

predictors = [f for f in test_feat.columns if f not in ['血糖']]


def evalerror(pred, df):
    label = df.get_label().values.copy()
    score = mean_squared_error(label, pred)*0.5
    return ('mse', score, False)

print('Starting training...')
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'feature_fraction': 0.7,  # 'sub_feature' and 'colsample_bytree' are aliases of this
    'num_leaves': 60,
    'min_data': 100,
    'min_hessian': 1,
    'verbose': -1,
}

print('Starting 8-fold CV training...')
scores = []
t0 = time.time()
train_preds = np.zeros(train_feat.shape[0])
test_preds = np.zeros((test_feat.shape[0], 8))
kf = KFold(n_splits=8, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf.split(train_feat)):
    print('Training fold {}...'.format(i))
    train_feat1 = train_feat.iloc[train_index]
    x1 = train_feat1[predictors]
    y1 = train_feat1['血糖']
    train_feat2 = train_feat.iloc[test_index]
    x2 = train_feat2[predictors]
    y2 = train_feat2['血糖']
    x3 = test_feat[predictors]

    '''
    # PCA dimensionality reduction
    pca = PCA(n_components=41)
    train_poly4_pca = pca.fit_transform(x1)
    test_poly4_pca = pca.transform(x2)
    x3 = pca.transform(test_feat[predictors])
    label = list(y_label)
    '''

    '''
    # polynomial expansion
    poly4 = PolynomialFeatures(degree=2)
    x_train_poly4 = poly4.fit_transform(x1)
    x_test_poly4 = poly4.transform(x2)
    x_x3_poly4 = poly4.transform(x3)
    print(x_train_poly4.shape)

    # print x_train_poly4
    '''

    '''
    new_col = []
    # Pearson-based dimensionality reduction
    for i in range(0, x_train_poly4.shape[1]):
        tmp_corr = np.corrcoef(x_train_poly4[:, i], y1)[0][1]
        print("column", i, ":", tmp_corr)
        if not np.isnan(tmp_corr):
            if abs(float(tmp_corr)) >= 0.05:
                new_col.append(i)
    print("number of selected columns:", len(new_col))
    '''

    '''
    x_1 = x_train_poly4[:, new_col]
    x_2 = x_test_poly4[:, new_col]
    x_3 = x_x3_poly4[:, new_col]

    '''

    '''
    # function transform with log1p
    x_1 = FunctionTransformer(np.log1p).fit_transform(x1)
    x_2 = FunctionTransformer(np.log1p).transform(x2)
    x_3 = FunctionTransformer(np.log1p).transform(test_feat[predictors])
    '''
    '''
    # feature selection
    selectK = SelectKBest(f_regression, k=80)
    x_1 = selectK.fit_transform(x_train_poly4, y1)
    x_2 = selectK.transform(x_test_poly4)
    x_3 = selectK.transform(x_x3_poly4)
    print(x_1.shape)
    '''

    '''
    # min-max scaling
    min_max_scaler = MinMaxScaler()
    train_minmax = min_max_scaler.fit_transform(x1)
    test_minmax = min_max_scaler.transform(x2)
    '''

    lgb_train1 = lgb.Dataset(x1, train_feat1['血糖'])
    lgb_train2 = lgb.Dataset(x2, train_feat2['血糖'])
    gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=100,
                    feval=evalerror,
                    early_stopping_rounds=180)
    # feat_imp = pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False)
    train_preds[test_index] += gbm.predict(x2)
    # te = min_max_scaler.transform(test_feat[predictors])
    test_preds[:, i] = gbm.predict(x3)
print('Offline score: {}'.format(mean_squared_error(train_feat['血糖'], train_preds) * 0.5))
print('CV training took {} seconds'.format(time.time() - t0))


submission = pd.DataFrame({'pred': test_preds.mean(axis=1)})
submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), header=None,
                  index=False, float_format='%.4f')

A bar chart of the correlation between each feature and the label, using the Pearson coefficient:

import matplotlib.pyplot as plt

x_cols = [col for col in train_df.columns if col not in ['label'] and train_df[col].dtype != 'object']

labels = []
values = []
for col in x_cols:
    labels.append(col)
    values.append(np.corrcoef(train_df[col].values, train_df.label.values)[0, 1])
corr_df = pd.DataFrame({'col_labels': labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values')

ind = np.arange(len(labels))
width = 0.5
fig, ax = plt.subplots(figsize=(12, 40))
rects = ax.barh(ind, np.array(corr_df.corr_values.values), color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation coefficient of the variables")
plt.show()

If a feature is right-skewed (positively skewed), log1p can be applied to correct it.

for col in range(1, 19):
    train_df['C_' + str(col)] = train_df['C_' + str(col)].map(lambda x: np.log1p(x))
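A quick way to decide which columns actually need the transform is to check their skewness first, e.g. with scipy.stats.skew; a sketch where the 0.75 cutoff is a common rule of thumb, not a value from the original post:

import numpy as np
from scipy.stats import skew

# Apply log1p only to noticeably right-skewed columns
for col in range(1, 19):
    name = 'C_' + str(col)
    if skew(train_df[name].dropna()) > 0.75:
        train_df[name] = np.log1p(train_df[name])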
