Notes on scikit-learn 4.3: Preprocessing data

4.3.1. Standardization, or mean removal and variance scaling
Standardization: remove the mean and scale to unit variance

#http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing

from sklearn import preprocessing

import numpy as np

X = np.array([[ 1., -1.,  2.],
              [ 2.,  0.,  0.],
              [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X)  # zero mean and unit variance per column (not necessarily normally distributed)
X_scaled

>>>

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

X_scaled.mean(axis=0)   #array([ 0., 0., 0.])

X_scaled.std(axis=0) #array([ 1., 1., 1.])

scaler = preprocessing.StandardScaler().fit(X)  # fit a StandardScaler to X to learn the per-column mean and scale
scaler
>>>
StandardScaler(copy=True, with_mean=True, with_std=True)

scaler.mean_   # array([ 1.        ,  0.        ,  0.33333333])
scaler.scale_  # array([ 0.81649658,  0.81649658,  1.24721913])

scaler.transform(X)  # standardize X with the learned mean and scale
>>>

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])
scaler.transform([[-1.,  1., 0.]])  # apply the same mean and scale to new data: array([[-2.44948974,  1.22474487, -0.26726124]])
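
As a quick sanity check (my own addition, not from the scikit-learn docs), scale() matches the explicit (X - mean) / std computation, using the population standard deviation (ddof=0):

manual = (X - X.mean(axis=0)) / X.std(axis=0)
np.allclose(manual, preprocessing.scale(X))  # True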

# 4.3.1.1. Scaling features to a range
# Scale features to a range with MinMaxScaler or MaxAbsScaler, e.g. for features with very small standard deviations, or to preserve the zero entries of sparse data
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
min_max_scaler = preprocessing.MinMaxScaler()  # MinMaxScaler(copy=True, feature_range=(0, 1))
X_train_minmax = min_max_scaler.fit_transform(X_train)  # fit_transform combines fit and transform in one step
X_train_minmax

>>>

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])
X_test = np.array([[ -3., -1.,  4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax  # array([[-1.5       ,  0.        ,  1.66666667]])
min_max_scaler.scale_  # = 1 / (X_train.max(axis=0) - X_train.min(axis=0)): array([ 0.5       ,  0.5       ,  0.33333333])
min_max_scaler.min_    # array([ 0.        ,  0.5       ,  0.33333333])
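
To see the arithmetic, here is a minimal sketch (my own check): transform() applies X * scale_ + min_, which maps each column's training minimum and maximum onto the default feature_range of (0, 1):

manual = X_train * min_max_scaler.scale_ + min_max_scaler.min_
np.allclose(manual, X_train_minmax)  # True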
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
max_abs_scaler = preprocessing.MaxAbsScaler()  # scales each column so its maximum absolute value becomes 1, i.e. divides by abs(X_train).max(axis=0)
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs 
>>>
array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])
X_test = np.array([[ -3., -1.,  4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs  # array([[-1.5, -1. ,  2. ]])
max_abs_scaler.scale_  # array([ 2.,  1.,  2.])
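
Equivalently (a sketch of my own, not from the docs), MaxAbsScaler just divides each column by its maximum absolute value:

manual = X_train / np.abs(X_train).max(axis=0)
np.allclose(manual, X_train_maxabs)  # True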

4.3.1.2. Scaling sparse data (not covered here)
4.3.1.3. Scaling data with outliers: use robust_scale or RobustScaler (see the sketch below)
4.3.1.4. Centering kernel matrices (not covered here)
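
The docs only name the robust tools; as a small illustration (my own example, with made-up data), RobustScaler centers on the median and scales by the interquartile range, so a single outlier barely affects how the inliers are scaled:

from sklearn.preprocessing import RobustScaler
X_out = np.array([[1.], [2.], [3.], [4.], [100.]])  # 100. is an outlier
robust = RobustScaler().fit(X_out)  # median = 3, IQR = 4 - 2 = 2
robust.transform(X_out)  # array([[ -1. ], [ -0.5], [  0. ], [  0.5], [ 48.5]])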
# 4.3.2. Normalization
# Common in text classification and clustering; also works on sparse matrices
# preprocessing.normalize(X, norm='l2', axis=1, copy=True) normalizes each sample (row, the default axis=1) to unit norm
X = np.array([[ 1., -1.,  2.],
              [ 2.,  0.,  0.],
              [ 0.,  1., -1.]])
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized   
>>>
array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])
normalizer = preprocessing.Normalizer().fit(X)  # fit is a no-op here; Normalizer is stateless
normalizer  # Normalizer(copy=True, norm='l2')
normalizer.transform(X)   
>>>
array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])
normalizer.transform([[-1.,  1., 0.]])  # array([[-0.70710678,  0.70710678,  0.        ]])
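
A quick check of my own: every row of the normalized output has unit Euclidean length.

np.linalg.norm(X_normalized, axis=1)  # array([ 1.,  1.,  1.])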
4.3.3. Binarization: threshold numerical features to boolean 0/1 values
X = np.array([[ 1., -1.,  2.],
              [ 2.,  0.,  0.],
              [ 0.,  1., -1.]])
binarizer = preprocessing.Binarizer().fit(X)  # fit is a no-op here as well
binarizer  # Binarizer(copy=True, threshold=0.0)
binarizer.transform(X)
>>>
array([[ 1.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])
binarizer = preprocessing.Binarizer(threshold=1.1)  # set a custom threshold; values strictly greater than 1.1 map to 1
binarizer.transform(X)
>>>
array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  0.]])
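
For intuition (my own sketch), Binarizer(threshold=t) is just an elementwise "strictly greater than t" comparison:

np.allclose((X > 1.1).astype(float), binarizer.transform(X))  # True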
4.3.4. Encoding categorical features
# Encode categorical features with OneHotEncoder
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
# One-hot encode the data: e.g. the third feature takes the values 0, 1, 2, 3, which become the last 4 output columns
>>>
OneHotEncoder(categorical_features='all', dtype=<class 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)
enc.transform([[0, 1, 3]]).toarray()  # array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])  # explicitly specify how many values each feature can take
enc.fit([[1, 2, 3], [0, 2, 0]])
enc.transform([[1, 0, 0]]).toarray()  # array([[ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.]])
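
To make the column layout concrete, here is a pure-NumPy sketch (my own illustration): with n_values=[2, 3, 4] the output is the concatenation of one indicator block per feature, of widths 2 + 3 + 4 = 9 columns.

def one_hot_row(row, n_values):
    # one indicator block per feature; np.eye(n)[v] is the one-hot vector for value v
    return np.hstack([np.eye(n)[v] for v, n in zip(row, n_values)])

one_hot_row([1, 0, 0], [2, 3, 4])  # array([ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.])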
Label encoding
# From "Python: Real World Machine Learning"
label_encoder=preprocessing.LabelEncoder()
input_classes=['audi','ford','audi','toyota', 'ford','bmw']
label_encoder.fit(input_classes)
print('\nClass mapping:')
for i,item in enumerate(label_encoder.classes_):
    print(item,'-->',i)
>>>
Class mapping:
audi --> 0
bmw --> 1
ford --> 2
toyota --> 3

labels=['toyota','ford','audi']
encoded_labels = label_encoder.transform(labels)
print("\nLabels =", labels)
print("Encoded labels =", list(encoded_labels))
>>>
Labels = ['toyota', 'ford', 'audi']
Encoded labels = [3, 2, 0]
encoded_labels = [2, 1, 0, 3, 1]
decoded_labels = label_encoder.inverse_transform(encoded_labels)  # decode back to the original labels
print("\nEncoded labels =", encoded_labels)
print("Decoded labels =", list(decoded_labels))
>>>
Encoded labels = [2, 1, 0, 3, 1]
Decoded labels = ['ford', 'bmw', 'audi', 'toyota', 'bmw']
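
One caveat worth noting (my own addition): transform() raises a ValueError for a label that was not seen during fit.

try:
    label_encoder.transform(['honda'])  # 'honda' was not in input_classes
except ValueError as e:
    print('Unseen label:', e)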
4.3.5. Imputation of missing values
# Fill in missing values
import numpy as np
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)  # fill missing values with the column mean
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
>>>
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))
>>>
[[ 4.          2.        ]
 [ 6.          3.66666667]
 [ 7.          6.        ]]
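
As a cross-check of my own, the fitted statistics are just the per-column means of the training data with the missing entries ignored, which is what np.nanmean computes:

np.nanmean(np.array([[1, 2], [np.nan, 3], [7, 6]]), axis=0)  # array([ 4.        ,  3.66666667])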
import scipy.sparse as sp
X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
imp = Imputer(missing_values=0, strategy='mean', axis=0)  # sparse matrices are handled by treating 0 as the missing-value marker
imp.fit(X)
X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
print(imp.transform(X_test))
>>>
[[ 4.          2.        ]
 [ 6.          3.66666675]
 [ 7.          6.        ]]
4.3.6. Generating polynomial features
# Generate polynomial and interaction features
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
poly = PolynomialFeatures(2)
poly.fit_transform(X)  # from (X_1, X_2) to (1, X_1, X_2, X_1^2, X_1*X_2, X_2^2)
>>>
array([[  1.,   0.,   1.,   0.,   0.,   1.],
       [  1.,   2.,   3.,   4.,   6.,   9.],
       [  1.,   4.,   5.,  16.,  20.,  25.]])
X = np.arange(9).reshape(3, 3)
poly = PolynomialFeatures(degree=3, interaction_only=True)
poly.fit_transform(X)  # from (X_1, X_2, X_3) to (1, X_1, X_2, X_3, X_1*X_2, X_1*X_3, X_2*X_3, X_1*X_2*X_3)
>>>
array([[   1.,    0.,    1.,    2.,    0.,    0.,    2.,    0.],
       [   1.,    3.,    4.,    5.,   12.,   15.,   20.,   60.],
       [   1.,    6.,    7.,    8.,   42.,   48.,   56.,  336.]])
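
Polynomial features are typically fed into a linear model to fit nonlinear data. A minimal sketch of my own (hypothetical data, not from the docs):

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
x = np.arange(10).reshape(-1, 1)
y = (x ** 2 - 3 * x + 1).ravel()  # y is quadratic in x
model = make_pipeline(PolynomialFeatures(2), LinearRegression())
model.fit(x, y)
model.predict(x[:3])  # recovers y[:3] almost exactly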
4.3.7. Custom transformers
# Build a transformer from an arbitrary function
import numpy as np
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)
>>>
array([[ 0.        ,  0.69314718],
       [ 1.09861229,  1.38629436]])
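
In newer scikit-learn versions (check against your installed version), FunctionTransformer also accepts an inverse_func, which makes the transform invertible; np.expm1 undoes np.log1p:

transformer = FunctionTransformer(np.log1p, inverse_func=np.expm1)
Xt = transformer.transform(X)
np.allclose(transformer.inverse_transform(Xt), X)  # True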
Using FunctionTransformer to select columns
# http://scikit-learn.org/stable/auto_examples/preprocessing/plot_function_transformer.html#sphx-glr-auto-examples-preprocessing-plot-function-transformer-py
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
def _generate_vector(shift=0.5, noise=15):
    return np.arange(1000) + (np.random.rand(1000) - shift) * noise
def generate_dataset():
    """
    This dataset is two lines with a slope ~ 1, where one has
    a y offset of ~100
    """
    return np.vstack((  # vstack stacks arrays vertically (row-wise)
        np.vstack((
            _generate_vector(),
            _generate_vector() + 100,
        )).T,
        np.vstack((
            _generate_vector(),
            _generate_vector(),
        )).T,
    )), np.hstack((np.zeros(1000), np.ones(1000)))  # hstack concatenates horizontally to build the labels

def all_but_first_column(X):
    return X[:, 1:]

def drop_first_component(X, y):
    """
    Create a pipeline with PCA and the column selector and use it to
    transform the dataset.
    """
    pipeline = make_pipeline(
        PCA(), FunctionTransformer(all_but_first_column),
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline.fit(X_train, y_train)
    return pipeline.transform(X_test), y_test
if __name__ == '__main__':
    X, y = generate_dataset()
    plt.figure(figsize=(6,4))
    plt.scatter(X[:, 0], X[:, 1], c=y, s=30)
    plt.show()
    X_transformed, y_transformed = drop_first_component(*generate_dataset())
    plt.scatter(
        X_transformed[:, 0],
        np.zeros(len(X_transformed)),
        c=y_transformed,
        s=50,
    )
    plt.show()
