# 4.3.1. Standardization: zero mean, controlled variance
#http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
from sklearn import preprocessing
import numpy as np
X = np.array([[ 1., -1.,  2.],
              [ 2.,  0.,  0.],
              [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X)  # zero mean, unit variance per column (not necessarily normally distributed)
X_scaled
>>>
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])
X_scaled.mean(axis=0) #array([ 0., 0., 0.])
X_scaled.std(axis=0) #array([ 1., 1., 1.])
scaler = preprocessing.StandardScaler().fit(X)  # fit the scaler to X to learn each column's mean and standard deviation
scaler#StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.mean_ #array([ 1. , 0. , 0.33333333])
scaler.scale_ #array([ 0.81649658, 0.81649658, 1.24721913])
scaler.transform(X)  # standardize X using the learned mean and scale
>>>
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])
scaler.transform([[-1., 1., 0.]])  # apply the same learned mean/scale to new data
>>>
array([[-2.44948974,  1.22474487, -0.26726124]])
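Quick sanity check (a sketch of mine, not in the original notes): transform just applies the learned parameters elementwise.
(X - scaler.mean_) / scaler.scale_  # reproduces scaler.transform(X)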
# 4.3.1.1. Scaling features to a range
# Scale features to a range with MinMaxScaler or MaxAbsScaler, e.g. for features with very small standard deviations, or to preserve zero entries in sparse data
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
min_max_scaler = preprocessing.MinMaxScaler()#MinMaxScaler(copy=True, feature_range=(0, 1))
X_train_minmax = min_max_scaler.fit_transform(X_train)  # fit and transform in a single step
X_train_minmax
>>>
array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])
X_test = np.array([[ -3., -1., 4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax  # array([[-1.5       ,  0.        ,  1.66666667]])
min_max_scaler.scale_  # = 1 / (X_train.max(axis=0) - X_train.min(axis=0)) -> array([ 0.5       ,  0.5       ,  0.33333333])
min_max_scaler.min_    # array([ 0.        ,  0.5       ,  0.33333333])
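The learned attributes reproduce the transform directly (a minimal check of mine): MinMaxScaler computes X * scale_ + min_ per column.
X_train * min_max_scaler.scale_ + min_max_scaler.min_  # equals X_train_minmax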
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
max_abs_scaler = preprocessing.MaxAbsScaler()  # scales each column by 1 / max(abs(X_train), axis=0), so the largest absolute value becomes 1
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs
>>>
array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])
X_test = np.array([[ -3., -1., 4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs  # array([[-1.5, -1. ,  2. ]])
max_abs_scaler.scale_  # array([ 2.,  1.,  2.])
# 4.3.1.2. Scaling sparse data (not covered here)
# 4.3.1.3. Scaling data with outliers: use robust_scale or RobustScaler (see the sketch below)
# 4.3.1.4. Centering kernel matrices (not covered here)
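Minimal RobustScaler sketch (my addition, not from the original notes): it centers with the median and scales by the interquartile range, so a single extreme value barely shifts the result.
from sklearn.preprocessing import RobustScaler
X_outlier = np.array([[   1., -2.],
                      [   2.,  0.],
                      [   0.,  1.],
                      [ 100.,  1.]])  # one extreme value in the first column
robust_scaler = RobustScaler()  # median / IQR instead of mean / std
X_robust = robust_scaler.fit_transform(X_outlier)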
# 4.3.2. Normalization
# Scale individual samples to unit norm; common in text classification and clustering, and works on sparse matrices
# preprocessing.normalize(X, norm='l2', axis=1, copy=True) normalizes each sample (each row, axis=1) by default
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
X_normalized = preprocessing.normalize(X_train, norm='l2')
X_normalized
>>>
array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])
normalizer = preprocessing.Normalizer().fit(X_train)  # fit is a no-op here; Normalizer is stateless
normalizer  # Normalizer(copy=True, norm='l2')
normalizer.transform(X_train)
>>>
array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])
normalizer.transform([[-1., 1., 0.]])  # array([[-0.70710678,  0.70710678,  0.        ]])
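What norm='l2' computes, checked by hand (a sketch of mine): each row is divided by its Euclidean norm, giving unit-length rows.
X_train / np.linalg.norm(X_train, axis=1, keepdims=True)  # reproduces X_normalized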
# 4.3.3. Binarization
# Threshold features to get boolean (0/1) values
binarizer = preprocessing.Binarizer().fit(X_train)  # fit is a no-op; Binarizer is stateless
binarizer  # Binarizer(copy=True, threshold=0.0)
binarizer.transform(X_train)
>>>
array([[ 1.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])
binarizer = preprocessing.Binarizer(threshold=1.1)  # custom threshold
binarizer.transform(X_train)
>>>
array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  0.]])
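The same result with plain NumPy (my sketch), which is all Binarizer does under the hood:
(X_train > 1.1).astype(np.float64)  # 1.0 where the value exceeds the threshold, else 0.0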
# 4.3.4. Encoding categorical features
# One-hot encode categorical integer features
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
# One-hot encoding: e.g. the third column takes values {0, 1, 2, 3}, which become the last 4 output columns
>>>
OneHotEncoder(categorical_features='all', dtype=<class 'float'>, handle_unknown='error', n_values='auto', sparse=True)
enc.transform([[0, 1, 3]]).toarray()  # array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])  # fix the number of values per column explicitly
enc.fit([[1, 2, 3], [0, 2, 0]])
enc.transform([[1, 0, 0]]).toarray()  # array([[ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.]])
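Note (my addition): newer scikit-learn releases replaced n_values/categorical_features with a categories parameter; the equivalent, assuming scikit-learn >= 0.20, is:
enc = preprocessing.OneHotEncoder(categories=[[0, 1], [0, 1, 2], [0, 1, 2, 3]])  # allowed values per column, replacing n_values
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
enc.transform([[0, 1, 3]]).toarray()  # same 9-column one-hot row as above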
# Label encoding (example from "Python Real World Machine Learning")
label_encoder = preprocessing.LabelEncoder()
input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']
label_encoder.fit(input_classes)
print('\nClass mapping:')
for i, item in enumerate(label_encoder.classes_):
    print(item, '-->', i)
>>>
Class mapping:
audi --> 0
bmw --> 1
ford --> 2
toyota --> 3
labels = ['toyota', 'ford', 'audi']
encoded_labels = label_encoder.transform(labels)
print("\nLabels =", labels)
print("Encoded labels =", list(encoded_labels))
>>>
Labels = ['toyota', 'ford', 'audi']
Encoded labels = [3, 2, 0]
encoded_labels = [2, 1, 0, 3, 1]
decoded_labels = label_encoder.inverse_transform(encoded_labels)  # map the integers back to the original labels
print("\nEncoded labels =", encoded_labels)
print("Decoded labels =", list(decoded_labels))
>>>
Encoded labels = [2, 1, 0, 3, 1]
Decoded labels = ['ford', 'bmw', 'audi', 'toyota', 'bmw']
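LabelEncoder.transform raises ValueError on labels unseen during fit; a small guard (hypothetical helper of mine, not part of sklearn):
def safe_encode(le, labels, unknown=-1):
    # map known labels through the encoder, unknown ones to a sentinel value (hypothetical helper)
    known = set(le.classes_)
    return [int(le.transform([l])[0]) if l in known else unknown for l in labels]

safe_encode(label_encoder, ['bmw', 'tesla'])  # -> [1, -1]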
# 4.3.5. Imputation of missing values
# Fill in missing values
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)  # fill missing values with the column mean
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))
>>>
[[ 4.          2.        ]
 [ 6.          3.66666667]
 [ 7.          6.        ]]
import scipy.sparse as sp
X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
imp = Imputer(missing_values=0, strategy='mean', axis=0)  # treat explicit zeros as missing values in a sparse matrix
imp.fit(X)
X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
print(imp.transform(X_test))
>>>
[[ 4.          2.        ]
 [ 6.          3.66666675]
 [ 7.          6.        ]]
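Imputer was removed in later scikit-learn releases; the modern equivalent (a sketch, assuming scikit-learn >= 0.20) lives in sklearn.impute:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')  # same column-wise mean fill
imp.fit([[1, 2], [np.nan, 3], [7, 6]])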
# 4.3.6. Generating polynomial features
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
poly = PolynomialFeatures(2)
poly.fit_transform(X)  # maps (X_1, X_2) to (1, X_1, X_2, X_1^2, X_1*X_2, X_2^2)
>>>
array([[  1.,   0.,   1.,   0.,   0.,   1.],
       [  1.,   2.,   3.,   4.,   6.,   9.],
       [  1.,   4.,   5.,  16.,  20.,  25.]])
X = np.arange(9).reshape(3, 3)
poly = PolynomialFeatures(degree=3, interaction_only=True)
poly.fit_transform(X)  # maps (X_1, X_2, X_3) to (1, X_1, X_2, X_3, X_1*X_2, X_1*X_3, X_2*X_3, X_1*X_2*X_3)
>>>
array([[   1.,    0.,    1.,    2.,    0.,    0.,    2.,    0.],
       [   1.,    3.,    4.,    5.,   12.,   15.,   20.,   60.],
       [   1.,    6.,    7.,    8.,   42.,   48.,   56.,  336.]])
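Hand check of the degree-2 expansion (my addition): for the second input row x1 = 2, x2 = 3, the columns (1, x1, x2, x1^2, x1*x2, x2^2) give:
x1, x2 = 2., 3.
[1., x1, x2, x1**2, x1*x2, x2**2]  # -> [1.0, 2.0, 3.0, 4.0, 6.0, 9.0], matching the second output row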
# 4.3.7. Custom transformers
# Wrap a custom function as a transformer
import numpy as np
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)
>>>
array([[ 0.        ,  0.69314718],
       [ 1.09861229,  1.38629436]])
# http://scikit-learn.org/stable/auto_examples/preprocessing/plot_function_transformer.html#sphx-glr-auto-examples-preprocessing-plot-function-transformer-py
# Using FunctionTransformer to select columns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in newer releases
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def _generate_vector(shift=0.5, noise=15):
    return np.arange(1000) + (np.random.rand(1000) - shift) * noise


def generate_dataset():
    """
    This dataset is two lines with a slope ~ 1, where one has
    a y offset of ~100
    """
    return np.vstack((  # vstack stacks arrays row-wise (vertically)
        np.vstack((
            _generate_vector(),
            _generate_vector() + 100,
        )).T,
        np.vstack((
            _generate_vector(),
            _generate_vector(),
        )).T,
    )), np.hstack((np.zeros(1000), np.ones(1000)))  # hstack concatenates the label vectors


def all_but_first_column(X):
    return X[:, 1:]


def drop_first_component(X, y):
    """
    Create a pipeline with PCA and the column selector and use it to
    transform the dataset.
    """
    pipeline = make_pipeline(
        PCA(),
        FunctionTransformer(all_but_first_column),
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline.fit(X_train, y_train)
    return pipeline.transform(X_test), y_test


if __name__ == '__main__':
    X, y = generate_dataset()
    plt.figure(figsize=(6, 4))
    plt.scatter(X[:, 0], X[:, 1], c=y, s=30)
    plt.show()
    X_transformed, y_transformed = drop_first_component(*generate_dataset())
    plt.scatter(
        X_transformed[:, 0],
        np.zeros(len(X_transformed)),
        c=y_transformed,
        s=50,
    )
    plt.show()
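FunctionTransformer can also carry an inverse (a minimal sketch of mine): pairing np.log1p with np.expm1 makes the transform round-trip.
X = np.array([[0, 1], [2, 3]])
transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)
X_t = transformer.transform(X)
transformer.inverse_transform(X_t)  # recovers X up to floating-point error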