# 4.3.1. Standardization: zero mean, controlled variance
#http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
from sklearn import preprocessing
import numpy as np
X = np.array([[ 1., -1.,  2.],
              [ 2.,  0.,  0.],
              [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X)  # zero mean, unit variance per column (not necessarily normally distributed)
X_scaled
>>>
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])
X_scaled.mean(axis=0) #array([ 0., 0., 0.])
X_scaled.std(axis=0) #array([ 1., 1., 1.])
scaler = preprocessing.StandardScaler().fit(X)  # fit the scaler to X to learn each column's mean and standard deviation
scaler#StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.mean_ #array([ 1. , 0. , 0.33333333])
scaler.scale_ #array([ 0.81649658, 0.81649658, 1.24721913])
scaler.transform(X)  # standardize X using the learned mean and scale
>>>
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])
scaler.transform([[-1., 1., 0.]])  # apply the same learned mean/scale to new data
>>>
array([[-2.44948974,  1.22474487, -0.26726124]])
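Quick sanity check (a sketch of mine, not in the original notes): transform just applies the learned parameters elementwise.
(X - scaler.mean_) / scaler.scale_  # reproduces scaler.transform(X)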
# 4.3.1.1. Scaling features to a range
# Scale features to a range with MinMaxScaler or MaxAbsScaler, e.g. for features with very small standard deviations, or to preserve zero entries in sparse data
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
min_max_scaler = preprocessing.MinMaxScaler()#MinMaxScaler(copy=True, feature_range=(0, 1))
X_train_minmax = min_max_scaler.fit_transform(X_train)  # fit and transform in a single step
X_train_minmax
>>>
array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])
X_test = np.array([[ -3., -1., 4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax  # array([[-1.5       ,  0.        ,  1.66666667]])
min_max_scaler.scale_  # = 1 / (X_train.max(axis=0) - X_train.min(axis=0)) -> array([ 0.5       ,  0.5       ,  0.33333333])
min_max_scaler.min_    # array([ 0.        ,  0.5       ,  0.33333333])
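The learned attributes reproduce the transform directly (a minimal check of mine): MinMaxScaler computes X * scale_ + min_ per column.
X_train * min_max_scaler.scale_ + min_max_scaler.min_  # equals X_train_minmax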
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
max_abs_scaler = preprocessing.MaxAbsScaler()  # scales each column by 1 / max(abs(X_train), axis=0), so the largest absolute value becomes 1
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs
>>>
array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])
X_test = np.array([[ -3., -1., 4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs  # array([[-1.5, -1. ,  2. ]])
max_abs_scaler.scale_  # array([ 2.,  1.,  2.])
# 4.3.1.2. Scaling sparse data (not covered here)
# 4.3.1.3. Scaling data with outliers: use robust_scale or RobustScaler (see the sketch below)
# 4.3.1.4. Centering kernel matrices (not covered here)
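Minimal RobustScaler sketch (my addition, not from the original notes): it centers with the median and scales by the interquartile range, so a single extreme value barely shifts the result.
from sklearn.preprocessing import RobustScaler
X_outlier = np.array([[   1., -2.],
                      [   2.,  0.],
                      [   0.,  1.],
                      [ 100.,  1.]])  # one extreme value in the first column
robust_scaler = RobustScaler()  # median / IQR instead of mean / std
X_robust = robust_scaler.fit_transform(X_outlier)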
# 4.3.2. Normalization
# Scale individual samples to unit norm; common in text classification and clustering, and works on sparse matrices
# preprocessing.normalize(X, norm='l2', axis=1, copy=True) normalizes each sample (each row, axis=1) by default
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
X_normalized = preprocessing.normalize(X_train, norm='l2')
X_normalized
>>>
array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])
normalizer = preprocessing.Normalizer().fit(X_train)  # fit is a no-op here; Normalizer is stateless
normalizer  # Normalizer(copy=True, norm='l2')
normalizer.transform(X_train)
>>>
array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])
normalizer.transform([[-1., 1., 0.]])  # array([[-0.70710678,  0.70710678,  0.        ]])
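What norm='l2' computes, checked by hand (a sketch of mine): each row is divided by its Euclidean norm, giving unit-length rows.
X_train / np.linalg.norm(X_train, axis=1, keepdims=True)  # reproduces X_normalized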
# 4.3.3. Binarization
# Threshold features to get boolean (0/1) values
binarizer = preprocessing.Binarizer().fit(X_train)  # fit is a no-op; Binarizer is stateless
binarizer  # Binarizer(copy=True, threshold=0.0)
binarizer.transform(X_train)
>>>
array([[ 1.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.]])
binarizer = preprocessing.Binarizer(threshold=1.1)  # custom threshold
binarizer.transform(X_train)
>>>
array([[ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  0.]])
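The same result with plain NumPy (my sketch), which is all Binarizer does under the hood:
(X_train > 1.1).astype(np.float64)  # 1.0 where the value exceeds the threshold, else 0.0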
# 4.3.4. Encoding categorical features
# One-hot encode categorical integer features
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
# One-hot encoding: e.g. the third column takes values {0, 1, 2, 3}, which become the last 4 output columns
>>>
OneHotEncoder(categorical_features='all', dtype=<class 'float'>, handle_unknown='error', n_values='auto', sparse=True)
enc.transform([[0, 1, 3]]).toarray()  # array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4])  # fix the number of values per column explicitly
enc.fit([[1, 2, 3], [0, 2, 0]])
enc.transform([[1, 0, 0]]).toarray()  # array([[ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.]])
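Note (my addition): newer scikit-learn releases replaced n_values/categorical_features with a categories parameter; the equivalent, assuming scikit-learn >= 0.20, is:
enc = preprocessing.OneHotEncoder(categories=[[0, 1], [0, 1, 2], [0, 1, 2, 3]])  # allowed values per column, replacing n_values
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
enc.transform([[0, 1, 3]]).toarray()  # same 9-column one-hot row as above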
# Label encoding (example from "Python Real World Machine Learning")
label_encoder = preprocessing.LabelEncoder()
input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']
label_encoder.fit(input_classes)
print('\nClass mapping:')
for i, item in enumerate(label_encoder.classes_):
    print(item, '-->', i)
>>>
Class mapping:
audi --> 0
bmw --> 1
ford --> 2
toyota --> 3
labels = ['toyota', 'ford', 'audi']
encoded_labels = label_encoder.transform(labels)
print("\nLabels =", labels)
print("Encoded labels =", list(encoded_labels))
>>>
Labels = ['toyota', 'ford', 'audi']
Encoded labels = [3, 2, 0]
encoded_labels = [2, 1, 0, 3, 1]
decoded_labels = label_encoder.inverse_transform(encoded_labels)  # map the integers back to the original labels
print("\nEncoded labels =", encoded_labels)
print("Decoded labels =", list(decoded_labels))
>>>
Encoded labels = [2, 1, 0, 3, 1]
Decoded labels = ['ford', 'bmw', 'audi', 'toyota', 'bmw']
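LabelEncoder.transform raises ValueError on labels unseen during fit; a small guard (hypothetical helper of mine, not part of sklearn):
def safe_encode(le, labels, unknown=-1):
    # map known labels through the encoder, unknown ones to a sentinel value (hypothetical helper)
    known = set(le.classes_)
    return [int(le.transform([l])[0]) if l in known else unknown for l in labels]

safe_encode(label_encoder, ['bmw', 'tesla'])  # -> [1, -1]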
# 4.3.5. Imputation of missing values
# Fill in missing values
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)  # fill missing values with the column mean
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))
>>>
[[ 4.          2.        ]
 [ 6.          3.66666667]
 [ 7.          6.        ]]
import scipy.sparse as sp
X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
imp = Imputer(missing_values=0, strategy='mean', axis=0)  # treat explicit zeros as missing values in a sparse matrix
imp.fit(X)
X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
print(imp.transform(X_test))
>>>
[[ 4.          2.        ]
 [ 6.          3.66666675]
 [ 7.          6.        ]]
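Imputer was removed in later scikit-learn releases; the modern equivalent (a sketch, assuming scikit-learn >= 0.20) lives in sklearn.impute:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')  # same column-wise mean fill
imp.fit([[1, 2], [np.nan, 3], [7, 6]])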
# 4.3.6. Generating polynomial features
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
poly = PolynomialFeatures(2)
poly.fit_transform(X)  # maps (X_1, X_2) to (1, X_1, X_2, X_1^2, X_1*X_2, X_2^2)
>>>
array([[  1.,   0.,   1.,   0.,   0.,   1.],
       [  1.,   2.,   3.,   4.,   6.,   9.],
       [  1.,   4.,   5.,  16.,  20.,  25.]])
X = np.arange(9).reshape(3, 3)
poly = PolynomialFeatures(degree=3, interaction_only=True)
poly.fit_transform(X)  # maps (X_1, X_2, X_3) to (1, X_1, X_2, X_3, X_1*X_2, X_1*X_3, X_2*X_3, X_1*X_2*X_3)
>>>
array([[   1.,    0.,    1.,    2.,    0.,    0.,    2.,    0.],
       [   1.,    3.,    4.,    5.,   12.,   15.,   20.,   60.],
       [   1.,    6.,    7.,    8.,   42.,   48.,   56.,  336.]])
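Hand check of the degree-2 expansion (my addition): for the second input row x1 = 2, x2 = 3, the columns (1, x1, x2, x1^2, x1*x2, x2^2) give:
x1, x2 = 2., 3.
[1., x1, x2, x1**2, x1*x2, x2**2]  # -> [1.0, 2.0, 3.0, 4.0, 6.0, 9.0], matching the second output row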
# 4.3.7. Custom transformers
# Wrap a custom function as a transformer
import numpy as np
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)
>>>
array([[ 0.        ,  0.69314718],
       [ 1.09861229,  1.38629436]])
# http://scikit-learn.org/stable/auto_examples/preprocessing/plot_function_transformer.html#sphx-glr-auto-examples-preprocessing-plot-function-transformer-py
# Using FunctionTransformer to select columns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in newer releases
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def _generate_vector(shift=0.5, noise=15):
    return np.arange(1000) + (np.random.rand(1000) - shift) * noise


def generate_dataset():
    """
    This dataset is two lines with a slope ~ 1, where one has
    a y offset of ~100
    """
    return np.vstack((  # vstack stacks arrays row-wise (vertically)
        np.vstack((
            _generate_vector(),
            _generate_vector() + 100,
        )).T,
        np.vstack((
            _generate_vector(),
            _generate_vector(),
        )).T,
    )), np.hstack((np.zeros(1000), np.ones(1000)))  # hstack concatenates the label vectors


def all_but_first_column(X):
    return X[:, 1:]


def drop_first_component(X, y):
    """
    Create a pipeline with PCA and the column selector and use it to
    transform the dataset.
    """
    pipeline = make_pipeline(
        PCA(),
        FunctionTransformer(all_but_first_column),
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline.fit(X_train, y_train)
    return pipeline.transform(X_test), y_test


if __name__ == '__main__':
    X, y = generate_dataset()
    plt.figure(figsize=(6, 4))
    plt.scatter(X[:, 0], X[:, 1], c=y, s=30)
    plt.show()
    X_transformed, y_transformed = drop_first_component(*generate_dataset())
    plt.scatter(
        X_transformed[:, 0],
        np.zeros(len(X_transformed)),
        c=y_transformed,
        s=50,
    )
    plt.show()
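FunctionTransformer can also carry an inverse (a minimal sketch of mine): pairing np.log1p with np.expm1 makes the transform round-trip.
X = np.array([[0, 1], [2, 3]])
transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)
X_t = transformer.transform(X)
transformer.inverse_transform(X_t)  # recovers X up to floating-point error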