import numpy as np
from sklearn import preprocessing
# Demo of five scikit-learn preprocessing techniques applied to a small
# sample matrix: 3 samples (rows) x 4 features (columns).
data = np.array([
    [3, -1.5, 2, -5.4],
    [0, 4, -0.3, 2.1],
    [1, 3.3, -1.9, -4.3],
])

# 1. Mean removal (standardization): centre every feature column on zero and
#    scale it to unit standard deviation, then report per-column statistics.
data_standardized = preprocessing.scale(data)
column_means = data_standardized.mean(axis=0)
column_stds = data_standardized.std(axis=0)
print("\nMean =", column_means)
print("Std deviation =", column_stds)

# 2. Range scaling: map every feature column linearly onto [0, 1].
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaler.fit(data)
data_scaled = data_scaler.transform(data)
print("\nMin max scaled data =", data_scaled)

# 3. Normalization: rescale each sample (row) so that the absolute values of
#    its entries sum to 1 (L1 norm).
data_normalized = preprocessing.normalize(data, norm='l1')
print("\nL1 normalized data =", data_normalized)

# 4. Binarization: values strictly above the threshold become 1, all other
#    values become 0.
binarizer = preprocessing.Binarizer(threshold=1.4)
data_binarized = binarizer.transform(data)
print("\nBinarized data =", data_binarized)

# 5. One-hot encoding: every distinct value seen in a column during fit gets
#    its own 0/1 indicator slot. The columns below hold 3, 2, 4 and 2 distinct
#    values respectively, so an encoded sample has 11 entries.
encoder_training_rows = [
    [0, 2, 1, 12],
    [1, 3, 5, 3],
    [2, 3, 2, 12],
    [1, 2, 4, 3],
]
encoder = preprocessing.OneHotEncoder()
encoder.fit(encoder_training_rows)
encoded_sparse = encoder.transform([[2, 3, 5, 3]])
# transform() yields a sparse matrix; densify it for printing.
encoded_vector = encoded_sparse.toarray()
print("\nEncoded vector =", encoded_vector)
# Run results:
# Mean = [ 5.55111512e-17 -1.11022302e-16 -7.40148683e-17 -7.40148683e-17]
# Std deviation = [1. 1. 1. 1.]
# Min max scaled data = [[1. 0. 1. 0. ]
# [0. 1. 0.41025641 1. ]
# [0.33333333 0.87272727 0. 0.14666667]]
# L1 normalized data = [[ 0.25210084 -0.12605042 0.16806723 -0.45378151]
# [ 0. 0.625 -0.046875 0.328125 ]
# [ 0.0952381 0.31428571 -0.18095238 -0.40952381]]
# Binarized data = [[1. 0. 1. 0.]
# [0. 1. 0. 1.]
# [0. 1. 0. 0.]]
# Encoded vector = [[0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]