#!/usr/bin/env python2
#-*- coding: utf-8 -*-
import numpy as np
from sklearn import preprocessing
# Sample feature matrix used by the demos below: 3 rows (samples) by
# 4 columns (features).
data = np.array([
    [3, -1.5, 2, -5.4],
    [0, 4, -0.3, 2.1],
    [1, 3.3, -1.9, -4.3],
])

# Mean removal (standardization): rescale every column to zero mean and
# unit standard deviation.
#     x* = (x - mean(axis=0)) / std(axis=0)
data_standardized = preprocessing.scale(data)

# Each print call receives one pre-formatted string, so the statement is
# valid and emits the same text on both Python 2 and Python 3 (the bare
# `print a, b` statement form is a SyntaxError on Python 3).
print("zero mean norm: \n%s" % (data_standardized,))
print("Mean: %s" % (data_standardized.mean(axis=0),))
print("standard: %s" % (data_standardized.std(axis=0),))
# Scaling (min-max normalization): linearly map every column onto
# feature_range=(min, max); the (0, 1) passed below is also the default.
#     X_std    = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#     X_scaled = X_std * (max - min) + min
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit_transform(data)
# Single-argument print call: same output on Python 2 and Python 3.
print("Max min scaled data: \n%s" % (data_scaled,))
# Normalization.
# NOTE: sklearn's normalize() defaults to axis=1, i.e. it normalizes the
# elements of each ROW. To normalize each column instead, pass axis=0
# explicitly.
# The two calls below therefore differ in direction as well as in norm:
# the L1 example works row-wise (default axis), the L2 example column-wise.
data_norm_l1 = preprocessing.normalize(data, norm='l1')
# Single-argument print calls: same output on Python 2 and Python 3.
print("L1 normed data: \n%s" % (data_norm_l1,))
data_norm_l2 = preprocessing.normalize(data, norm='l2', axis=0)
print("L2 normed data: \n%s" % (data_norm_l2,))
# Binarization: turn each value into a 0/1 flag by comparing it with the
# given threshold (1.4 here).
data_binarized = preprocessing.Binarizer(threshold=1.4).transform(data)
# Single-argument print call: same output on Python 2 and Python 3.
print("Binarized data: \n%s" % (data_binarized,))
# One-hot encoding.
encoder = preprocessing.OneHotEncoder()
encoder.fit([
    [0, 2, 1, 12],
    [1, 3, 5, 3],
    [2, 3, 2, 12],
    [1, 2, 4, 3],
])
# The encoder collects the distinct values seen in each column:
#   column 1: [0, 1, 2]      -> 3 values
#   column 2: [2, 3]         -> 2 values
#   column 3: [1, 2, 4, 5]   -> 4 values
#   column 4: [3, 12]        -> 2 values
# An input vector is therefore encoded as 11 = 3 + 2 + 4 + 2 slots, one
# slot per (column, distinct value) pair:
#   slots:   [0, 0, 0,| 0, 0,| 0, 0, 0, 0,| 0, 0]
#   values:  [0, 1, 2,| 2, 3,| 1, 2, 4, 5,| 3, 12]
# For the input vector [2, 3, 5, 3] the expected encoding is:
#            [0, 0, 1,| 0, 1,| 0, 0, 0, 1,| 1, 0]
# transform() returns a sparse matrix; toarray() makes it dense for display.
encoder_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
# Single-argument print call: same output on Python 2 and Python 3.
print("Encoded vector: \n%s" % (encoder_vector,))
# Label encoder: maps class labels to integer ids and back.
label_encoder = preprocessing.LabelEncoder()
input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']
label_encoder.fit(input_classes)

# All print calls below take one pre-formatted string so they are valid on
# both Python 2 and Python 3. The double space after the colon reproduces
# what the Python 2 statement `print 'X: ', value` emitted (literal trailing
# space plus the separator space).
print('Class mapping:')
# The position of a class in classes_ is the integer id assigned to it.
for i, item in enumerate(label_encoder.classes_):
    print('%s --> %s' % (item, i))

labels = ['toyota', 'ford', 'audi']
print('Labels:  %s' % (labels,))

# Encode: label strings -> integer ids.
encoded_labels = label_encoder.transform(labels)
print('Encoded labels:  %s' % (encoded_labels,))

# Decode: integer ids -> original label strings.
decoded_labels = label_encoder.inverse_transform(encoded_labels)
print('Decoded labels:  %s' % (decoded_labels,))