# 数据预处理系列：（三）把数据调整为标准正态分布

## 把数据调整为标准正态分布

In [1]:
from sklearn import preprocessing
import numpy as np


### How to do it...

In [2]:
from sklearn import datasets

# Load the Boston housing dataset and bind it to `boston` before reading its
# `.data` (feature matrix) and `.target` attributes; without this binding the
# next line raises NameError on a fresh kernel.
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires scikit-learn < 1.2 to run as written.
boston = datasets.load_boston()
X, y = boston.data, boston.target

In [5]:
X[:, :3].mean(axis=0) # column-wise mean of the first three features (before scaling)

Out[5]:
array([  3.59376071,  11.36363636,  11.13677866])
In [6]:
X[:, :3].std(axis=0) # column-wise standard deviation of the first three features (before scaling)

Out[6]:
array([  8.58828355,  23.29939569,   6.85357058])



In [7]:
# Standardize the first three columns; the checks in the following cells show
# that each column ends up with mean ~0 and standard deviation 1.
X_2 = preprocessing.scale(X[:, :3])

In [8]:
# Column means after scaling: zero up to floating-point rounding error.
X_2.mean(axis=0)

Out[8]:
array([  6.34099712e-17,  -6.34319123e-16,  -2.68291099e-15])
In [9]:
# Column standard deviations after scaling: each one is 1.
X_2.std(axis=0)

Out[9]:
array([ 1.,  1.,  1.])



### How it works...

$x' = \frac{x - \bar{x}}{\sigma}$，其中 $\bar{x}$ 为该特征的均值，$\sigma$ 为该特征的标准差，$x'$ 为标准化后的取值。

In [10]:
# The same standardization through the estimator API: fit() is called on the
# data first, then transform() produces the scaled array.
my_scaler = preprocessing.StandardScaler()
my_scaler.fit(X[:, :3])
# The resulting column means match those obtained with preprocessing.scale.
my_scaler.transform(X[:, :3]).mean(axis=0)

Out[10]:
array([  6.34099712e-17,  -6.34319123e-16,  -2.68291099e-15])

In [14]:
# MinMaxScaler with default settings: after the transform every column's
# maximum is 1 (presumably the minimum maps to 0; only the max is checked here).
my_minmax_scaler = preprocessing.MinMaxScaler()
my_minmax_scaler.fit(X[:, :3])
my_minmax_scaler.transform(X[:, :3]).max(axis=0)

Out[14]:
array([ 1.,  1.,  1.])

In [19]:
# feature_range supplies custom bounds for the rescaled values; with
# (-3.14, 3.14) each column's maximum becomes the upper bound, 3.14.
my_odd_scaler = preprocessing.MinMaxScaler(feature_range=(-3.14, 3.14))
my_odd_scaler.fit(X[:, :3])
my_odd_scaler.transform(X[:, :3]).max(axis=0)

Out[19]:
array([ 3.14,  3.14,  3.14])

In [27]:
# NOTE(review): normalize() presumably rescales each row (sample) to unit norm
# instead of working column by column like the scalers above; confirm the
# default norm and axis in the scikit-learn documentation.
normalized_X = preprocessing.normalize(X[:, :3])


### There's more...

#### 创建幂等标准化（idempotent scaler）对象

In [37]:
# With both centering (with_mean) and scaling (with_std) switched off, the
# scaler is expected to hand its input back unchanged.
my_useless_scaler = preprocessing.StandardScaler(with_mean=False, with_std=False)
transformed_sd = my_useless_scaler.fit_transform(X[:, :3]).std(axis=0)
original_sd = X[:, :3].std(axis=0)
# Compare the per-column standard deviations before and after the transform.
np.array_equal(transformed_sd, original_sd)

Out[37]:
True



#### 处理稀疏数据填补

In [45]:
# NOTE(review): a bare `import scipy` may not expose `scipy.sparse` on older
# SciPy releases unless another package has already imported it;
# `import scipy.sparse` is the explicit form. Confirm for the target version.
import scipy
matrix = scipy.sparse.eye(1000)
# This call is expected to fail: with the default with_mean=True, scale()
# raises ValueError for sparse input (see the traceback that follows).
preprocessing.scale(matrix)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-45-466df6030461> in <module>()
1 import scipy
2 matrix = scipy.sparse.eye(1000)
----> 3 preprocessing.scale(matrix)

d:\programfiles\Miniconda3\lib\site-packages\sklearn\preprocessing\data.py in scale(X, axis, with_mean, with_std, copy)
120         if with_mean:
121             raise ValueError(
--> 122                 "Cannot center sparse matrices: pass with_mean=False instead"
123                 " See docstring for motivation and alternatives.")
124         if axis != 0:

ValueError: Cannot center sparse matrices: pass with_mean=False instead See docstring for motivation and alternatives.

In [58]:
# Skipping the centering step (with_mean=False) lets scale() accept the sparse
# matrix; the result is still a sparse matrix.
preprocessing.scale(matrix, with_mean=False)

Out[58]:
<1000x1000 sparse matrix of type '<class 'numpy.float64'>'
with 1000 stored elements in Compressed Sparse Row format>

• 点赞 2
• 评论 9
• 分享
x

海报分享

扫一扫，分享海报

• 收藏 10
• 手机看

分享到微信朋友圈

x

扫一扫，手机阅读

• 打赏

打赏

风雪夜归子

你的鼓励将是我创作的最大动力

C币 余额
2C币 4C币 6C币 10C币 20C币 50C币
• 一键三连

点赞Mark关注该博主, 随时了解TA的最新博文
11-22 1万+

12-18 2万+
07-19 4144
02-25 854
06-23 6万+
09-11
04-24
12-03
08-17 4万+