练习:最小值/最大值重缩放器编码
import numpy as np
def featureScaling(arr):
    """Rescale values to the [0, 1] range with min/max scaling.

    Each element x is mapped to (x - min) / (max - min), where min and max
    are taken over the whole input.

    Args:
        arr: A one-dimensional sequence (or array) of numbers.

    Returns:
        A list of floats, one per input element, in input order. An empty
        input yields an empty list. When every element is equal
        (max == min) the formula is undefined, so every element maps to 0.0.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return []

    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        # Constant feature: avoid dividing by zero.
        return [0.0] * values.size

    return ((values - lowest) / spread).tolist()
# Quick check of the feature scaler -- the line below is the input data.
data = [115, 140, 175]
# Parenthesized call so the line is valid under both Python 2 and Python 3.
print(featureScaling(data))
练习:需要重缩放的算法练习
- 使用 RBF 核函数的 SVM
- K-均值聚类
练习:缩放类型
- MinMaxScaler
练习:计算重缩放特征
- salary : 0.17962406631
- stock : 0.0290205889347
import numpy as np
# Rescale two sample values by hand with the min/max formula
# (x - min) / (max - min).
# NOTE(review): data_dict is not defined in this snippet; it is presumably a
# dict mapping each person to a dict of features, loaded earlier -- confirm.

# exercised_stock_options: collect every entry that is not the string 'NaN'.
stocklist = []
for item in data_dict:
    stock = data_dict[item]['exercised_stock_options']
    if stock != 'NaN':
        stocklist.append(stock)
stocklist = np.array(stocklist)

stock_min = np.min(stocklist)
stock_max = np.max(stocklist)
# The whole expression is wrapped in print(...) so that it is evaluated the
# same way on Python 2 and Python 3.
print((1000000.0 - stock_min) / (stock_max - stock_min))

# salary: same procedure, again skipping entries equal to 'NaN'.
salarylist = []
for item in data_dict:
    salary = data_dict[item]['salary']
    if salary != 'NaN':
        salarylist.append(salary)

salary_min = np.min(salarylist)
salary_max = np.max(salarylist)
print((200000.0 - salary_min) / (salary_max - salary_min))
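很奇怪的是,直接用 sklearn 里面 MinMaxScaler 的 scale_ 乘以原始值,得到的结果与上面的手工计算不一致。原因是 scale_ 只是缩放系数 1/(max-min),完整的变换是 x * scale_ + min_(也就是 transform() 的结果),只乘以 scale_ 会漏掉 min_ 这个偏移量。相关代码如下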
import numpy as np
# Rescale the same two sample values with sklearn's MinMaxScaler.
# NOTE(review): data_dict is not defined in this snippet; it is presumably a
# dict mapping each person to a dict of features, loaded earlier -- confirm.
from sklearn import preprocessing

# exercised_stock_options: collect every entry that is not the string 'NaN'.
stocklist = []
for item in data_dict:
    stock = data_dict[item]['exercised_stock_options']
    if stock != 'NaN':
        stocklist.append(stock)
stocklist = np.array(stocklist)

min_max_scaler = preprocessing.MinMaxScaler()
# MinMaxScaler works on 2-D (n_samples, n_features) data, so the 1-D list of
# values is reshaped into a single float column before fitting.
min_max_scaler.fit(stocklist.reshape(-1, 1).astype(float))
# transform() applies x * scale_ + min_. Multiplying by scale_ alone drops
# the min_ offset and therefore disagrees with (x - min) / (max - min).
print(min_max_scaler.transform([[1000000.0]])[0][0])

# salary: same procedure, refitting the scaler on the salary column.
salarylist = []
for item in data_dict:
    salary = data_dict[item]['salary']
    if salary != 'NaN':
        salarylist.append(salary)
salarylist = np.array(salarylist)

min_max_scaler.fit(salarylist.reshape(-1, 1).astype(float))
print(min_max_scaler.transform([[200000.0]])[0][0])

# Show the fitted range for reference.
print(np.max(salarylist))
print(np.min(salarylist))
练习:何时部署特征缩放
- 重要