Scale Machine Learning Data

Chapter 2

Scale Machine Learning Data

归一化数据(Normalize Data)

得到最大值和最小值
# Find the min and max values for each column
def dataset_minmax(dataset):
    """Return the minimum and maximum of every column in *dataset*.

    Args:
        dataset: A list of rows, each row an indexable sequence of mutually
            comparable values. The number of columns is taken from the
            first row.

    Returns:
        A list with one ``[min, max]`` pair per column, in column order.
        An empty dataset yields an empty list.
    """
    # Without rows there is no first row to count columns from.
    if not dataset:
        return []
    minmax = []
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        minmax.append([min(col_values), max(col_values)])
    return minmax

公式:
$$\text{scaled value} = \frac{\text{value} - \min}{\max - \min}$$

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Rescale every value of *dataset* in place to the range 0-1.

    Each value becomes ``(value - min) / (max - min)`` using the bounds of
    its own column.

    Args:
        dataset: A list of mutable rows (lists) of numbers; modified in place.
        minmax: One ``[min, max]`` pair per column, indexed like the rows.

    Returns:
        None. The rows of *dataset* are overwritten with the scaled values.
    """
    for row in dataset:
        for i, value in enumerate(row):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # A constant column has zero range; map it to 0.0 instead of
            # dividing by zero.
            row[i] = (value - low) / span if span else 0.0
综上
# Example of normalizing the diabetes dataset 
from csv import reader

# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows; each row is the list of string fields produced by
        ``csv.reader``. No type conversion is performed here.
    """
    dataset = []
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for file objects passed to reader().
    with open(filename, 'r', newline='') as csv_file:
        for row in reader(csv_file):
            # Blank lines come back as empty lists; leave them out.
            if not row:
                continue
            dataset.append(row)
    return dataset
  
# Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of *dataset* from strings to floats, in place.

    Args:
        dataset: A list of mutable rows (lists) whose entry at *column* is a
            string.
        column: Index of the column to convert.

    Returns:
        None. Each ``row[column]`` is replaced by its float value.

    Raises:
        ValueError: If a stripped field is not a valid float literal.
    """
    for row in dataset:
        # Strip surrounding whitespace before parsing the number.
        row[column] = float(row[column].strip())

# Find the min and max values for each column
def dataset_minmax(dataset):
    """Compute per-column bounds of *dataset*.

    Args:
        dataset: A list of rows, each row an indexable sequence of mutually
            comparable values. The column count comes from the first row.

    Returns:
        A list holding one ``[min, max]`` pair for each column, in column
        order. An empty dataset yields an empty list.
    """
    # No rows means no first row to take the column count from.
    if not dataset:
        return []
    minmax = []
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax
 
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Min-max scale *dataset* in place so each column lies in 0-1.

    Every value is replaced by ``(value - min) / (max - min)``, with the
    bounds taken from the matching entry of *minmax*.

    Args:
        dataset: A list of mutable rows (lists) of numbers; modified in place.
        minmax: One ``[min, max]`` pair per column, indexed like the rows.

    Returns:
        None. The scaled values overwrite the originals.
    """
    for row in dataset:
        for i, value in enumerate(row):
            col_min = minmax[i][0]
            col_range = minmax[i][1] - col_min
            if col_range:
                row[i] = (value - col_min) / col_range
            else:
                # max == min: the column is constant, so scale it to 0.0
                # rather than raising ZeroDivisionError.
                row[i] = 0.0

# Load pima-indians-diabetes dataset
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print(
    'Loaded data file {0} with {1} rows and {2} columns'.format(
        filename, len(dataset), len(dataset[0])
    )
)

# Turn every column from strings into floats, one column at a time
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print(dataset[0])

# Gather the per-column [min, max] pairs before any value is rescaled
minmax = dataset_minmax(dataset)

# Rescale all values in place, then show the first row again for comparison
normalize_dataset(dataset, minmax)
print(dataset[0])

结果:

Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns 
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0] 
[0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0]

标准化

均值
公式:

$$\text{mean} = \frac{\sum_{i=1}^{n} \text{value}_i}{n}$$

# calculate column means
def column_means(dataset):
    """Return the arithmetic mean of each column of *dataset*.

    Args:
        dataset: A non-empty list of rows of numbers. The number of columns
            is taken from the first row.

    Returns:
        A list with one mean per column, in column order.
    """
    row_count = float(len(dataset))
    column_count = len(dataset[0])
    return [
        sum(row[col] for row in dataset) / row_count
        for col in range(column_count)
    ]

标准差(样本方差的平方根)

$$\text{stdev} = \sqrt{\frac{\sum_{i=1}^{n} (\text{value}_i - \text{mean})^2}{n - 1}}$$

# calculate column standard deviations
def column_stdevs(dataset, means):
    """Return the sample standard deviation of each column of *dataset*.

    For every column the squared differences from the column mean are
    summed, divided by ``len(dataset) - 1`` and square-rooted.

    Args:
        dataset: A list of rows of numbers. The number of columns is taken
            from the first row.
        means: One mean per column, indexed like the rows.

    Returns:
        A list with one standard deviation per column, in column order.

    Raises:
        ZeroDivisionError: If *dataset* has exactly one row, because the
            divisor ``len(dataset) - 1`` is then zero.
    """
    # Imported here so the function does not depend on a file-level import.
    from math import sqrt

    divisor = float(len(dataset) - 1)
    stdevs = []
    for i in range(len(dataset[0])):
        squared_diffs = [pow(row[i] - means[i], 2) for row in dataset]
        stdevs.append(sqrt(sum(squared_diffs) / divisor))
    return stdevs

标准化
$$\text{standardized value}_i = \frac{\text{value}_i - \text{mean}}{\text{stdev}}$$

# standardize dataset
def standardize_dataset(dataset, means, stdevs):
    """Standardize every value of *dataset* in place.

    Each value becomes ``(value - mean) / stdev`` using the statistics of
    its own column.

    Args:
        dataset: A list of mutable rows (lists) of numbers; modified in place.
        means: One mean per column, indexed like the rows.
        stdevs: One standard deviation per column, indexed like the rows.

    Returns:
        None. The standardized values overwrite the originals.
    """
    for row in dataset:
        for i, value in enumerate(row):
            spread = stdevs[i]
            # A zero standard deviation means the column is constant; map it
            # to 0.0 instead of dividing by zero.
            row[i] = (value - means[i]) / spread if spread else 0.0

标准化是一种缩放技术,它假设数据符合正态分布。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值