Chapter 2
Scale Machine Learning Data
归一化数据
得到最大值和最小值
# Find the min and max values for each column
def dataset_minmax(dataset):
    """Find the minimum and maximum value of every column.

    Args:
        dataset: List of rows; each row is a list of comparable values.
            The number of columns is taken from the first row.

    Returns:
        A list with one ``[min, max]`` pair per column, or an empty list
        when ``dataset`` has no rows.
    """
    # An empty dataset has no first row to take the column count from.
    if not dataset:
        return []
    minmax = []
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        minmax.append([min(col_values), max(col_values)])
    return minmax
公式:$x' = \dfrac{x - x_{\min}}{x_{\max} - x_{\min}}$
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Rescale every column of ``dataset`` to the range 0-1, in place.

    Each value becomes ``(value - min) / (max - min)`` using that column's
    entry in ``minmax``.

    Args:
        dataset: List of rows of numeric values; modified in place.
        minmax: One ``[min, max]`` pair per column, indexed like the rows.
    """
    for row in dataset:
        for i, value in enumerate(row):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # A constant column has zero range; map it to 0.0 instead of
            # dividing by zero.
            row[i] = (value - low) / span if span else 0.0
综上
# Example of normalizing the diabetes dataset
from csv import reader
# Load a CSV file
def load_csv(filename):
    """Load a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of strings; empty rows are skipped.
    """
    dataset = []
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for file objects passed to reader().
    with open(filename, 'r', newline='') as file:
        for row in reader(file):
            if row:
                dataset.append(row)
    return dataset
# Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of ``dataset`` from strings to floats, in place.

    Args:
        dataset: List of rows whose ``column`` entry is a string; modified
            in place.
        column: Index of the column to convert.
    """
    for row in dataset:
        # Strip surrounding whitespace before parsing the number.
        row[column] = float(row[column].strip())
# Find the min and max values for each column
def dataset_minmax(dataset):
    """Find the minimum and maximum value of every column.

    Args:
        dataset: List of rows; each row is a list of comparable values.
            The number of columns is taken from the first row.

    Returns:
        A list with one ``[min, max]`` pair per column, or an empty list
        when ``dataset`` has no rows.
    """
    # An empty dataset has no first row to take the column count from.
    if not dataset:
        return []
    minmax = []
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        minmax.append([min(col_values), max(col_values)])
    return minmax
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Rescale every column of ``dataset`` to the range 0-1, in place.

    Each value becomes ``(value - min) / (max - min)`` using that column's
    entry in ``minmax``.

    Args:
        dataset: List of rows of numeric values; modified in place.
        minmax: One ``[min, max]`` pair per column, indexed like the rows.
    """
    for row in dataset:
        for i, value in enumerate(row):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # A constant column has zero range; map it to 0.0 instead of
            # dividing by zero.
            row[i] = (value - low) / span if span else 0.0
# Load pima-indians-diabetes dataset
# Example driver: load the CSV, convert every column to float, then
# rescale all columns to the range 0-1 and show the first row before/after.
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset), len(dataset[0])))
# Convert string columns to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print(dataset[0])
# Calculate min and max for each column
minmax = dataset_minmax(dataset)
# Normalize columns
normalize_dataset(dataset, minmax)
print(dataset[0])
结果:
Loaded data file pima-indians-diabetes.csv with 768 rows and 9 columns
[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0]
标准化
均值
# calculate column means
def column_means(dataset):
    """Calculate the arithmetic mean of every column.

    Args:
        dataset: List of rows of numeric values. The number of columns is
            taken from the first row.

    Returns:
        A list with one mean per column, or an empty list when ``dataset``
        has no rows.
    """
    # An empty dataset has no first row to take the column count from.
    if not dataset:
        return []
    row_count = float(len(dataset))
    return [
        sum(row[i] for row in dataset) / row_count
        for i in range(len(dataset[0]))
    ]
标准差
# calculate column standard deviations
def column_stdevs(dataset, means):
    """Calculate the sample standard deviation of every column.

    Args:
        dataset: List of rows of numeric values. The number of columns is
            taken from the first row.
        means: One mean per column, indexed like the rows.

    Returns:
        A list with one standard deviation per column.

    Raises:
        ZeroDivisionError: If ``dataset`` has exactly one row, because the
            divisor is ``len(dataset) - 1``.
    """
    # Imported here so the function does not rely on a file-level import
    # of sqrt.
    from math import sqrt

    column_count = len(dataset[0])
    # Sum of squared deviations from the mean, per column.
    squared_sums = [
        sum(pow(row[i] - means[i], 2) for row in dataset)
        for i in range(column_count)
    ]
    # Divide by n - 1 (sample, not population, standard deviation).
    divisor = float(len(dataset) - 1)
    return [sqrt(total / divisor) for total in squared_sums]
标准化
# standardize dataset
def standardize_dataset(dataset, means, stdevs):
    """Standardize every column of ``dataset`` in place.

    Each value becomes ``(value - mean) / stdev`` using that column's
    entries in ``means`` and ``stdevs``.

    Args:
        dataset: List of rows of numeric values; modified in place.
        means: One mean per column, indexed like the rows.
        stdevs: One standard deviation per column, indexed like the rows.
    """
    for row in dataset:
        for i, value in enumerate(row):
            spread = stdevs[i]
            # A column with zero standard deviation is constant; map it to
            # 0.0 instead of dividing by zero.
            row[i] = (value - means[i]) / spread if spread else 0.0
标准化是一种缩放技术,它假设数据符合正态分布。