前言
上一篇我们已经(非常)简单的介绍过线性回归了,但是没有做到代码规范、数据预处理、学习率的选择、划分训练集和测试集、正则化和模型评估等等流程。
本篇来解决数据预处理和代码规范问题
数据集
选取世界幸福指数分析报告, world-happiness-report-2017.csv 中的 Economy..GDP.per.Capita.、Health..Life.Expectancy. 作为模型参数输入,Happiness.Score 作为结果输出
数据预处理 - 标准化
(data - 均值) / 标准差
def normalize(features):
    """Standardize features column-wise: (x - mean) / std.

    :param features: 2-D array-like, one row per sample, one column per feature
    :return: tuple of (standardized copy of features, per-column means,
             per-column standard deviations with zeros replaced by 1)
    """
    normalized = np.copy(features).astype(float)
    # Column statistics over the sample axis.
    mean = np.mean(features, 0)
    deviation = np.std(features, 0)
    # Centering only makes sense with more than one sample.
    if features.shape[0] > 1:
        normalized -= mean
    # A constant column has zero spread; use scale 1 to avoid division by zero.
    deviation[deviation == 0] = 1
    normalized /= deviation
    return normalized, mean, deviation
线性回归
import numpy as np
from BaseLinearRegression.utils.features import normalize
class LinearRegression:
    """Multivariate linear regression trained with batch gradient descent."""

    def __init__(self, data, labels, normalize_data=True):
        """
        :param data: (num_examples, num_features) raw feature matrix
        :param labels: (num_examples, 1) target column vector
        :param normalize_data: standardize features before training
        """
        num_examples = data.shape[0]
        data_processed = np.copy(data)
        self.normalize_data = normalize_data
        # Keep the training-set statistics so new data passed to
        # get_cost/predict is transformed with the SAME scaling.
        self.features_mean = None
        self.features_deviation = None
        if normalize_data:
            (data_processed,
             self.features_mean,
             self.features_deviation) = normalize(data_processed)
        # Prepend the bias column of ones.
        data_processed = np.hstack((np.ones((num_examples, 1)), data_processed))
        self.data = data_processed
        self.labels = labels
        num_features = self.data.shape[1]  # features incl. bias
        self.theta = np.zeros((num_features, 1))

    def train(self, alpha, num_iterations=500):
        """Fit theta with gradient descent.

        :param alpha: learning rate
        :param num_iterations: number of descent steps
        :return: (theta, list of loss values, one per iteration)
        """
        loss_history = self.gradient_descent(alpha, num_iterations)
        return self.theta, loss_history

    def gradient_descent(self, alpha, num_iterations):
        """Run num_iterations update steps, recording the loss after each."""
        loss_history = []
        for _ in range(num_iterations):
            self.gradient_step(alpha)
            loss_history.append(self.loss_function(self.data, self.labels))
        return loss_history

    def gradient_step(self, alpha):
        """One batch gradient-descent update of theta."""
        num_examples = self.data.shape[0]
        prediction = LinearRegression.hypothesis(self.data, self.theta)
        delta = prediction - self.labels
        self.theta = self.theta - alpha * (1 / num_examples) * np.dot(delta.T, self.data).T

    @staticmethod
    def hypothesis(data, theta):
        """Linear model prediction: data @ theta."""
        return np.dot(data, theta)

    def loss_function(self, data, labels):
        """Half mean-squared-error of the model on (data, labels).

        :param data: bias-augmented feature matrix
        :param labels: target column vector
        :return: scalar loss
        """
        num_examples = data.shape[0]
        # BUG FIX: evaluate on the data passed in (the original used
        # self.data, so get_cost on external data was wrong).
        delta = LinearRegression.hypothesis(data, self.theta) - labels
        cost = (1 / 2) * np.dot(delta.T, delta) / num_examples
        return cost[0][0]

    def _prepare(self, data):
        """Apply the training-time preprocessing to raw data."""
        data_processed = np.copy(data).astype(float)
        if self.normalize_data:
            # BUG FIX: reuse the training mean/std instead of re-estimating
            # statistics from the new data (and avoid the NameError the
            # original raised here when normalize_data was False).
            data_processed -= self.features_mean
            data_processed /= self.features_deviation
        return np.hstack((np.ones((data_processed.shape[0], 1)), data_processed))

    def get_cost(self, data, labels):
        """Loss of the trained model on raw (un-preprocessed) data."""
        return self.loss_function(self._prepare(data), labels)

    def predict(self, data):
        """Predict targets for raw (un-preprocessed) data.

        :param data: (n, num_features) raw feature matrix
        :return: (n, 1) predictions
        """
        return LinearRegression.hypothesis(self._prepare(data), self.theta)
标准化预处理后结果对比
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from BaseLinearRegression.SimpleLinearRegression import LinearRegression
data = pd.read_csv('./data/world-happiness-report-2017.csv')
# 80/20 train/test split.
train_data = data.sample(frac=0.8)
test_data = data.drop(train_data.index)

input_param_name = ['Economy..GDP.per.Capita.', 'Health..Life.Expectancy.']
output_param_name = 'Happiness.Score'

x_train = train_data[input_param_name].values
y_train = train_data[[output_param_name]].values
x_test = test_data[input_param_name].values
# Keep labels 2-D (n, 1) for consistency with y_train.
y_test = test_data[[output_param_name]].values

num_iterations = 500
learning_rate = 0.01

# Train once with standardization and once without, to compare convergence.
linear_regression = LinearRegression(x_train, y_train)
(theta, cost_history) = linear_regression.train(learning_rate, num_iterations)
linear_regression_no_standar = LinearRegression(x_train, y_train, normalize_data=False)
(theta_no_standar, cost_history_no_standar) = linear_regression_no_standar.train(learning_rate, num_iterations)

plt.plot(range(num_iterations), cost_history, 'r')
plt.plot(range(num_iterations), cost_history_no_standar, 'g')
plt.xlabel('Iter')
plt.ylabel('cost')
plt.title('GD')
plt.show()

# BUG FIX: the model was trained on TWO features, so the prediction grid
# must supply both columns. The original built a single (100, 1) column,
# which is shape-incompatible with the (3, 1) theta inside predict().
predictions_num = 100
gdp_grid = np.linspace(x_train[:, 0].min(), x_train[:, 0].max(), predictions_num)
health_grid = np.linspace(x_train[:, 1].min(), x_train[:, 1].max(), predictions_num)
x_predictions = np.column_stack((gdp_grid, health_grid))
y_predictions = linear_regression.predict(x_predictions)
相同参数的情况下,进行预处理的数据 loss 下降得更平缓,没有进行预处理的 loss 下降得更陡峭。