线性回归
一,概念
本节给出线性回归模型的通俗解释和数学应用。线性回归是一种机器学习技术,用于根据特征(如年龄和工资)预测标签(银行会贷款给我多少钱),预测值和真实值之间的差异由误差项刻画。它考虑了特征(年龄和工资)对最终银行贷款的影响,并计算出每个特征各自对贷款额度的影响。具体来说,它给出了工资和年龄分别对银行贷款的影响程度,从而帮助人们更好地预测银行会贷款给我多少钱。
梯度下降:
- 引入:当我们得到了一个目标函数后,如何进行求解?直接求解?(并不一定可解,线性回归可以当做是一个特例)
- 常规套路:机器学习的套路就是我们交给机器一堆数据,然后告诉它什么样的学习方式是对的(目标函数),然后让它朝着这个方向去做
- 如何优化:一口吃不成一个胖子,我们要一步步的完成迭代
目标函数:
- 批量梯度下降:容易得到最优解,但是由于每次考虑所有样本,速度很慢。
- 随机梯度下降(SGD):每次找一个样本,迭代速度快,但不一定每次都朝着收敛的方向
- 小批量梯度下降法:每次更新选择一小部分数据来算
二,实战一(披萨预测)
import matplotlib.pyplot as plt
from pylab import mpl
from sklearn.linear_model import LinearRegression

# Configure matplotlib so Chinese axis labels render correctly.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# Render the minus sign normally when a CJK font is active.
mpl.rcParams["axes.unicode_minus"] = False

X_train = [[6], [8], [10], [14], [18]]    # pizza diameters (inches)
y_train = [[7], [9], [13], [17.5], [18]]  # pizza prices (dollars)

# --- Scatter plot of the raw training data ---
plt.figure()
plt.plot(X_train, y_train, 'r*', markersize=10)
plt.title('披萨直径与价格的散点图')
plt.xlabel('直径(英寸)')  # bug fix: was plt.ylabel, which overwrote the y label
plt.ylabel('价格(美元)')
plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.savefig('scatter_data.png')
plt.show()

# --- Fit a simple linear regression model ---
model = LinearRegression()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print('模型的表达式为:y=%0.3f * x + %0.3f' % (model.coef_[0][0], model.intercept_[0]))
# e.g.: y = 0.976 * x + 1.966

# --- Scatter plot with the fitted regression line ---
# Bug fix: this second figure was a copy-paste of the first one and
# overwrote scatter_data.png; it now overlays the fitted line computed
# above and saves to a separate file.
plt.figure()
plt.plot(X_train, y_train, 'r*', markersize=10)
plt.plot(X_train, y_train_pred, 'b-')  # fitted regression line
plt.title('披萨直径与价格的散点图')
plt.xlabel('直径(英寸)')
plt.ylabel('价格(美元)')
plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.savefig('fit_line.png')
plt.show()

# --- Interactive prediction loop: enter a diameter, get a price ---
while True:
    x_pre = input("请输入单个披萨的直径(输入q退出):")
    if x_pre == 'q':
        break
    else:
        x_prel = [[float(x_pre)]]
        y_pre = model.predict(x_prel)
        print('预测{0}英寸披萨价格为:${1:.2f}'.format(x_pre, y_pre[0][0]))
        print('\n')
运行结果之一如下(可以用 Jupyter 或 PyCharm 运行验证):
三,实战二
1.数据预处理:
- prepare_for_training.py
"""Prepares the dataset for training"""
import numpy as np
from .normalize import normalize
from .generate_sinusoids import generate_sinusoids
from .generate_polynomials import generate_polynomials
def prepare_for_training(data, polynomial_degree=0, sinusoid_degree=0, normalize_data=True):
    """Build the design matrix used for training.

    Optionally z-score normalizes the input, appends sinusoid and
    polynomial feature expansions, and prepends a bias column of ones.

    Returns a tuple (processed_data, features_mean, features_deviation);
    the mean/deviation are 0 when normalization is skipped.
    """
    sample_count = data.shape[0]
    processed = np.copy(data)

    # Normalization bookkeeping (stays 0 when normalization is skipped).
    features_mean = 0
    features_deviation = 0
    normalized = processed
    if normalize_data:
        normalized, features_mean, features_deviation = normalize(processed)
        processed = normalized

    # Sinusoidal feature expansion (computed from the normalized data).
    if sinusoid_degree > 0:
        processed = np.concatenate(
            (processed, generate_sinusoids(normalized, sinusoid_degree)), axis=1)

    # Polynomial feature expansion (computed from the normalized data).
    if polynomial_degree > 0:
        processed = np.concatenate(
            (processed, generate_polynomials(normalized, polynomial_degree, normalize_data)),
            axis=1)

    # Prepend a column of ones for the bias term.
    processed = np.hstack((np.ones((sample_count, 1)), processed))

    return processed, features_mean, features_deviation
2.线性回归模型
- linear_regression.py
import numpy as np
from utils.features import prepare_for_training
class LinearRegression:
    """Linear regression trained with batch gradient descent.

    Input data is passed through ``prepare_for_training`` (optional
    normalization, sinusoid/polynomial feature expansion, and a
    prepended bias column of ones) before fitting.
    """

    def __init__(self, data, labels, polynomial_degree=0, sinusoid_degree=0, normalize_data=True):
        """Preprocess the data, record the preprocessing settings, and
        initialize the parameter vector theta to zeros.
        """
        # Bug fix: forward the caller's normalize_data flag instead of
        # hard-coding normalize_data=True.
        (data_processed,
         features_mean,
         features_deviation) = prepare_for_training(data, polynomial_degree, sinusoid_degree,
                                                    normalize_data=normalize_data)
        self.data = data_processed                    # preprocessed training matrix (with bias column)
        self.labels = labels                          # target values
        self.features_mean = features_mean            # per-feature mean used for normalization
        self.features_deviation = features_deviation  # per-feature std used for normalization
        self.polynomial_degree = polynomial_degree    # degree of polynomial expansion
        self.sinusoid_degree = sinusoid_degree        # degree of sinusoid expansion
        self.normalize_data = normalize_data          # whether inputs are normalized

        num_features = self.data.shape[1]
        self.theta = np.zeros((num_features, 1))      # one weight per feature (column vector)

    def train(self, alpha, num_iterations=500):
        """Run gradient descent.

        alpha: learning rate; num_iterations: number of update steps.
        Returns (theta, cost_history).
        """
        cost_history = self.gradient_descent(alpha, num_iterations)
        return self.theta, cost_history

    def gradient_descent(self, alpha, num_iterations):
        """Iterate num_iterations gradient steps, recording the training
        loss after each step; returns the list of losses."""
        cost_history = []
        for _ in range(num_iterations):
            self.gradient_step(alpha)
            cost_history.append(self.cost_function(self.data, self.labels))
        return cost_history

    def gradient_step(self, alpha):
        """One batch gradient-descent update of theta (matrix form)."""
        num_examples = self.data.shape[0]
        prediction = LinearRegression.hypothesis(self.data, self.theta)
        delta = prediction - self.labels
        self.theta = self.theta - alpha * (1 / num_examples) * np.dot(delta.T, self.data).T

    def cost_function(self, data, labels):
        """Half mean squared error of the current model on (data, labels)."""
        num_examples = data.shape[0]
        # Bug fix: evaluate on the data argument, not self.data, so that
        # get_cost() can score arbitrary (preprocessed) datasets.
        delta = LinearRegression.hypothesis(data, self.theta) - labels
        cost = (1 / 2) * np.dot(delta.T, delta) / num_examples
        return cost[0][0]

    @staticmethod
    def hypothesis(data, theta):
        """Linear model prediction: data @ theta."""
        return np.dot(data, theta)

    def get_cost(self, data, labels):
        """Preprocess data with the stored settings and return its loss
        under the trained parameters."""
        data_processed = prepare_for_training(data,
                                              self.polynomial_degree,
                                              self.sinusoid_degree,
                                              self.normalize_data
                                              )[0]
        return self.cost_function(data_processed, labels)

    def predict(self, data):
        """Preprocess data with the stored settings and return the
        regression predictions of the trained model."""
        data_processed = prepare_for_training(data,
                                              self.polynomial_degree,
                                              self.sinusoid_degree,
                                              self.normalize_data
                                              )[0]
        return LinearRegression.hypothesis(data_processed, self.theta)
3.读取数据,可视化
- Achieve.py
import numpy as np
import pandas as pd
import matplotlib
# Bug fix: select the backend BEFORE pyplot is imported — switching after
# pyplot has loaded is unreliable on older matplotlib versions.
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from linear_regression import LinearRegression

# Load the dataset and split it 80/20 into train and test sets.
data = pd.read_csv('./data/world-happiness-report-2017.csv')
train_data = data.sample(frac=0.8)
test_data = data.drop(train_data.index)

# Input feature (GDP per capita) and target (happiness score) columns.
input_param_name = 'Economy..GDP.per.Capita.'
output_param_name = 'Happiness.Score'

# Double brackets keep the training arrays 2-D, shape (num_examples, 1),
# as LinearRegression expects.
x_train = train_data[[input_param_name]].values
y_train = train_data[[output_param_name]].values
x_test = test_data[input_param_name].values
y_test = test_data[output_param_name].values

# Visualize the raw train/test split.
plt.scatter(x_train, y_train, label='Train data')
plt.scatter(x_test, y_test, label='Test data')  # capitalized for consistency
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Happy')
plt.legend()
plt.show()

# Train the model with batch gradient descent.
num_iterations = 500  # number of gradient-descent steps
learning_rate = 0.01  # step size alpha
linear_regression = LinearRegression(x_train, y_train)
(theta, cost_history) = linear_regression.train(learning_rate, num_iterations)
print('开始时的损失:', cost_history[0])
print('训练后的损失:', cost_history[-1])

# Plot the loss curve over iterations.
plt.plot(range(num_iterations), cost_history)
plt.xlabel('Iter')
plt.ylabel('cost')
plt.title('GD')
plt.show()

# Predict on an evenly spaced grid and overlay the fitted line.
predictions_num = 100
x_predictions = np.linspace(x_train.min(), x_train.max(), predictions_num).reshape(predictions_num, 1)
y_predictions = linear_regression.predict(x_predictions)
plt.scatter(x_train, y_train, label='Train data')
plt.scatter(x_test, y_test, label='Test data')
plt.plot(x_predictions, y_predictions, 'r', label='Prediction')
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Happy')
plt.legend()
plt.show()
4.归一化操作
- normalize.py
"""Normalize features"""
import numpy as np
def normalize(features):
    """Z-score normalize features column-wise.

    Returns (normalized_features, column_means, column_stds). Any zero
    standard deviation is replaced by 1 so the division is safe.
    """
    normalized = np.copy(features).astype(float)
    col_mean = np.mean(features, 0)
    col_std = np.std(features, 0)
    # Only center when there is more than one sample (a single sample
    # would be centered to all zeros).
    if features.shape[0] > 1:
        normalized -= col_mean
    col_std[col_std == 0] = 1  # guard against division by zero
    normalized /= col_std
    return normalized, col_mean, col_std
实验结果: