Machine Learning: Chapter 1 Regression

Gradient Descent

Gradient Descent: Simple Linear Regression

import numpy as np
import matplotlib.pyplot as plt

# Load the data
data = np.genfromtxt("data.csv", delimiter=",")
# Split into x and y columns
x_data = data[:,0]
y_data = data[:,1]
# Scatter plot of the raw data
plt.scatter(x_data,y_data)
plt.show()

# Model setup
# Learning rate
lr = 0.0001
# Intercept
b = 0
# Slope
k = 0
# Maximum number of iterations
epochs = 50

# Least-squares cost function (mean squared error divided by 2)
def compute_error(b, k, x_data, y_data):
    totalError = 0
    for i in range(0, len(x_data)):
        totalError += (y_data[i] - (k * x_data[i] + b)) ** 2
    return totalError/float(len(x_data))/2.0

# Gradient descent
def gradient_descent_runner(x_data, y_data, b, k, lr, epochs):
    # Total number of samples
    m = float(len(x_data))
    # Iterate for epochs iterations
    for i in range(epochs):
        b_grad = 0
        k_grad = 0
        # Sum the gradients over all samples (the 1/m factor averages them)
        for j in range(0, len(x_data)):
            b_grad += (1/m) * (((k * x_data[j]) + b) - y_data[j])
            k_grad += (1/m) * x_data[j] * (((k * x_data[j]) + b) - y_data[j])
        # Update b and k
        b = b - (lr * b_grad)
        k = k - (lr * k_grad)
        # Plot the current fit every 5 iterations
        if i % 5==0:
            print("epochs:",i)
            plt.plot(x_data, y_data, 'b.')
            plt.plot(x_data, k*x_data + b, 'r')
            plt.show()
    return b, k

# Run gradient descent
print("Starting b = {0}, k = {1}, error = {2}".format(b, k, compute_error(b, k, x_data, y_data)))
print("Running...")
b, k = gradient_descent_runner(x_data, y_data, b, k, lr, epochs)
print("After {0} iterations b = {1}, k = {2}, error = {3}".format(epochs, b, k, compute_error(b, k, x_data, y_data)))

# Plot the final fit
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, k*x_data + b, 'r')
plt.show()

Gradient Descent: Multiple Linear Regression

import numpy as np
import matplotlib.pyplot as plt

# Load the data
data = np.genfromtxt("Delivery.csv",delimiter=',')
print(data)
# Split into features and target
x_data = data[:,:-1] 
y_data = data[:,-1]
print(x_data)
print(y_data)

# Model setup
# Learning rate
lr = 0.0001
# Parameters
theta0 = 0
theta1 = 0
theta2 = 0
# Maximum number of iterations
epochs = 1000

# Mean squared error cost function
def compute_error(theta0, theta1, theta2, x_data, y_data):
    totalError = 0
    for i in range(0, len(x_data)):
        totalError += (y_data[i] - (theta1 * x_data[i,0] + theta2*x_data[i,1] + theta0)) ** 2
    return totalError / float(len(x_data))
    
# Gradient descent
def gradient_descent_runner(x_data, y_data, theta0, theta1, theta2, lr, epochs):
    # Total number of samples
    m = float(len(x_data))
    # Iterate for epochs iterations
    for i in range(epochs):
        theta0_grad = 0
        theta1_grad = 0
        theta2_grad = 0
        # Sum the gradients over all samples (the 1/m factor averages them)
        for j in range(0, len(x_data)):
            theta0_grad += (1/m) * ((theta1 * x_data[j,0] + theta2*x_data[j,1] + theta0) - y_data[j])
            theta1_grad += (1/m) * x_data[j,0] * ((theta1 * x_data[j,0] + theta2*x_data[j,1] + theta0) - y_data[j])
            theta2_grad += (1/m) * x_data[j,1] * ((theta1 * x_data[j,0] + theta2*x_data[j,1] + theta0) - y_data[j])
        # Update theta0, theta1 and theta2
        theta0 = theta0 - (lr*theta0_grad)
        theta1 = theta1 - (lr*theta1_grad)
        theta2 = theta2 - (lr*theta2_grad)
    return theta0, theta1, theta2

# Run gradient descent
print("Starting theta0 = {0}, theta1 = {1}, theta2 = {2}, error = {3}".
      format(theta0, theta1, theta2, compute_error(theta0, theta1, theta2, x_data, y_data)))
print("Running...")
theta0, theta1, theta2 = gradient_descent_runner(x_data, y_data, theta0, theta1, theta2, lr, epochs)
print("After {0} iterations theta0 = {1}, theta1 = {2}, theta2 = {3}, error = {4}".
      format(epochs, theta0, theta1, theta2, compute_error(theta0, theta1, theta2, x_data, y_data)))

sklearn

sklearn: Simple Linear Regression

from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# Load the data
data = np.genfromtxt("data.csv", delimiter=",")
x_data = data[:,0]
y_data = data[:,1]
plt.scatter(x_data,y_data)
plt.show()
print(x_data.shape)  # x_data is a 1-D array of 100 values
print(y_data.shape)  # y_data is a 1-D array of 100 values

x_data = data[:,0,np.newaxis]  # add a dimension: x_data becomes 2-D, shape (100, 1)
y_data = data[:,1,np.newaxis]  # add a dimension: y_data becomes 2-D, shape (100, 1)
print(x_data.shape)
print(y_data.shape)

# Create and fit the model
model = LinearRegression()  
model.fit(x_data, y_data)

print(model.coef_)
print(model.intercept_ )
model.score(x_data, y_data) # R^2

# Plot the fitted line
plt.plot(x_data, y_data, 'b.')
plt.plot(x_data, model.predict(x_data), 'r')
#plt.plot(x_data,1.32243102 * x_data + 7.99102098, 'r')
plt.show()

sklearn: Multiple Linear Regression

import numpy as np
from sklearn import linear_model

# Load the data
data = np.genfromtxt("Delivery.csv",delimiter=',')
x_data = data[:,:-1]
y_data = data[:,-1]
print(x_data)
print(y_data)

# Create and fit the model
model = linear_model.LinearRegression()
model.fit(x_data, y_data)
print(model.coef_)
print(model.intercept_)
print(model.score(x_data,y_data))

# Predict on the training data
predict = model.predict(x_data)
print(predict)

x_test = [[102,4]]
predict = model.predict(x_test)
print("predict:",predict)

Feature Scaling

The purpose of feature scaling is to let every feature play an equally important role during modeling.

Min-max normalization

$x^* = \frac{x - \min}{\max - \min} \in [0, 1]$

Z-score standardization

$x^* = \frac{x - \mu}{\sigma}$
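
As a minimal sketch (the small matrix X below is made up for illustration; sklearn's MinMaxScaler and StandardScaler implement the same two transforms):

import numpy as np

# A hypothetical feature matrix: each column is one feature
X = np.array([[1.0, 200.0],
              [2.0, 400.0],
              [3.0, 600.0]])

# Min-max normalization: each column is mapped into [0, 1]
X_minmax = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Z-score standardization: each column gets mean 0 and standard deviation 1
X_zscore = (X - X.mean(axis=0)) / X.std(axis=0)

print(X_minmax)
print(X_zscore)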

Cross-Validation

Cross-validation is useful when the number of data samples is very small: each sample is used for both training and validation across different folds.

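A minimal sketch of k-fold cross-validation with sklearn (assuming the 2-D x_data and y_data from the sklearn simple linear regression example above; the choice of 5 folds and R^2 scoring is arbitrary):

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Split the data into 5 folds; each fold is used once as the validation set
# while the model is trained on the remaining 4 folds
model = LinearRegression()
scores = cross_val_score(model, x_data, y_data, cv=5, scoring='r2')
print(scores)         # R^2 on each validation fold
print(scores.mean())  # average validation score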

Overfitting and Regularization

Underfitting: the model is too simple to capture the underlying trend in the data.
Overfitting: the model chases accuracy too hard, driving the residual sum of squares on the training samples as low as possible and thereby fitting even the anomalous samples.

Preventing overfitting

1. Reduce the number of features
2. Increase the amount of data
3. Regularization

Regularization

L1 regularization:

$J(\theta)=\frac{1}{2m}\left[\sum_{i=1}^{m}\left(h_{\theta}(x^{(i)})-y^{(i)}\right)^2+\lambda\sum_{j=1}^{n}|\theta_j|\right]$

L2 regularization:

$J(\theta)=\frac{1}{2m}\left[\sum_{i=1}^{m}\left(h_{\theta}(x^{(i)})-y^{(i)}\right)^2+\lambda\sum_{j=1}^{n}\theta_j^2\right]$
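
To connect the L2 cost function to the gradient descent code above, here is a minimal sketch of the modified update (variable names follow the simple linear regression example; penalizing only the slope k, and the value of lam, are illustrative assumptions):

import numpy as np

def ridge_gradient_descent(x_data, y_data, b, k, lr, lam, epochs):
    # Same loop as the plain gradient descent version, with an L2 penalty on k
    m = float(len(x_data))
    for i in range(epochs):
        error = (k * x_data + b) - y_data                 # residuals for all samples
        b_grad = np.mean(error)                           # gradient w.r.t. the intercept
        k_grad = np.mean(x_data * error) + (lam / m) * k  # extra term lambda/m * k from the L2 penalty
        b -= lr * b_grad
        k -= lr * k_grad
    return b, k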

Ridge Regression, LASSO, and the Elastic Net

Ridge regression

1. Handles the case where there are more features than samples
2. Introduces a small bias into the estimates in exchange for a better (lower-variance) estimate
3. Addresses multicollinearity

Ridge regression cost function

$J(\theta)=\frac{1}{2m}\left[\sum_{i=1}^{m}\left(h_{\theta}(x^{(i)})-y^{(i)}\right)^2+\lambda\sum_{j=1}^{n}\theta_j^2\right]$

(L2 regularization term)

The ridge coefficient

$\lambda \in [0, 1]$

It is chosen so that:
1. The ridge estimates of the regression coefficients are basically stable
2. The residual sum of squares does not increase by too much

The larger the ridge coefficient, the smaller the $\theta_j$.
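
The usual way to visualize this shrinkage is a ridge trace: fit Ridge for a range of alpha values and plot how each coefficient changes. A minimal sketch (assuming the longley-style x_data and y_data loaded in the sklearn ridge example below; the alpha grid is arbitrary):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

alphas = np.logspace(-3, 3, 50)  # candidate ridge coefficients
coefs = []
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(x_data, y_data)
    coefs.append(ridge.coef_)

# Ridge trace: every coefficient shrinks toward 0 as alpha grows
plt.plot(alphas, coefs)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('coefficient value')
plt.show()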

LASSO

LASSO addresses two issues:
1. The multicollinearity problem
2. Ridge estimates are almost never exactly zero, which makes variable selection difficult

LASSO cost function

$J(\theta)=\frac{1}{2m}\left[\sum_{i=1}^{m}\left(h_{\theta}(x^{(i)})-y^{(i)}\right)^2+\lambda\sum_{j=1}^{n}|\theta_j|\right]$

(L1 regularization term)

LASSO vs. ridge regression

For ridge regression, $\lambda$ imposes the constraint $\sum_{j=1}^{n}\theta_j^2 \le t$.
For LASSO, $\lambda$ imposes the constraint $\sum_{j=1}^{n}|\theta_j| \le t$.

LASSO tends to drive individual parameters exactly to 0 (dimensionality reduction / variable selection).
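
A quick way to see this sparsity effect is to fit Lasso and Ridge with the same regularization strength and compare their coefficients (a sketch on synthetic data; the alpha value of 0.5 is arbitrary):

import numpy as np
from sklearn.linear_model import Lasso, Ridge

# Synthetic data: 50 samples, 10 features, only the first 3 actually matter
rng = np.random.RandomState(0)
X = rng.randn(50, 10)
true_coef = np.array([3.0, -2.0, 1.5, 0, 0, 0, 0, 0, 0, 0])
y = X @ true_coef + 0.1 * rng.randn(50)

lasso = Lasso(alpha=0.5).fit(X, y)
ridge = Ridge(alpha=0.5).fit(X, y)

print("Lasso coefficients:", lasso.coef_)  # several entries are exactly 0
print("Ridge coefficients:", ridge.coef_)  # small but generally non-zero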

Elastic net

Combines ridge regression and LASSO.

Elastic net cost function

$J(\theta)=\frac{1}{2m}\left[\sum_{i=1}^{m}\left(h_{\theta}(x^{(i)})-y^{(i)}\right)^2+\lambda\sum_{j=1}^{n}\left(\alpha\theta_j^2+(1-\alpha)|\theta_j|\right)\right]$
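
In sklearn the same idea is exposed through ElasticNet's alpha (overall strength, roughly the $\lambda$ above) and l1_ratio (the L1/L2 mix; l1_ratio=1 is pure LASSO, l1_ratio=0 is pure ridge); the exact correspondence to the formula above is only approximate and stated here as an assumption. A minimal sketch, reusing the longley-style x_data and y_data from the examples below:

from sklearn.linear_model import ElasticNet

# alpha: overall regularization strength; l1_ratio: fraction of L1 penalty in the mix
model = ElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(x_data, y_data)
print(model.coef_)
print(model.intercept_)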

sklearn Implementation: Ridge Regression

import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

# Load the data
data = np.genfromtxt("longley.csv",delimiter=',')
print(data)

# Split into features and target
x_data = data[1:,2:]
y_data = data[1:,1]
print(x_data)
print(y_data)

# Generate 50 candidate alpha values
alphas_to_test = np.linspace(0.001, 1)  # num defaults to 50
print(alphas_to_test)

# Model setup
# Create the model and store the per-alpha cross-validation errors
model = linear_model.RidgeCV(alphas=alphas_to_test, store_cv_values=True)
model.fit(x_data, y_data)

# Print the best alpha, the cross-validation errors, and the fitted coefficients and intercept
print(model.alpha_)
print(model.cv_values_)
print(model.cv_values_.shape)
print(model.coef_)
print(model.intercept_)

# Visualization
# Relationship between alpha and the cross-validation loss
plt.plot(alphas_to_test, model.cv_values_.mean(axis=0))
# Mark the position of the selected best alpha
plt.plot(model.alpha_, min(model.cv_values_.mean(axis=0)),'ro')
plt.show()
print(min(model.cv_values_.mean(axis=0)))  # minimum cross-validation error
print(model.score(x_data, y_data))

# Prediction
model.predict(x_data[2,np.newaxis])  # select one sample; predict() expects 2-D input
predect_result = model.predict(x_data)
print(predect_result)
sum(abs(predect_result - y_data))/len(y_data)  # mean absolute error on the training data

sklearn Implementation: LASSO

import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

# Load the data
data = np.genfromtxt("longley.csv",delimiter=',')
print(data)

# Split into features and target
x_data = data[1:,2:]
y_data = data[1:,1]
print(x_data)
print(y_data)

# Model setup
# Create the model (by default 100 different alpha values are tried)
model = linear_model.LassoCV()  # 5-fold cross-validation by default
model.fit(x_data,y_data)

# Best alpha, coefficients, and intercept
print(model.alpha_)
print(model.coef_)
print(model.intercept_)

print(model.mse_path_.shape)  # shape of the MSE path: (number of alphas, number of folds)
print(min(model.mse_path_.mean(axis=1)))  # minimum mean cross-validation error
print(model.score(x_data, y_data)) # R^2

# Visualization
# Relationship between alpha and the cross-validation loss
plt.plot(model.alphas_, model.mse_path_.mean(axis=1))
# Mark the position of the selected alpha
plt.plot(model.alpha_, min(model.mse_path_.mean(axis=1)),'ro')
plt.show()

# Prediction
model.predict(x_data[-2,np.newaxis])  # select one sample; predict() expects 2-D input
predect_result = model.predict(x_data)
sum(abs(predect_result - y_data))/len(y_data)  # mean absolute error on the training data

sklearn Implementation: Elastic Net

import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

# Load the data
data = np.genfromtxt("longley.csv",delimiter=',')
print(data)

# Split into features and target
x_data = data[1:,2:]
y_data = data[1:,1]
print(x_data)
print(y_data)

# Model setup
model = linear_model.ElasticNetCV()
model.fit(x_data, y_data)
# Best alpha chosen by cross-validation
print(model.alpha_)
# Fitted coefficients
print(model.coef_)
print(model.intercept_)
print(model.score(x_data, y_data))

print(model.mse_path_.shape)  # shape of the MSE path: (number of alphas, number of folds)
print(model.mse_path_.mean(axis=1))
print(min(model.mse_path_.mean(axis=1)))  # minimum mean cross-validation error

# Visualization
# Relationship between alpha and the cross-validation loss
plt.plot(model.alphas_, model.mse_path_.mean(axis=1))
# Mark the position of the selected alpha
plt.plot(model.alpha_, min(model.mse_path_.mean(axis=1)),'ro')
plt.show()

# Prediction
model.predict(x_data[-2,np.newaxis])
predect_result = model.predict(x_data)
sum(abs(predect_result - y_data))/len(y_data)  # mean absolute error on the training data