Theoretical Derivation
Typically, for a set of feature data and their labels $(x_1, y_1), (x_2, y_2), \dots, (x_n, y_n)$, when the feature $x_i$ is used to predict $y_i$, the convention is: if $y_i$ is continuous, the operation (or technique) is called regression; if $y_i$ is discrete, it is usually called classification.
$X \in R^{n \times m}: \{x_1, x_2, \dots, x_n\}$
$Y \in R^{n \times 1}: \{y_1, y_2, \dots, y_n\}$
Here $X$ is the feature data, $n$ samples with $m$ attributes each, and $Y$ is the corresponding vector of labels.
We need to find $w \in R^{m \times 1}$ and $b \in R^{1 \times 1}$ such that $Xw + b = Y$.
Folding $b$ into $w$ gives $Y = Xw^*$, where the augmented $X = \{1, x_1, x_2, \dots, x_n\} \in R^{n \times (m+1)}$ (each sample row is prefixed with a constant 1) and $w^* \in R^{(m+1) \times 1}$.
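As a concrete sketch of these shapes (illustrative values only, not from the original), the augmentation simply prepends a column of ones to the feature matrix:

import numpy as np

n, m = 5, 3                              # n samples, each with m features
X = np.random.rand(n, m)                 # feature matrix, shape (n, m)
Y = np.random.rand(n, 1)                 # label column vector, shape (n, 1)
X_aug = np.hstack((np.ones((n, 1)), X))  # augmented X, shape (n, m+1)
print(X_aug.shape)                       # (5, 4)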
If the augmented $X$ were square and invertible, then $w^* = X^{-1}Y$ would give $w^*$ directly; in general $X \in R^{n \times (m+1)}$ is not square, so the Moore-Penrose pseudo-inverse is used instead: $w^* = X^{+}Y$.
This holds because $y_1 = b + x_{11}w_1 + x_{12}w_2 + \dots + x_{1m}w_m = 1 \cdot w_0 + x_{11}w_1 + x_{12}w_2 + \dots + x_{1m}w_m$ (writing $b$ as $w_0$; the sum runs over the $m$ features).
In matrix form: $y_1 = [1, x_{11}, x_{12}, \dots, x_{1m}] \cdot [w_0, w_1, w_2, \dots, w_m]^T$
Stacking all $n$ samples therefore gives:
$Y = Xw^*$
with $Y = [y_1, y_2, \dots, y_n]^T$, $X = [1, x_1, x_2, \dots, x_n]$ (each row a sample prefixed with 1), and $w^* = [b, w]^T$.
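Since the augmented $X$ is generally not square, the standard way to obtain $w^*$ is to minimize the squared error. The short derivation below is a standard least-squares result (not spelled out in the original) and is exactly what the pseudo-inverse used in the code computes:

$$\min_{w^*} \|Xw^* - Y\|_2^2,\qquad
\nabla_{w^*}\|Xw^* - Y\|_2^2 = 2X^T(Xw^* - Y) = 0
\;\Longrightarrow\;
w^* = (X^TX)^{-1}X^TY = X^{+}Y$$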
Inverse (Pseudo-Inverse) Method
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

def plotData(X, y):
    plt.scatter(X[..., 0], y)
    plt.show()

# closed-form solution (set the derivative to zero)
def matrixSolver(X, y):
    X = np.hstack((np.ones((len(X), 1)), X))  # prepend a column of ones for the bias term
    # w = np.dot(np.linalg.inv(X), y)  # plain inverse (only valid for a square, invertible X)
    w = np.dot(np.linalg.pinv(X), y)   # Moore-Penrose pseudo-inverse
    return w

if __name__ == "__main__":
    X, y = make_regression(200, 1, bias=2, noise=4)
    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)
    # plotData(X, y)
    # closed-form solution
    w = matrixSolver(X_train, y_train)
    # print(w)
    # plot the fit
    plt.scatter(X_test, y_test, s=30, c='red', marker='o', alpha=0.5, label='C1')
    plt.plot(X_test, np.dot(np.hstack((np.ones((len(X_test), 1)), X_test)), w), c="blue")
    plt.show()
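As a quick sanity check (a minimal sketch, not part of the original, assuming the matrixSolver function above is already defined), the coefficients from the pseudo-inverse can be compared against numpy's built-in least-squares solver:

import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(200, 1, bias=2, noise=4, random_state=0)
w = matrixSolver(X, y)                        # [bias, slope] via the pseudo-inverse
X_aug = np.hstack((np.ones((len(X), 1)), X))  # same augmented design matrix
w_lstsq, *_ = np.linalg.lstsq(X_aug, y, rcond=None)
print(w, w_lstsq)                             # the two solutions should agree closely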
Gradient Descent
For a detailed derivation, refer to:
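In brief (a standard derivation, matching the update in the __fit method of LinearRegressionSelf2 below), for the mean-squared-error loss the gradients and the gradient-descent update with learning rate $\eta$ are:

$$L(w, b) = \frac{1}{2n}\sum_{i=1}^{n}\left(x_i w + b - y_i\right)^2,\qquad
\frac{\partial L}{\partial w} = \frac{1}{n}X^T(Xw + b - Y),\qquad
\frac{\partial L}{\partial b} = \frac{1}{n}\sum_{i=1}^{n}\left(x_i w + b - y_i\right)$$

$$w \leftarrow w - \eta\,\frac{\partial L}{\partial w},\qquad
b \leftarrow b - \eta\,\frac{\partial L}{\partial b}$$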
Example
"""
Author:wucng
Time: 20200114
Summary: 线性回归对boston数据预测
源代码: https://github.com/wucng/MLAndDL
参考:https://cuijiahua.com/blog/2017/11/ml_3_decision_tree_2.html
"""
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import scipy,pickle,os,time
import pandas as pd
# 1. Load the dataset (and preprocess it)
def loadData(dataPath: str) -> tuple:
    with open(dataPath, "r") as fp:
        lines = fp.readlines()
    dataset = []
    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1
        if line[0].isdigit():  # line starts with a digit
            data1 = list(map(float, line.strip().split(" ")))
            line = lines[i]
            i += 1
            data2 = list(map(float, line.strip().split(" ")))
            data1.extend(data2)
            dataset.append(data1)
        else:
            continue
    dataset = np.asarray(dataset)
    # split into features and labels
    X, y = dataset[..., :-1], dataset[..., -1]
    # min-max normalization
    X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    # the sklearn equivalent (requires sklearn.preprocessing.MinMaxScaler)
    # X = MinMaxScaler().fit_transform(X)
    return (X, y)
class LinearRegressionSelf(object):
    """Inverse (pseudo-inverse) method"""
    def __init__(self, save_file="model.npy"):
        self.save_file = save_file

    def __fit(self, X, y):
        # closed-form solution
        X = np.hstack((np.ones((len(X), 1)), X))
        # w = np.dot(np.linalg.inv(X), y)  # plain inverse
        w = np.dot(np.linalg.pinv(X), y)   # pseudo-inverse
        return w

    def fit(self, X, y, batch_size=32, epochs=20):
        if not os.path.exists(self.save_file):
            length = len(y)
            m = len(y) // batch_size
            last_w = []
            for epoch in range(epochs):
                w = []
                # shuffle the data
                index = np.arange(0, length)
                np.random.seed(epoch)
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i * batch_size
                    end = min((i + 1) * batch_size, length)
                    w.append(self.__fit(new_X[start:end], new_y[start:end]))
                last_w.append(np.mean(w, 0))
            # save the parameters
            np.save(self.save_file, np.mean(last_w, 0))
        self.w = np.load(self.save_file)

    def predict(self, X):
        X = np.hstack((np.ones((len(X), 1)), X))
        return np.dot(X, self.w)

    def error(self, y_true, y_pred):
        # https://www.jianshu.com/p/3a98f33113ac
        # R^2 score: the larger the better; the best value is 1, and it can be negative when the fit is very poor
        return 1 - np.sum((y_pred - y_true) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
class LinearRegressionSelf2(object):
    """Gradient descent"""
    def __init__(self, save_file="model.ckpt"):
        self.save_file = save_file

    def __fit(self, X, y, w, b, lr=1e-3):
        # one gradient-descent step on a mini-batch
        diff = np.dot(X, w) + b - y
        w -= lr * (1 / len(y)) * (np.dot(np.transpose(X), diff))
        b -= lr * np.mean(diff)
        return w, b

    def fit(self, X, y, batch_size=32, epochs=50000, lr=5e-4):
        if not os.path.exists(self.save_file):
            length = len(y)
            m = len(y) // batch_size
            w = np.random.random((len(X[0]), 1))  # random initial value
            b = np.random.random((1, 1))          # random initial value
            for epoch in range(epochs):
                # shuffle the data
                index = np.arange(0, length)
                np.random.seed(epoch)
                np.random.shuffle(index)
                new_X = X[index]
                new_y = y[index]
                for i in range(m):
                    start = i * batch_size
                    end = min((i + 1) * batch_size, length)
                    w, b = self.__fit(new_X[start:end], new_y[start:end], w, b, lr)
                # print(w, b)
            # save the parameters
            pickle.dump({"w": w, "b": b}, open(self.save_file, "wb"))
        data = pickle.load(open(self.save_file, "rb"))
        self.w = data["w"]
        self.b = data["b"]

    def predict(self, X):
        return np.dot(X, self.w) + self.b

    def error(self, y_true, y_pred):
        # https://www.jianshu.com/p/3a98f33113ac
        # R^2 score: the larger the better; the best value is 1, and it can be negative when the fit is very poor
        return 1 - np.sum((y_pred - y_true) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2)
if __name__ == "__main__":
    dataPath = "../../dataset/boston.txt"
    X, y = loadData(dataPath)
    if len(y.shape) == 1: y = y[..., None]
    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

    start = time.time()
    clf = LinearRegressionSelf()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("error:", clf.error(y_test, y_pred))
    error = np.sum((y_pred - y_test) ** 2) / len(y_test)
    print("cost time:%.6f(s) error:%.3f" % (time.time() - start, error))
    """
    error: 0.7131946712017807
    cost time:0.000985(s) error:32.785
    """

    # the sklearn LinearRegression method
    start = time.time()
    clf = LinearRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    error = 1 - np.sum((y_pred - y_test) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)
    print("error:", error)
    error = np.sum((y_pred - y_test) ** 2) / len(y_test)
    print("cost time:%.6f(s) error:%.3f" % (time.time() - start, error))
    """
    error: 0.7215519718844166
    cost time:0.001995(s) error:31.830
    """