[Hands-on Python] Chapter 3 Linear Models -- Linear Regression, Logistic Regression (logit regression), Linear Discriminant Analysis (LDA)

Theory: see Notes (3) on Machine Learning (Zhou Zhihua), Chapter 3 Linear Models

Part 1. Linear Regression

Algorithm implementations: the normal equation method and gradient descent
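For reference, these are the two estimators the script below implements: the normal equation solves the least-squares problem in closed form, w = (XᵀX)⁻¹Xᵀy, while gradient descent starts from an initial w and repeats w ← w − α·(1/m)·Xᵀ(Xw − y) for a fixed number of iterations with learning rate α.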

linearRegression.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: linearRegression.py
# @Blog    : https://blog.csdn.net/lzbmc

'''
Linear regression: univariate (single-feature) and multivariate (multi-feature) regression.
The code below handles the univariate case. For multivariate data, just read the extra
feature columns into x; remember to normalize the data, and reduce the dimensionality
before plotting (a normalization sketch follows this script).
'''
import numpy as np
import matplotlib.pyplot as plt
from numpy import *
import pandas as pd

def loadDataSet():
    data = pd.read_csv('ex0.txt', header=None, delimiter='\t')  # no header row
    # print(data.head())  # first five rows by default; handy for a quick look at the data
    # # correlation coefficients between the features
    # r = data.corr()  # 0-0.3: weak; 0.3-0.6: moderate; 0.6-1: strong correlation
    # print(r)
    xArr = data.values[:, 0:2]  # the first column is all 1s (x0), so its weight w0 acts as the bias b  (200,2)
    yArr = data.values[:, -1]  # (200,)
    return xArr, yArr

def analyzeData(x, y):
    x = x[:, 1]  # (200,)
    plt.scatter(x, y, s=15)
    return plt

def plotLrEquation(weights, xArr, yArr):
    plt = analyzeData(xArr, yArr)
    xCopy = xArr.copy()
    xCopy.sort(0)  # sort by column so the line plots left to right
    y = xCopy * weights
    plt.plot(xCopy[:, 1], y, c='r')
    plt.show()

# A common performance measure for regression: mean squared error.
def lr_normalEquation(x, y):
    '''
    Method 1: the normal equation. Setting the derivative of the squared error
    to zero yields the closed-form solution w = (X^T X)^{-1} X^T y.
    '''
    xTx = x.T * x  # (n,m)*(m,n) = (n,n)
    if np.linalg.det(xTx) == 0:  # a singular matrix cannot be inverted
        print('This matrix is singular, cannot do inverse')
        return
    ws = xTx.I * x.T * y.T  # (n,n)*(n,m)*(m,1) = (n,1)
    return ws

def lr_gradientDecent(x, y, epoch):
    '''
    Method 2: gradient descent. Pick a learning rate and an iteration count, then update w step by step.
    '''
    alpha = 0.01
    m, n = x.shape
    weights = ones((n, 1))  # (n,1)
    cost = []  # record the cost after every iteration
    for i in range(epoch):
        error = x * weights - y.T  # (m,1)
        weights = weights - alpha * (1 / m) * x.T * error
        currCost = cost_function(x, y, weights)
        cost.append(currCost)
    return weights, cost

# Cost function: halved mean squared error
def cost_function(x, y, weights):
    m = x.shape[0]  # number of samples
    cost = 1 / (2 * m) * sum(np.power((x * weights - y.T), 2))
    return cost

# How the cost evolves with the number of iterations
def plot_epochCost(cost, epoch=1000):  # parameters with defaults must come after positional ones
    x = range(epoch)
    y = cost
    plt.plot(x, y)
    plt.xlabel('epoch')
    plt.ylabel('cost')
    plt.show()

if __name__ == "__main__":
    dataArr, labels = loadDataSet()
    xMat = np.mat(dataArr)  # (m,n)
    yMat = np.mat(labels)  # (1,m)

    # plt = analyzeData(dataArr, labels)
    # plt.show()

    # # least squares via the normal equation
    w = lr_normalEquation(xMat, yMat)
    print(w)
    plotLrEquation(w, dataArr, labels)

    # # gradient descent
    # epoch = 1000
    # w, cost = lr_gradientDecent(xMat, yMat, epoch)
    # plotLrEquation(w, dataArr, labels)
    # plot_epochCost(cost, epoch)
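As the docstring notes, the multivariate case usually needs feature scaling before gradient descent. A minimal sketch of mean-variance normalization (the helper name and usage line are illustrative, not part of the script above):

# feature normalization sketch for the multivariate case
import numpy as np

def normalize(x):
    # standardize each feature column to zero mean and unit variance;
    # keep mu and sigma so the same transform can be applied to new samples
    mu = x.mean(axis=0)
    sigma = x.std(axis=0)
    sigma[sigma == 0] = 1.0  # avoid division by zero for constant columns
    return (x - mu) / sigma, mu, sigma

# usage: xNorm, mu, sigma = normalize(xArr[:, 1:])  # scale every column except the x0 = 1 bias column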

sklearn implementation

linearRegression_sklearn.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: linearRegression_sklearn.py
# @Blog    : https://blog.csdn.net/lzbmc

import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd

def loadDataSet():
    data = pd.read_csv('ex0.txt', header=None, delimiter='\t')  # no header row
    # print(data.head())  # first five rows by default; handy for a quick look at the data
    # # correlation coefficients between the features
    # r = data.corr()  # 0-0.3: weak; 0.3-0.6: moderate; 0.6-1: strong correlation
    # print(r)
    trainSet = np.array(data)  # convert to an array
    xArr = trainSet[:, 0:2]  # the first column is all 1s (x0), so its weight w0 acts as the bias b  (200,2)
    yArr = trainSet[:, -1]  # (200,)
    return xArr, yArr

def lr_sklearn(xArr, yArr):
    model = LinearRegression()
    model.fit(xArr, yArr)
    weights = model.coef_  # regression coefficients
    bias = model.intercept_  # intercept
    return weights, bias

if __name__ == '__main__':
    dataArr, labels = loadDataSet()
    # xMat = np.mat(dataArr)  # (m,n)
    # yMat = np.mat(labels)  # (1,m)
    weights, bias = lr_sklearn(dataArr, labels)
    print(weights, bias)
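A quick usage note: instead of reading out coef_ and intercept_, the fitted model can predict directly. A sketch (the sample value below is illustrative):

    # inside lr_sklearn, after model.fit(...):
    # new_x = [[1.0, 0.5]]          # bias column x0 = 1 plus one feature, same layout as xArr
    # print(model.predict(new_x))   # equals new_x @ coef_ + intercept_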

Part 2. Logistic Regression (Section 3.3)

Algorithm implementation

  • Exercise 3.3: implement logistic regression and report the results on watermelon dataset 3.0α
    logisticRegression.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: logisticRegression.py
# @Blog    : https://blog.csdn.net/lzbmc
# Machine Learning in Action
'''
Implement logistic regression and report the results on watermelon dataset 3.0α
'''
import pandas as pd
import numpy as np
from numpy import *

# Load the data into a structured format
def loadDataSet0():
    raw_data = pd.read_csv('../watermelon3.0a')
    dataSet = raw_data.values[0:, 1:-1]
    m = dataSet.shape[0]
    x0 = ones(m)
    dataArr = np.insert(dataSet, 0, values=x0, axis=1)  # insert a column of 1s at index 0: the bias b becomes w0 with x0 = 1
    labelsArr = raw_data.values[0:, -1]  # <class 'numpy.ndarray'>
    return dataArr, labelsArr

def loadDataSet1():
    dataMat = []
    labelMat = []
    fr = open('../watermelon3.0a', encoding='utf-8')
    for line in fr.readlines():
        if '编号' in line:  # skip the header row ('编号' = sample ID)
            continue
        lineArr = line.strip().split(',')
        dataMat.append([1.0, float(lineArr[1]), float(lineArr[2])])  # b看做w0,对应x0=1
        labelMat.append(int(lineArr[3]))  # <class 'list'>
    return dataMat, labelMat

# matrix advantage: simple operators, e.g. * is matrix multiplication; with arrays you need .dot() instead.
# array advantage: not limited to two dimensions (3-D, 4-D, 5-D, ...), and arrays are what most Python code uses.

# Activation function
def sigmoid(inputX):
    return 1.0 / (1 + exp(-inputX))
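# Note (addition): exp(-inputX) can overflow for inputs of large magnitude, and numpy
# will emit a RuntimeWarning. A numerically stable drop-in, assuming scipy is available:
#     from scipy.special import expit
#     def sigmoid(inputX):
#         return expit(inputX)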

# Method 1: standard (batch) gradient ascent -- computes the gradient over the whole dataset, which is expensive. Gradient ascent finds a maximum; gradient descent finds a minimum.
def gradAscent(dataArr, classLabels):
    dataMat = mat(dataArr)  # <class 'numpy.matrixlib.defmatrix.matrix'>  convert the array to a matrix
    labelMat = mat(classLabels)  # transposing an ndarray is awkward and its * is element-wise, so convert to matrices
    # initialize the weight matrix
    m, n = shape(dataMat)
    weights = ones((n, 1))   # an array
    # print('standard:', type(weights))  # <class 'numpy.ndarray'>
    # set the learning rate (step size) and the number of iterations
    alpha = 0.001
    maxCycles = 500
    # update the parameters
    for i in range(maxCycles):
        predict = sigmoid(dataMat * weights)  # (m,1)
        error = labelMat.T - predict
        weights = weights + alpha * dataMat.T * error  # (n,m)*(m,1)
    # print('standard:', type(weights))  # <class 'numpy.matrixlib.defmatrix.matrix'>
    return weights   # a matrix

import matplotlib.pyplot as plt
# Analyze the data: plot the decision boundary
def plotBestFit(weights):
    dataSet, labels = loadDataSet0()  # each row of dataSet is one sample: the first column multiplies b, the other two columns are the features plotted as x and y
    # dataArr = array(dataSet)  # <class 'numpy.ndarray'> convert the list to an array when using loadDataSet1
    m = shape(dataSet)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(m):
        if int(labels[i]) == 1:
            xcord1.append(dataSet[i, 1])
            ycord1.append(dataSet[i, 2])
        else:
            xcord2.append(dataSet[i, 1])
            ycord2.append(dataSet[i, 2])
    fig = plt.figure()  # create the figure
    ax = fig.add_subplot(111)  # subplot
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-0.5, 1.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]  # the boundary satisfies 0 = w0*x0 + w1*x + w2*y
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

# Method 2: stochastic gradient ascent -- compute the gradient and update the parameters one sample at a time
def stocGradAscent0(dataArr, classLabels):
    m, n = shape(dataArr)
    alpha = 0.001
    weights = ones(n)  # <class 'numpy.ndarray'>, shape (n,)
    for iter in range(200):  # make 200 passes over the dataset
        for i in range(m):
            predict = sigmoid(sum(dataArr[i] * weights))  # a scalar
            error = classLabels[i] - predict
            weights = weights + alpha * dataArr[i] * error
    # print('stochastic:', type(weights))  # <class 'numpy.ndarray'>
    return weights   # an array

# Method 3: improved stochastic gradient ascent -- the step size shrinks as training proceeds, and each pass draws samples at random without replacement
def stocGradAscent1(dataArr, classLabels, numIter=150):  # extra parameter: number of passes
    m, n = shape(dataArr)
    weights = ones(n)
    for iter in range(numIter):
        dataIndex = list(range(m))  # list() is required in Python 3, where range returns a range object rather than a list
        for i in range(m):
            randIndex = int(random.uniform(0, len(dataIndex)))  # len(dataIndex) shrinks as the pass proceeds
            sampleIdx = dataIndex[randIndex]  # look up the remaining sample, so the deletion below actually removes it from play
            predict = sigmoid(sum(dataArr[sampleIdx] * weights))
            error = classLabels[sampleIdx] - predict
            alpha = 4 / (1 + iter + i) + 0.001  # decays with progress but never reaches zero
            weights = weights + alpha * dataArr[sampleIdx] * error
            del(dataIndex[randIndex])  # sample without replacement within this pass
    return weights  # an array

def classifyVector(inputX, weights):
    prob = sigmoid(sum(inputX * weights))  # matrix * is matrix multiplication; array * is element-wise, hence the sum
    # print(prob)
    if prob > 0.5:
        return 1.0
    else:
        return 0.0

if __name__ == '__main__':
    dataArr, labels = loadDataSet1()  # lists
    # dataArr0, labels0 = loadDataSet0()  # arrays

    weights = gradAscent(dataArr, labels)  # a matrix
    print(weights)
    plotBestFit(weights.getA())  # NOTE: a.getA() converts a numpy matrix to an array -- the inverse of mat(a)

    # weights2 = stocGradAscent0(array(dataArr), labels)  # an array
    # # weights2 = stocGradAscent0(dataArr, labels)
    # plotBestFit(weights2)

    # weights = stocGradAscent1(array(dataArr), labels)  # an array
    # print(weights)
    # plotBestFit(weights)
    testData = [1, 0.697, 0.460]
    result = classifyVector(testData, weights)
    print(result)
  • Machine Learning in Action: predicting the mortality rate of horses with colic
    colic.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: colic.py
# @Blog    : https://blog.csdn.net/lzbmc
# Machine Learning in Action: predicting the mortality rate of horses with colic

from logisticRegression import stocGradAscent1, classifyVector
from numpy import *
def colicTest():
    frTrain = open('../horseColic/horseColicTraining.txt')
    frTest = open('../horseColic/horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []  # the bias b could be included as a dummy feature w0 via lineArr = [1.0]
        lastIndex = len(currLine) - 1  # index of the last column (21)
        for i in range(lastIndex):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[-1]))
    # train the weights
    trainingWeights = stocGradAscent1(array(trainingSet), trainingLabels, numIter=500)
    # print(shape(trainingWeights))

    # evaluate on the test set
    errorCount = 0
    numTestSet = 0.0  # number of test samples
    for line in frTest.readlines():
        numTestSet += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainingWeights)) != int(currLine[-1]):
            errorCount += 1
    errorRate = float(errorCount) / numTestSet
    print('the error rate of this test is: %f' % errorRate)
    return errorRate

# Run the test several times and average the error rates
def multiTest(numTests=10):
    errorRateSum = 0.0
    for k in range(numTests):
        errorRateSum += colicTest()
        aveErrorRate = errorRateSum / float(k + 1)  # average over the runs completed so far
        print('after %d iterations the average error rate is: %f' % (k + 1, aveErrorRate))

if __name__ == '__main__':
    # colicTest()
    multiTest()

sklearn implementation

logisticRegression_sklearn.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: logisticRegression_sklearn.py
# @Blog    : https://blog.csdn.net/lzbmc

'''
The solver parameter selects the optimizer for the logistic loss. There are five choices:
a) liblinear: uses the open-source liblinear library, which iterates with coordinate descent. Good for small datasets.
b) lbfgs: a quasi-Newton method that uses the Hessian (the matrix of second derivatives of the loss).
c) newton-cg: another member of the Newton family; likewise iterates with the Hessian.
d) sag: stochastic average gradient descent, a variant of gradient descent that uses
        only a subset of the samples per iteration. Good when there are many samples; large datasets.
e) saga: a variant of sag; also for large datasets.
https://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
'''

from sklearn.linear_model import LogisticRegression
import pandas as pd

raw_train = pd.read_csv('../horseColic/horseColicTraining.txt', header=None, delimiter='\t')  # no header row
raw_test = pd.read_csv('../horseColic/horseColicTest.txt', header=None, delimiter='\t')
# print(type(raw_train))  # <class 'pandas.core.frame.DataFrame'>
# print(type(raw_train.values))  # <class 'numpy.ndarray'>

train = raw_train.values[:, 0:-1]  # all 21 feature columns, matching the test set below
trainLabels = raw_train.values[:, -1]

test = raw_test.values[:, 0:-1]
testLabels = raw_test.values[:, -1]

LR = LogisticRegression()
LR.fit(train, trainLabels)

score = LR.score(test, testLabels)
print(score)   # 0.7313432835820896
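To experiment with the solvers described above, pass the solver explicitly; a sketch (parameter values are illustrative, and max_iter is raised because the default 100 iterations may not converge on this data):

# LR = LogisticRegression(solver='liblinear')            # small datasets: coordinate descent
# LR = LogisticRegression(solver='saga', max_iter=5000)  # large datasets: stochastic average gradient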

  • Supplement: the difference between NumPy arrays and NumPy matrices
    array_matrix.py
import numpy as np

''' ndarray '''
x = np.array([[1, 2], [3, 4]])  # <class 'numpy.ndarray'>
y = np.array([[5, 6], [7, 8]])
print(type(x))
print('array *:', '\n', x * y)  # element-wise product
print('array dot:', '\n', np.dot(x, y))  # matrix product
print('array multiply:', '\n', np.multiply(x, y))  # element-wise product

a = np.array([1, 2])
print(a ** 2)  # element-wise square
print(np.power(a, 2))  # element-wise square

# transposing a 1-D array: use reshape
data = np.arange(5)
print(data, data.T)   # .T leaves a 1-D array unchanged
print(data.transpose())   # transpose() leaves it unchanged too
print(data.reshape((5,1)))

print('============== separator ==============')

''' matrix '''
a = np.mat([[1, 2], [3, 4]])  # <class 'numpy.matrixlib.defmatrix.matrix'>
b = np.mat([[5, 6], [7, 8]])
print(type(a))
print('mat *:', '\n', a * b)  # matrix product
print('mat dot:', '\n', np.dot(a, b))  # matrix product
print('mat multiply:', '\n', np.multiply(a, b))  # element-wise product

a = np.mat([[1, 2], [2, 3]])
print(a ** 2)  # ** on a matrix is a matrix power, so the matrix must be square
a = np.mat([1, 2])
print(np.power(a, 2))  # element-wise square

a = [1, 2, 3]  # multiplying a list by a numpy array gives the element-wise product
b = np.array([1, 1, 1])
print(a * b)
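One more point worth knowing: NumPy's own documentation discourages np.matrix for new code (it is slated for deprecation). Plain arrays plus the @ operator (Python 3.5+) give matrix multiplication without the ambiguity of *:

x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])
print(x @ y)   # matrix product, same as np.dot(x, y)
print(x * y)   # still the element-wise product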

Part 3. Linear Discriminant Analysis, LDA (Section 3.5)

Algorithm implementation & sklearn implementation

  • Exercise 3.5: implement linear discriminant analysis and report the results on watermelon dataset 3.0a
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: LDA.py
# @Blog    : https://blog.csdn.net/lzbmc
# Reference: https://cloud.tencent.com/developer/article/1099252
'''
Implement linear discriminant analysis and report the results on watermelon dataset 3.0a
'''
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def processData():
    raw_data = pd.read_csv('../watermelon3.0a')
    x = raw_data.values[:, 1:-1]
    y = raw_data.values[:, -1]
    # split by class
    x0 = x[y == 0]  # (numX1, 2)
    x1 = x[y == 1]
    # print(x[y == 0, 0])  # first column of the rows where y == 0: [0.666 0.243 0.245 0.343 0.639 0.657 0.36  0.593 0.719]
    return x0, x1

def LDA(x0, x1):
    # per-feature means of each class
    u0 = np.array([x0[:, 0].mean(), x0[:, 1].mean()])   # (1,n) # u0 = x0.mean(0) is equivalent (column-wise mean)
    u1 = x1.mean(0)

    # within-class scatter matrix Sw; the LDA direction is w = Sw^{-1} (u0 - u1)
    sw = np.dot((x0 - u0).T, x0 - u0) + np.dot((x1 - u1).T, x1 - u1)
    W = np.dot(np.linalg.inv(sw), (u0 - u1).T)
    print(W)

    # make the Chinese plot labels render correctly
    plt.rcParams['font.sans-serif'] = [u'SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.scatter(x0[:, 0], x0[:, 1], s=15, color='g', label='坏瓜')
    plt.scatter(x1[:, 0], x1[:, 1], s=15, color='r', label='好瓜')
    plt.legend(loc='upper left')
    plt.xlabel('密度')
    plt.ylabel('含糖率')
    # plt.show()
    # draw the line W[0]*x + W[1]*y = 0.01 between x = 0.2 and x = 0.8
    pl = -(0.2 * W[0] - 0.01) / W[1]
    pr = -(0.8 * W[0] - 0.01) / W[1]
    plt.plot([0.2, 0.8], [pl, pr])
    plt.show()

def LDA_sklearn():
    raw_data = pd.read_csv('iris.data2', header=None)
    X = raw_data.values[:, 0:4]
    y = raw_data.values[:, -1]
    # reduce to one dimension
    lda = LinearDiscriminantAnalysis(n_components=1)
    '''use the linear discriminant to project the 4-D samples onto a single line'''
    lda.fit(X, y)
    print(lda.score(X, y))

    '''demonstrate LDA as dimensionality reduction'''
    X_r2 = lda.fit(X, y).transform(X)
    X_Zero = np.zeros(X_r2.shape)
    print(X_Zero.shape)
    '''plot the reduced data'''
    for c, i in zip('ryb', [0, 1, 2]):
        plt.scatter(X_r2[y == i], X_Zero[y == i], c=c, s=5)
    plt.legend()
    plt.grid()
    plt.show()

    '''split the data, then train and predict'''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    print(y_test)
    lda = LinearDiscriminantAnalysis(n_components=1)
    lda.fit(X_train, y_train)

    # y_pred = lda.predict(X_test)
    # score = accuracy_score(y_pred, y_test)
    # !!! the two commented lines above are equivalent to:
    score = lda.score(X_test, y_test)
    print('accuracy:', score)


if __name__ == '__main__':
    x0, x1 = processData()
    # LDA(x0, x1)

    LDA_sklearn()
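Once the projection direction W is found, classifying a new sample reduces to projecting it onto W and picking the nearer projected class mean. A minimal sketch built on LDA() above (it assumes the function is changed to return W, u0, u1 instead of only printing W):

# def predictLDA(x, W, u0, u1):
#     p = np.dot(x, W)  # scalar projection of the sample onto the LDA direction
#     # assign the class whose projected mean is closer
#     return 0 if abs(p - np.dot(u0, W)) < abs(p - np.dot(u1, W)) else 1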

Exercise 3.4: choose two UCI datasets and compare the logistic-regression error rates estimated by 10-fold cross-validation and by the leave-one-out method

Answer link
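A minimal sketch of that comparison with sklearn (the dataset here is an illustrative stand-in; substitute any two UCI datasets):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, LeaveOneOut
from sklearn.datasets import load_iris  # stand-in for a UCI dataset

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000)
# 10-fold cross-validation
acc10 = cross_val_score(model, X, y, cv=KFold(n_splits=10, shuffle=True, random_state=0)).mean()
# leave-one-out: one fold per sample
accLoo = cross_val_score(model, X, y, cv=LeaveOneOut()).mean()
print('10-fold error rate: %f' % (1 - acc10))
print('leave-one-out error rate: %f' % (1 - accLoo))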
