[Hands-on Python] Chapter 3 Linear Models -- Linear Regression, Logistic Regression (logit regression), Linear Discriminant Analysis (LDA)

Theory: see Notes (3) on Machine Learning (Zhou Zhihua), Chapter 3 Linear Models

Part 1. Linear Regression

Algorithm implementations: the normal equation method and gradient descent
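For reference, these are the two estimators the script below implements: the normal equation solves the least-squares problem in closed form, w = (XᵀX)⁻¹Xᵀy, while gradient descent starts from an initial w and repeats w ← w − α·(1/m)·Xᵀ(Xw − y) for a fixed number of iterations with learning rate α.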

linearRegression.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: linearRegression.py
# @Blog    : https://blog.csdn.net/lzbmc

'''
Linear regression: univariate (single-feature) and multivariate (multi-feature) regression.
The code below handles the univariate case. For multivariate data, just read the extra
feature columns into x; remember to normalize the data, and reduce the dimensionality
before plotting (a normalization sketch follows this script).
'''
import numpy as np
import matplotlib.pyplot as plt
from numpy import *
import pandas as pd

def loadDataSet():
    data = pd.read_csv('ex0.txt', header=None, delimiter='\t')  # no header row
    # print(data.head())  # first five rows by default; handy for a quick look at the data
    # # correlation coefficients between the features
    # r = data.corr()  # 0-0.3: weak; 0.3-0.6: moderate; 0.6-1: strong correlation
    # print(r)
    xArr = data.values[:, 0:2]  # the first column is all 1s (x0), so its weight w0 acts as the bias b  (200,2)
    yArr = data.values[:, -1]  # (200,)
    return xArr, yArr

def analyzeData(x, y):
    x = x[:, 1]  # (200,)
    plt.scatter(x, y, s=15)
    return plt

def plotLrEquation(weights, xArr, yArr):
    plt = analyzeData(xArr, yArr)
    xCopy = xArr.copy()
    xCopy.sort(0)  # sort by column so the line plots left to right
    y = xCopy * weights
    plt.plot(xCopy[:, 1], y, c='r')
    plt.show()

# A common performance measure for regression: mean squared error.
def lr_normalEquation(x, y):
    '''
    Method 1: the normal equation. Setting the derivative of the squared error
    to zero yields the closed-form solution w = (X^T X)^{-1} X^T y.
    '''
    xTx = x.T * x  # (n,m)*(m,n) = (n,n)
    if np.linalg.det(xTx) == 0:  # a singular matrix cannot be inverted
        print('This matrix is singular, cannot do inverse')
        return
    ws = xTx.I * x.T * y.T  # (n,n)*(n,m)*(m,1) = (n,1)
    return ws

def lr_gradientDecent(x, y, epoch):
    '''
    Method 2: gradient descent. Pick a learning rate and an iteration count, then update w step by step.
    '''
    alpha = 0.01
    m, n = x.shape
    weights = ones((n, 1))  # (n,1)
    cost = []  # record the cost after every iteration
    for i in range(epoch):
        error = x * weights - y.T  # (m,1)
        weights = weights - alpha * (1 / m) * x.T * error
        currCost = cost_function(x, y, weights)
        cost.append(currCost)
    return weights, cost

# Cost function: halved mean squared error
def cost_function(x, y, weights):
    m = x.shape[0]  # number of samples
    cost = 1 / (2 * m) * sum(np.power((x * weights - y.T), 2))
    return cost

# How the cost evolves with the number of iterations
def plot_epochCost(cost, epoch=1000):  # parameters with defaults must come after positional ones
    x = range(epoch)
    y = cost
    plt.plot(x, y)
    plt.xlabel('epoch')
    plt.ylabel('cost')
    plt.show()

if __name__ == "__main__":
    dataArr, labels = loadDataSet()
    xMat = np.mat(dataArr)  # (m,n)
    yMat = np.mat(labels)  # (1,m)

    # plt = analyzeData(dataArr, labels)
    # plt.show()

    # # least squares via the normal equation
    w = lr_normalEquation(xMat, yMat)
    print(w)
    plotLrEquation(w, dataArr, labels)

    # # gradient descent
    # epoch = 1000
    # w, cost = lr_gradientDecent(xMat, yMat, epoch)
    # plotLrEquation(w, dataArr, labels)
    # plot_epochCost(cost, epoch)
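As the docstring notes, the multivariate case usually needs feature scaling before gradient descent. A minimal sketch of mean-variance normalization (the helper name and usage line are illustrative, not part of the script above):

# feature normalization sketch for the multivariate case
import numpy as np

def normalize(x):
    # standardize each feature column to zero mean and unit variance;
    # keep mu and sigma so the same transform can be applied to new samples
    mu = x.mean(axis=0)
    sigma = x.std(axis=0)
    sigma[sigma == 0] = 1.0  # avoid division by zero for constant columns
    return (x - mu) / sigma, mu, sigma

# usage: xNorm, mu, sigma = normalize(xArr[:, 1:])  # scale every column except the x0 = 1 bias column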

sklearn implementation

linearRegression_sklearn.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: linearRegression_sklearn.py
# @Blog    : https://blog.csdn.net/lzbmc

import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd

def loadDataSet():
    data = pd.read_csv('ex0.txt', header=None, delimiter='\t')  # no header row
    # print(data.head())  # first five rows by default; handy for a quick look at the data
    # # correlation coefficients between the features
    # r = data.corr()  # 0-0.3: weak; 0.3-0.6: moderate; 0.6-1: strong correlation
    # print(r)
    trainSet = np.array(data)  # convert to an array
    xArr = trainSet[:, 0:2]  # the first column is all 1s (x0), so its weight w0 acts as the bias b  (200,2)
    yArr = trainSet[:, -1]  # (200,)
    return xArr, yArr

def lr_sklearn(xArr, yArr):
    model = LinearRegression()
    model.fit(xArr, yArr)
    weights = model.coef_  # regression coefficients
    bias = model.intercept_  # intercept
    return weights, bias

if __name__ == '__main__':
    dataArr, labels = loadDataSet()
    # xMat = np.mat(dataArr)  # (m,n)
    # yMat = np.mat(labels)  # (1,m)
    weights, bias = lr_sklearn(dataArr, labels)
    print(weights, bias)
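A quick usage note: instead of reading out coef_ and intercept_, the fitted model can predict directly. A sketch (the sample value below is illustrative):

    # inside lr_sklearn, after model.fit(...):
    # new_x = [[1.0, 0.5]]          # bias column x0 = 1 plus one feature, same layout as xArr
    # print(model.predict(new_x))   # equals new_x @ coef_ + intercept_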

Part 2. Logistic Regression (Section 3.3)

Algorithm implementation

  • Exercise 3.3: implement logistic regression and report the results on watermelon dataset 3.0α
    logisticRegression.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: logisticRegression.py
# @Blog    : https://blog.csdn.net/lzbmc
# Machine Learning in Action
'''
Implement logistic regression and report the results on watermelon dataset 3.0α
'''
import pandas as pd
import numpy as np
from numpy import *

# Load the data into a structured format
def loadDataSet0():
    raw_data = pd.read_csv('../watermelon3.0a')
    dataSet = raw_data.values[0:, 1:-1]
    m = dataSet.shape[0]
    x0 = ones(m)
    dataArr = np.insert(dataSet, 0, values=x0, axis=1)  # insert a column of 1s at index 0: the bias b becomes w0 with x0 = 1
    labelsArr = raw_data.values[0:, -1]  # <class 'numpy.ndarray'>
    return dataArr, labelsArr

def loadDataSet1():
    dataMat = []
    labelMat = []
    fr = open('../watermelon3.0a', encoding='utf-8')
    for line in fr.readlines():
        if '编号' in line:  # skip the header row ('编号' = sample ID)
            continue
        lineArr = line.strip().split(',')
        dataMat.append([1.0, float(lineArr[1]), float(lineArr[2])])  # b看做w0,对应x0=1
        labelMat.append(int(lineArr[3]))  # <class 'list'>
    return dataMat, labelMat

# matrix advantage: simple operators, e.g. * is matrix multiplication; with arrays you need .dot() instead.
# array advantage: not limited to two dimensions (3-D, 4-D, 5-D, ...), and arrays are what most Python code uses.

# Activation function
def sigmoid(inputX):
    return 1.0 / (1 + exp(-inputX))
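# Note (addition): exp(-inputX) can overflow for inputs of large magnitude, and numpy
# will emit a RuntimeWarning. A numerically stable drop-in, assuming scipy is available:
#     from scipy.special import expit
#     def sigmoid(inputX):
#         return expit(inputX)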

# Method 1: standard (batch) gradient ascent -- computes the gradient over the whole dataset, which is expensive. Gradient ascent finds a maximum; gradient descent finds a minimum.
def gradAscent(dataArr, classLabels):
    dataMat = mat(dataArr)  # <class 'numpy.matrixlib.defmatrix.matrix'>  convert the array to a matrix
    labelMat = mat(classLabels)  # transposing an ndarray is awkward and its * is element-wise, so convert to matrices
    # initialize the weight matrix
    m, n = shape(dataMat)
    weights = ones((n, 1))   # an array
    # print('standard:', type(weights))  # <class 'numpy.ndarray'>
    # set the learning rate (step size) and the number of iterations
    alpha = 0.001
    maxCycles = 500
    # update the parameters
    for i in range(maxCycles):
        predict = sigmoid(dataMat * weights)  # (m,1)
        error = labelMat.T - predict
        weights = weights + alpha * dataMat.T * error  # (n,m)*(m,1)
    # print('standard:', type(weights))  # <class 'numpy.matrixlib.defmatrix.matrix'>
    return weights   # a matrix

import matplotlib.pyplot as plt
# Analyze the data: plot the decision boundary
def plotBestFit(weights):
    dataSet, labels = loadDataSet0()  # each row of dataSet is one sample: the first column multiplies b, the other two columns are the features plotted as x and y
    # dataArr = array(dataSet)  # <class 'numpy.ndarray'> convert the list to an array when using loadDataSet1
    m = shape(dataSet)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(m):
        if int(labels[i]) == 1:
            xcord1.append(dataSet[i, 1])
            ycord1.append(dataSet[i, 2])
        else:
            xcord2.append(dataSet[i, 1])
            ycord2.append(dataSet[i, 2])
    fig = plt.figure()  # create the figure
    ax = fig.add_subplot(111)  # subplot
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-0.5, 1.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]  # the boundary satisfies 0 = w0*x0 + w1*x + w2*y
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

# Method 2: stochastic gradient ascent -- compute the gradient and update the parameters one sample at a time
def stocGradAscent0(dataArr, classLabels):
    m, n = shape(dataArr)
    alpha = 0.001
    weights = ones(n)  # <class 'numpy.ndarray'>, shape (n,)
    for iter in range(200):  # make 200 passes over the dataset
        for i in range(m):
            predict = sigmoid(sum(dataArr[i] * weights))  # a scalar
            error = classLabels[i] - predict
            weights = weights + alpha * dataArr[i] * error
    # print('stochastic:', type(weights))  # <class 'numpy.ndarray'>
    return weights   # an array

# Method 3: improved stochastic gradient ascent -- the step size shrinks as training proceeds, and each pass draws samples at random without replacement
def stocGradAscent1(dataArr, classLabels, numIter=150):  # extra parameter: number of passes
    m, n = shape(dataArr)
    weights = ones(n)
    for iter in range(numIter):
        dataIndex = list(range(m))  # list() is required in Python 3, where range returns a range object rather than a list
        for i in range(m):
            randIndex = int(random.uniform(0, len(dataIndex)))  # len(dataIndex) shrinks as the pass proceeds
            sampleIdx = dataIndex[randIndex]  # look up the remaining sample, so the deletion below actually removes it from play
            predict = sigmoid(sum(dataArr[sampleIdx] * weights))
            error = classLabels[sampleIdx] - predict
            alpha = 4 / (1 + iter + i) + 0.001  # decays with progress but never reaches zero
            weights = weights + alpha * dataArr[sampleIdx] * error
            del(dataIndex[randIndex])  # sample without replacement within this pass
    return weights  # an array

def classifyVector(inputX, weights):
    prob = sigmoid(sum(inputX * weights))  # matrix * is matrix multiplication; array * is element-wise, hence the sum
    # print(prob)
    if prob > 0.5:
        return 1.0
    else:
        return 0.0

if __name__ == '__main__':
    dataArr, labels = loadDataSet1()  # lists
    # dataArr0, labels0 = loadDataSet0()  # arrays

    weights = gradAscent(dataArr, labels)  # a matrix
    print(weights)
    plotBestFit(weights.getA())  # NOTE: a.getA() converts a numpy matrix to an array -- the inverse of mat(a)

    # weights2 = stocGradAscent0(array(dataArr), labels)  # an array
    # # weights2 = stocGradAscent0(dataArr, labels)
    # plotBestFit(weights2)

    # weights = stocGradAscent1(array(dataArr), labels)  # an array
    # print(weights)
    # plotBestFit(weights)
    testData = [1, 0.697, 0.460]
    result = classifyVector(testData, weights)
    print(result)
  • Machine Learning in Action: predicting the mortality rate of horses with colic
    colic.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: colic.py
# @Blog    : https://blog.csdn.net/lzbmc
# Machine Learning in Action: predicting the mortality rate of horses with colic

from logisticRegression import stocGradAscent1, classifyVector
from numpy import *
def colicTest():
    frTrain = open('../horseColic/horseColicTraining.txt')
    frTest = open('../horseColic/horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []  # the bias b could be included as a dummy feature w0 via lineArr = [1.0]
        lastIndex = len(currLine) - 1  # index of the last column (21)
        for i in range(lastIndex):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[-1]))
    # train the weights
    trainingWeights = stocGradAscent1(array(trainingSet), trainingLabels, numIter=500)
    # print(shape(trainingWeights))

    # evaluate on the test set
    errorCount = 0
    numTestSet = 0.0  # number of test samples
    for line in frTest.readlines():
        numTestSet += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainingWeights)) != int(currLine[-1]):
            errorCount += 1
    errorRate = float(errorCount) / numTestSet
    print('the error rate of this test is: %f' % errorRate)
    return errorRate

# Run the test several times and average the error rates
def multiTest(numTests=10):
    errorRateSum = 0.0
    for k in range(numTests):
        errorRateSum += colicTest()
        aveErrorRate = errorRateSum / float(k + 1)  # average over the runs completed so far
        print('after %d iterations the average error rate is: %f' % (k + 1, aveErrorRate))

if __name__ == '__main__':
    # colicTest()
    multiTest()

sklearn implementation

logisticRegression_sklearn.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: logisticRegression_sklearn.py
# @Blog    : https://blog.csdn.net/lzbmc

'''
The solver parameter selects the optimizer for the logistic loss. There are five choices:
a) liblinear: uses the open-source liblinear library, which iterates with coordinate descent. Good for small datasets.
b) lbfgs: a quasi-Newton method that uses the Hessian (the matrix of second derivatives of the loss).
c) newton-cg: another member of the Newton family; likewise iterates with the Hessian.
d) sag: stochastic average gradient descent, a variant of gradient descent that uses
        only a subset of the samples per iteration. Good when there are many samples; large datasets.
e) saga: a variant of sag; also for large datasets.
https://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
'''

from sklearn.linear_model import LogisticRegression
import pandas as pd

raw_train = pd.read_csv('../horseColic/horseColicTraining.txt', header=None, delimiter='\t')  # no header row
raw_test = pd.read_csv('../horseColic/horseColicTest.txt', header=None, delimiter='\t')
# print(type(raw_train))  # <class 'pandas.core.frame.DataFrame'>
# print(type(raw_train.values))  # <class 'numpy.ndarray'>

train = raw_train.values[:, 0:-1]  # all 21 feature columns, matching the test set below
trainLabels = raw_train.values[:, -1]

test = raw_test.values[:, 0:-1]
testLabels = raw_test.values[:, -1]

LR = LogisticRegression()
LR.fit(train, trainLabels)

score = LR.score(test, testLabels)
print(score)   # 0.7313432835820896
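To experiment with the solvers described above, pass the solver explicitly; a sketch (parameter values are illustrative, and max_iter is raised because the default 100 iterations may not converge on this data):

# LR = LogisticRegression(solver='liblinear')            # small datasets: coordinate descent
# LR = LogisticRegression(solver='saga', max_iter=5000)  # large datasets: stochastic average gradient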

  • Supplement: the difference between NumPy arrays and NumPy matrices
    array_matrix.py
import numpy as np

''' ndarray '''
x = np.array([[1, 2], [3, 4]])  # <class 'numpy.ndarray'>
y = np.array([[5, 6], [7, 8]])
print(type(x))
print('array *:', '\n', x * y)  # element-wise product
print('array dot:', '\n', np.dot(x, y))  # matrix product
print('array multiply:', '\n', np.multiply(x, y))  # element-wise product

a = np.array([1, 2])
print(a ** 2)  # element-wise square
print(np.power(a, 2))  # element-wise square

# transposing a 1-D array: use reshape
data = np.arange(5)
print(data, data.T)   # .T leaves a 1-D array unchanged
print(data.transpose())   # transpose() leaves it unchanged too
print(data.reshape((5,1)))

print('============== separator ==============')

''' matrix '''
a = np.mat([[1, 2], [3, 4]])  # <class 'numpy.matrixlib.defmatrix.matrix'>
b = np.mat([[5, 6], [7, 8]])
print(type(a))
print('mat *:', '\n', a * b)  # matrix product
print('mat dot:', '\n', np.dot(a, b))  # matrix product
print('mat multiply:', '\n', np.multiply(a, b))  # element-wise product

a = np.mat([[1, 2], [2, 3]])
print(a ** 2)  # ** on a matrix is a matrix power, so the matrix must be square
a = np.mat([1, 2])
print(np.power(a, 2))  # element-wise square

a = [1, 2, 3]  # multiplying a list by a numpy array gives the element-wise product
b = np.array([1, 1, 1])
print(a * b)
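One more point worth knowing: NumPy's own documentation discourages np.matrix for new code (it is slated for deprecation). Plain arrays plus the @ operator (Python 3.5+) give matrix multiplication without the ambiguity of *:

x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6], [7, 8]])
print(x @ y)   # matrix product, same as np.dot(x, y)
print(x * y)   # still the element-wise product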

Part 3. Linear Discriminant Analysis, LDA (Section 3.5)

Algorithm implementation & sklearn implementation

  • Exercise 3.5: implement linear discriminant analysis and report the results on watermelon dataset 3.0a
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author  : Cabbage
# @project : ML
# @FileName: LDA.py
# @Blog    : https://blog.csdn.net/lzbmc
# Reference: https://cloud.tencent.com/developer/article/1099252
'''
Implement linear discriminant analysis and report the results on watermelon dataset 3.0a
'''
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def processData():
    raw_data = pd.read_csv('../watermelon3.0a')
    x = raw_data.values[:, 1:-1]
    y = raw_data.values[:, -1]
    # split by class
    x0 = x[y == 0]  # (numX1, 2)
    x1 = x[y == 1]
    # print(x[y == 0, 0])  # first column of the rows where y == 0: [0.666 0.243 0.245 0.343 0.639 0.657 0.36  0.593 0.719]
    return x0, x1

def LDA(x0, x1):
    # per-feature means of each class
    u0 = np.array([x0[:, 0].mean(), x0[:, 1].mean()])   # (1,n) # u0 = x0.mean(0) is equivalent (column-wise mean)
    u1 = x1.mean(0)

    # within-class scatter matrix Sw; the LDA direction is w = Sw^{-1} (u0 - u1)
    sw = np.dot((x0 - u0).T, x0 - u0) + np.dot((x1 - u1).T, x1 - u1)
    W = np.dot(np.linalg.inv(sw), (u0 - u1).T)
    print(W)

    # make the Chinese plot labels render correctly
    plt.rcParams['font.sans-serif'] = [u'SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.scatter(x0[:, 0], x0[:, 1], s=15, color='g', label='坏瓜')
    plt.scatter(x1[:, 0], x1[:, 1], s=15, color='r', label='好瓜')
    plt.legend(loc='upper left')
    plt.xlabel('密度')
    plt.ylabel('含糖率')
    # plt.show()
    # draw the line W[0]*x + W[1]*y = 0.01 between x = 0.2 and x = 0.8
    pl = -(0.2 * W[0] - 0.01) / W[1]
    pr = -(0.8 * W[0] - 0.01) / W[1]
    plt.plot([0.2, 0.8], [pl, pr])
    plt.show()

def LDA_sklearn():
    raw_data = pd.read_csv('iris.data2', header=None)
    X = raw_data.values[:, 0:4]
    y = raw_data.values[:, -1]
    # reduce to one dimension
    lda = LinearDiscriminantAnalysis(n_components=1)
    '''use the linear discriminant to project the 4-D samples onto a single line'''
    lda.fit(X, y)
    print(lda.score(X, y))

    '''demonstrate LDA as dimensionality reduction'''
    X_r2 = lda.fit(X, y).transform(X)
    X_Zero = np.zeros(X_r2.shape)
    print(X_Zero.shape)
    '''plot the reduced data'''
    for c, i in zip('ryb', [0, 1, 2]):
        plt.scatter(X_r2[y == i], X_Zero[y == i], c=c, s=5)
    plt.legend()
    plt.grid()
    plt.show()

    '''split the data, then train and predict'''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    print(y_test)
    lda = LinearDiscriminantAnalysis(n_components=1)
    lda.fit(X_train, y_train)

    # y_pred = lda.predict(X_test)
    # score = accuracy_score(y_pred, y_test)
    # !!! the two commented lines above are equivalent to:
    score = lda.score(X_test, y_test)
    print('accuracy:', score)


if __name__ == '__main__':
    x0, x1 = processData()
    # LDA(x0, x1)

    LDA_sklearn()
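Once the projection direction W is found, classifying a new sample reduces to projecting it onto W and picking the nearer projected class mean. A minimal sketch built on LDA() above (it assumes the function is changed to return W, u0, u1 instead of only printing W):

# def predictLDA(x, W, u0, u1):
#     p = np.dot(x, W)  # scalar projection of the sample onto the LDA direction
#     # assign the class whose projected mean is closer
#     return 0 if abs(p - np.dot(u0, W)) < abs(p - np.dot(u1, W)) else 1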

Exercise 3.4: choose two UCI datasets and compare the logistic-regression error rates estimated by 10-fold cross-validation and by the leave-one-out method

Answer link
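A minimal sketch of that comparison with sklearn (the dataset here is an illustrative stand-in; substitute any two UCI datasets):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, LeaveOneOut
from sklearn.datasets import load_iris  # stand-in for a UCI dataset

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000)
# 10-fold cross-validation
acc10 = cross_val_score(model, X, y, cv=KFold(n_splits=10, shuffle=True, random_state=0)).mean()
# leave-one-out: one fold per sample
accLoo = cross_val_score(model, X, y, cv=LeaveOneOut()).mean()
print('10-fold error rate: %f' % (1 - acc10))
print('leave-one-out error rate: %f' % (1 - accLoo))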
