Watermelon Book Exercise 3.4 (Cross-Validation and Leave-One-Out)

3.4 Choose two UCI data sets and compare the error rates of logistic regression as estimated by 10-fold cross-validation and by leave-one-out.

The UCI data set used here is a breast-cancer data set. [Dataset-description figure omitted.]
Dataset link: Breast Cancer Coimbra

Code

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


def getDataSet(filename):
    data = pd.read_csv(filename)
    dataSet = data.values  # pd.DataFrame to np.array
    dataArr = dataSet[:, :-1]  # feature array
    labelArr = dataSet[:, -1]  # label array

    # Min-max normalization: scale each feature to [0, 1]
    # dataMean = dataArr.mean(axis=0)
    dataMax = dataArr.max(axis=0)
    dataMin = dataArr.min(axis=0)
    dataArr = (dataArr - dataMin) / (dataMax - dataMin)

    # Prepend a column of ones as the bias (intercept) term
    dataArr = np.insert(dataArr, 0,
                        np.ones(dataArr.shape[0]),
                        axis=1)
    # In the original data 1 means healthy and 2 means patient; map 2 to 0 for logistic regression
    labelArr[labelArr == 2] = 0
    return dataArr, labelArr

# # test getDataSet()
# dataArr, labelArr = getDataSet('dataR2.csv')
# print("dataArr's shape is ", dataArr.shape)
# print("labelArr's shape is ", labelArr.shape)
# print(labelArr)


def sigmoid(Z):
    return 1.0 / (1 + np.exp(-Z))
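
# A numerically stable variant (an added sketch, not used below): np.exp(-Z)
# overflows for very negative Z -- NumPy only warns and the sigmoid still
# saturates, but splitting on the sign of Z avoids the warning entirely.
def sigmoidStable(Z):
    out = np.empty_like(Z, dtype=float)
    pos = Z >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-Z[pos]))
    expZ = np.exp(Z[~pos])  # safe: Z < 0 here, so exp(Z) <= 1
    out[~pos] = expZ / (1.0 + expZ)
    return out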


def newton(dataArr, labelArr):
    """
    Fit logistic regression parameters by Newton's method.

    :param dataArr: input data set with shape (m, n)
    :param labelArr: the label of data set with shape (m, 1)

    :return: the fitted parameters beta and the training-error history
    """
    m, n = dataArr.shape
    labelArr = labelArr.reshape(-1, 1)
    beta = np.ones((n, 1))
    errList = []  # save training-error history

    z = np.dot(dataArr, beta)
    oldLBeta = 0
    # per-sample terms of the log-likelihood: -y*z + ln(1 + e^z), shape (m, 1);
    # np.logaddexp(0, z) computes ln(1 + e^z) without overflow
    newLBetaMat = -labelArr * z + np.logaddexp(0, z)
    newLBeta = np.sum(newLBetaMat)
    it = 0
    while abs(oldLBeta - newLBeta) > 1e-5:
        it += 1
        # py0 = p(y=0|x) with shape (m,1)
        py0 = sigmoid(-np.dot(dataArr, beta))
        py1 = 1 - py0
        # 'reshape(n)' get shape (n,); 'np.diag' get diagonal matrix with shape (m,m)
        p = np.diag((py0 * py1).reshape(m))

        # shape (m,n)
        dBetaMat = -dataArr * (labelArr - py1)
        # first derivative with shape (1, n)
        dBeta = np.sum(dBetaMat, axis=0, keepdims=True)
        # second derivative with shape (n, n)
        dBeta2 = dataArr.T.dot(p).dot(dataArr)
        try:
            dBeta2Inv = np.linalg.inv(dBeta2)
            # Newton update: beta <- beta - H^{-1} * gradient
            # shapes: (n,1) = (n,1) - (n,n).dot((n,1))
            beta = beta - np.dot(dBeta2Inv, dBeta.T)

            z = np.dot(dataArr, beta)
            oldLBeta = newLBeta
            newLBetaMat = -labelArr * z + np.logaddexp(0, z)
            newLBeta = np.sum(newLBetaMat)

            pre = predict(beta, dataArr)
            errorRate = cntErrRate(pre, labelArr)
            errList.append(errorRate)
        except np.linalg.LinAlgError:
            print("Hessian matrix is singular; "
                  "stopping Newton iteration at step ", it)
            break
    return beta, errList
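
# # test newton() -- a minimal usage sketch in the style of the getDataSet()
# # test above (newton() is defined but not called in main()):
# dataArr, labelArr = getDataSet('dataR2.csv')
# beta, errList = newton(dataArr, labelArr)
# print("final Newton training error rate is ", errList[-1])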


def gradDescent(dataArr, labelArr, alpha, T):
    """
    calculate logistic parameters by gradient descent

    :param dataArr: input data set with shape (m, n)
    :param labelArr: the label of data set with shape (m, 1)
    :param alpha: step length (learning rate)
    :param T: number of iterations
    :return: parameters
    """
    m, n = dataArr.shape
    labelArr = labelArr.reshape(-1, 1)
    errList = []

    beta = np.ones((n, 1))
    for t in range(T):
        # py1 = p(y=1|x) with shape (m,1)
        py1 = sigmoid(np.dot(dataArr, beta))
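        # Gradient of the negative log-likelihood (the book's first-derivative
        # formula): dl/dbeta = -sum_i x_i * (y_i - p1(x_i)), built row-wise below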
        dBetaMat = -dataArr * (labelArr - py1)
        # shape (1,n)
        dBeta = np.sum(dBetaMat, axis=0, keepdims=True)
        beta -= alpha * dBeta.T

        # test code
        # pre = predict(beta, dataArr)
        # errorRate = cntErrRate(pre, labelArr)
        # errList.append(errorRate)

    return beta


def predict(beta, dataArr):
    preArr = sigmoid(np.dot(dataArr, beta))
    preArr[preArr > 0.5] = 1
    preArr[preArr <= 0.5] = 0

    return preArr


def cntErrRate(preLabel, label):
    """
    calculate error rate
    :param preLabel: predict label
    :param label: real label
    :return: error rate
    """
    # print(f"preLabel = {preLabel}, label = {label}")
    m = len(preLabel)
    cnt = 0.0

    for i in range(m):
        if preLabel[i] != label[i]:
            cnt += 1.0
    return cnt / float(m)
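
# Vectorized equivalent (a sketch): the error rate is just the mean of
# mismatches, e.g. np.mean(np.ravel(preLabel) != np.ravel(label)).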


def kFold(dataArr, labelArr, k, alpha, T):
    """
    k-fold cross validation
    :param dataArr:
    :param labelArr:
    :param k:
    :param alpha: learning rate
    :param T: number of iterations
    :return: betaList (k parameter vectors) and errList (k error rates)
    """
    m, n = dataArr.shape
    index = np.arange(m)
    np.random.shuffle(index)
    dataArr = dataArr[index]
    labelArr = labelArr[index]

    errList = []
    betaList = []
    step = int(m / k)
    # Split the data into k folds; rows left over when m is not divisible
    # by k go into the last fold. Handle the first k-1 folds here.
    for i in range(k - 1):
        # get train data set and test data set
        trainData = np.concatenate((dataArr[:i*step], dataArr[(i+1)*step:]))
        trainLabel = np.concatenate((labelArr[:i*step], labelArr[(i+1)*step:]))
        testData = dataArr[i*step:(i+1)*step]
        testLabel = labelArr[i*step:(i+1)*step]

        # get predict
        beta = gradDescent(trainData, trainLabel, alpha, T)
        pre = predict(beta, testData)
        errList.append(cntErrRate(pre, testLabel))
        betaList.append(beta)

    # The last fold (including any remainder rows) serves as the test set
    beta = gradDescent(dataArr[:(k-1)*step],
                          labelArr[:(k-1)*step],
                          alpha, T)
    pre = predict(beta, dataArr[(k-1)*step:])
    errList.append(cntErrRate(pre, labelArr[(k-1)*step:]))
    betaList.append(beta)

    return betaList, errList


def LOO(dataArr, labelArr, alpha, T):
    """
    Leave-One-Out
    :param dataArr:
    :param labelArr:
    :param alpha:
    :param T:
    :return: errList
    """
    m, n = dataArr.shape
    errList = []
    for i in range(m):
        trainData = np.concatenate((dataArr[:i], dataArr[i+1:]))
        trainLabel = np.concatenate((labelArr[:i], labelArr[i+1:]))
        beta = gradDescent(trainData, trainLabel, alpha, T)
        pre = predict(beta, dataArr[i])
        errList.append(cntErrRate(pre, [labelArr[i]]))

    return errList
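
# Note: LOO fits one model per sample (m models in total), so it is much
# slower than a single k-fold pass, though its estimate is nearly unbiased.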



def main():
    dataArr, labelArr = getDataSet('dataR2.csv')

    # test kFold()
    k = 10
    learningRate = 0.01
    T = 1500
    betaList, errList = kFold(dataArr, labelArr, k, learningRate, T)
    averErr = sum(errList) / float(k)
    print(f"{k} fold average error rate is {averErr}")
    # Average over 10 runs of 10-fold cross-validation
    reps = 10
    kkErrList = []
    for _ in range(reps):
        _, errList = kFold(dataArr, labelArr, k, learningRate, T)
        kkErrList.append(sum(errList) / float(k))
    kkAverErr = sum(kkErrList) / float(reps)
    print(f"average error rate over {reps} runs of {k}-fold CV is {kkAverErr}")

    # test LOO()
    errList = LOO(dataArr, labelArr, 0.01, 1500)
    averErr = sum(errList) / float(len(errList))
    print(f"LOO error rate is {averErr}")


if __name__ == '__main__':
    main()

The script prints the 10-fold average error rate, the average over ten runs of 10-fold cross-validation, and the LOO error rate. [Results screenshot omitted.]
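As a cross-check (a sketch, not part of the original experiment; it assumes scikit-learn is available), the same comparison can be run with scikit-learn's LogisticRegression, cross_val_score, KFold, and LeaveOneOut:

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold, LeaveOneOut
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv('dataR2.csv').values
X = MinMaxScaler().fit_transform(data[:, :-1])   # same min-max scaling as above
y = (data[:, -1] == 1).astype(int)               # 1 = healthy -> 1, 2 = patient -> 0

clf = LogisticRegression(max_iter=1500)
err10 = 1 - cross_val_score(clf, X, y, cv=KFold(n_splits=10, shuffle=True)).mean()
errLoo = 1 - cross_val_score(clf, X, y, cv=LeaveOneOut()).mean()
print("10-fold error rate:", err10)
print("LOO error rate:", errLoo)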

Summary

1. The choice of hyperparameters (learning rate, iteration count) has a large effect; no systematic tuning was done here, only trial and error.

2. The initial results were quite poor. After min-max normalizing the data set the results improved markedly: when features differ by orders of magnitude, the larger-scale ones dominate the updates and hurt learning.
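
For illustration (a sketch; X stands for any (m, n) feature array), the min-max scaling used above and the common z-score alternative differ only in which statistics they subtract and divide by:

Xminmax = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))  # each feature in [0, 1]
Xzscore = (X - X.mean(axis=0)) / X.std(axis=0)                   # zero mean, unit variance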
