逻辑回归判断学生能否被大学录取

最新推荐文章于 2023-03-12 12:55:45 发布

波尔德

最新推荐文章于 2023-03-12 12:55:45 发布

阅读量1.1k

点赞数

分类专栏：笔记学习生活

本文链接：https://blog.csdn.net/weixin_44025103/article/details/125115243

版权

逻辑回归 python 机器学习

学习生活同时被 2 个专栏收录

129 篇文章 0 订阅

订阅专栏

笔记

112 篇文章 7 订阅

订阅专栏

说明：我们将建立一个逻辑回归模型来预测一个学生是否被大学录取。假设你是大学某个系的管理员，你想根据两次考试的结果来决定每个申请人的录取机会。你有以前申请人的历史数据，可以用它作为逻辑回归的训练集。对于每一个训练样本，你都有申请人两次考试的分数以及最终的录取结果。为了做到这一点，我们将建立一个分类模型，根据考试成绩估计录取概率。

# pycharm完整代码
# import 三大件
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the training data. The file has no header row, so column names are
# supplied explicitly.
path = 'LogiReg_data.txt'
pdData = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])

# Number of training samples. This module-level name is read by the
# gradient-descent helpers defined later in the file, so it is derived from the
# data instead of being hard-coded. (A `global` statement at module level has
# no effect and is therefore not needed.)
n = len(pdData)

print(pdData.head(8))
print(pdData.shape)  # the original author notes this prints 100 rows, 3 columns


def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^(-z)) to ``z``.

    ``np.exp`` is evaluated element-wise, so an array input produces an
    array of the same shape; a scalar input produces a scalar.
    """
    decay = np.exp(-z)
    return 1 / (1 + decay)


# Plot the sigmoid function as a quick visual check.
# np.arange(-10, 10) yields the 20 integers -10, -9, ..., 9; the stop value 10
# itself is excluded.
nums = np.arange(-10, 10)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(nums, sigmoid(nums), 'r')  # 'r' draws the curve in red
plt.show()


def model(X, theta):
    """Return the prediction sigmoid(X . theta^T).

    The linear score is the matrix product of ``X`` with the transpose of
    ``theta``; that score is then passed through ``sigmoid``.

    In this file ``X`` is built with an extra leading column of ones and
    ``theta`` is created as a (1, 3) row vector, so the intercept is handled
    by the same matrix product as the other parameters.
    """
    linear_score = np.dot(X, theta.T)
    return sigmoid(linear_score)


# Prepend a constant column named 'Ones' (every value is 1) so the intercept
# can be handled by the same matrix product as the other parameters.
pdData.insert(0, 'Ones', 1)

# Set X (training data) and y (target labels).
# Convert the pandas representation to a NumPy array for the computations below.
orig_data = pdData.values
cols = orig_data.shape[1]
X = orig_data[:, :cols - 1]  # every column except the last one
y = orig_data[:, cols - 1:]  # the last column only, kept as a 2-D column
print('X:\n', X[:6])
print('y:\n', y[:6])

# Parameters start at zero, stored as a 1 x 3 row vector.
theta = np.zeros((1, 3))
print(theta)


def cost(X, y, theta):
    """Return the mean cross-entropy loss of ``model(X, theta)`` against ``y``.

    Args:
        X: training data passed straight to ``model``.
        y: labels, combined element-wise with the predictions.
        theta: model parameters passed straight to ``model``.

    Returns:
        sum(-y * log(h) - (1 - y) * log(1 - h)) / len(X), where h is the
        (clipped) model prediction.
    """
    # Evaluate the model once and reuse the result for both terms.
    # Clip predictions away from exactly 0 and 1: when the sigmoid saturates,
    # log(0) is -inf and 0 * -inf is nan, which would poison the loss (and any
    # stopping rule that compares successive losses).
    eps = np.finfo(float).eps
    h = np.clip(model(X, theta), eps, 1 - eps)

    left = np.multiply(-y, np.log(h))
    right = np.multiply(1 - y, np.log(1 - h))
    return np.sum(left - right) / len(X)


# 求解偏导数（梯度）
def gradient(X, y, theta):
    """Return the gradient of the mean cross-entropy loss w.r.t. ``theta``.

    Component j of the result is -sum_i(error_i * X[i, j]) / len(X), where
    error = y - model(X, theta).

    Args:
        X: training data; assumed to be a 2-D array with one column per
            parameter.
        y: labels matching the shape of ``model(X, theta)``.
        theta: current parameters (created as a 1 x 3 row vector in this file).

    Returns:
        An array with the same shape as ``theta``.
    """
    # Flatten the residual to 1-D so one vector-matrix product yields the
    # per-parameter sums for all columns at once.
    error = (y - model(X, theta)).ravel()
    grad = -np.dot(error, X) / len(X)
    # Lay the result out exactly like theta so the update rule can subtract it.
    return grad.reshape(theta.shape)


# Identifiers for the three stopping strategies understood by stopCriterion:
#   STOP_ITER (0) - stop once the iteration counter exceeds the threshold
#   STOP_COST (1) - stop once the last two costs differ by less than the threshold
#   STOP_GRAD (2) - stop once the gradient norm drops below the threshold
STOP_ITER, STOP_COST, STOP_GRAD = range(3)


def stopCriterion(type, value, threshold):
    """Decide whether iteration should stop under the given strategy.

    ``value`` is interpreted according to ``type``:
      * STOP_ITER: a number compared with ``value > threshold``.
      * STOP_COST: a sequence whose last two entries are compared.
      * STOP_GRAD: an array whose norm is compared with ``threshold``.

    Returns None when ``type`` is none of the three known strategies.
    """
    if type == STOP_GRAD:
        return np.linalg.norm(value) < threshold
    if type == STOP_COST:
        latest, previous = value[-1], value[-2]
        return abs(latest - previous) < threshold
    if type == STOP_ITER:
        return value > threshold
    return None


import numpy.random


# 洗牌
def shuffleData(data):
    """Shuffle the rows of ``data`` in place and split it into features/labels.

    ``np.random.shuffle`` reorders the rows of the array that was passed in,
    so the caller's array is modified.

    Returns:
        (X, y): X is every column except the last (training data); y is the
        last column only, kept two-dimensional (labels).
    """
    np.random.shuffle(data)
    features, labels = data[:, :-1], data[:, -1:]
    return features, labels


import time


# batchSize = 1 即为随机梯度下降
# batchSize = 26 即为 mini-batch（小批量梯度下降）
# batchSize = 总体，即为批量梯度下降
# stopType 停止策略
# thresh 停止的阈值
# alpha 学习率

def descent(data, theta, batchSize, stopType, thresh, alpha):
    """Fit ``theta`` by gradient descent over batches of ``data``.

    Args:
        data: 2-D array whose last column holds the labels. It is shuffled
            in place by ``shuffleData``.
        theta: initial parameters.
        batchSize: number of rows used per update (1 = stochastic, the full
            row count = batch gradient descent, anything else = mini-batch).
        stopType: one of STOP_ITER, STOP_COST, STOP_GRAD.
        thresh: threshold handed to ``stopCriterion``.
        alpha: learning rate.

    Returns:
        (theta, i - 1, costs, grad, elapsed_seconds), where ``i`` is the
        number of parameter updates performed, ``costs`` holds the initial
        cost followed by the cost after every update, and ``grad`` is the
        last gradient computed.

    Raises:
        ValueError: if ``stopType`` is not one of the three known strategies.
    """
    init_time = time.time()
    # Use the data's own row count for the epoch boundary rather than a
    # module-level global that has to be kept in sync by hand.
    n_samples = data.shape[0]

    i = 0  # number of updates performed so far
    batch_k = 0  # start row of the current batch
    X, y = shuffleData(data)
    costs = [cost(X, y, theta)]

    while True:
        batch = slice(batch_k, batch_k + batchSize)
        grad = gradient(X[batch], y[batch], theta)
        batch_k += batchSize
        if batch_k >= n_samples:
            # Every row has been visited: start a new pass in a fresh order.
            batch_k = 0
            X, y = shuffleData(data)
        theta = theta - alpha * grad  # parameter update
        costs.append(cost(X, y, theta))  # cost over the full data set
        i += 1

        # Pick the quantity that the chosen stopping strategy inspects.
        if stopType == STOP_ITER:
            value = i
        elif stopType == STOP_COST:
            value = costs
        elif stopType == STOP_GRAD:
            value = grad
        else:
            raise ValueError("unknown stopType: {!r}".format(stopType))
        if stopCriterion(stopType, value, thresh):
            break

    return theta, i - 1, costs, grad, time.time() - init_time


def runExpe(data, theta, batchSize, stopType, thresh, alpha):
    """Run one gradient-descent experiment, print a summary and plot the costs.

    The arguments are forwarded unchanged to ``descent``. A descriptive title
    is built from the settings, printed together with the resulting theta,
    iteration count, last cost and duration, and used as the plot title. The
    figure is created but not shown here.

    Returns:
        The fitted ``theta`` returned by ``descent``.
    """
    theta, n_iter, costs, _grad, duration = descent(
        data, theta, batchSize, stopType, thresh, alpha)

    # Label the data "Original" when more than one row has a value above 2 in
    # column 1, otherwise "Scaled".
    name = "Original" if (data[:, 1] > 2).sum() > 1 else "Scaled"
    name += " data - learning rate: {} - ".format(alpha)

    # Batch / stochastic / mini-batch, judged against the data's own row count
    # rather than a module-level global that may be out of date.
    if batchSize == data.shape[0]:
        strDescType = "Gradient"
    elif batchSize == 1:
        strDescType = "Stochastic"
    else:
        strDescType = "Mini-batch ({})".format(batchSize)

    # Describe the stopping strategy.
    name += strDescType + " descent - Stop: "
    if stopType == STOP_ITER:
        strStop = "{} iterations".format(thresh)
    elif stopType == STOP_COST:
        strStop = "costs change < {}".format(thresh)
    else:
        strStop = "gradient norm < {}".format(thresh)
    name += strStop
    print("***{}\nTheta: {} - Iter: {} - Last cost: {:03.2f} - Duration: {:03.2f}s".format(
        name, theta, n_iter, costs[-1], duration))

    # Plot cost against iteration number.
    _fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(np.arange(len(costs)), costs, 'r')
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Cost')
    ax.set_title(name.upper() + ' - Error vs. Iteration')
    return theta

# Full-batch gradient descent stopped by iteration count, with a very small
# learning rate.
# n is the number of training samples; take it from the data instead of
# hard-coding it.
n = orig_data.shape[0]
runExpe(orig_data, theta, n, STOP_ITER, thresh=5000, alpha=0.000001)
# runExpe only builds the figure; display it, as is done for the sigmoid plot
# earlier in the script.
plt.show()