Python Machine Learning 1 - Andrew Ng Course Exercises - Logistic Regression (ex2)

1. Logistic Regression

Problem Statement

Suppose you are the administrator of a university department and you want to determine each applicant's chance of admission based on their scores on two exams. The file ex2data1.txt contains applicants' scores on two exams along with the admission decisions. Your task is to build a logistic regression model that estimates an applicant's probability of admission from these two exam scores.
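For reference, the formulas below are not in the original post, but they are the standard logistic regression definitions that the following code implements: the hypothesis, the cross-entropy cost, and its gradient.

$$h_\theta(x) = \sigma(\theta^T x) = \frac{1}{1 + e^{-\theta^T x}}$$

$$J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log h_\theta(x^{(i)}) + \left(1 - y^{(i)}\right) \log\left(1 - h_\theta(x^{(i)})\right) \right]$$

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}$$

Gradient descent repeatedly updates $\theta_j := \theta_j - \alpha \, \partial J(\theta) / \partial \theta_j$ with learning rate $\alpha$.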

Code

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def load_data(file_path):
    """
    Load the data
    """
    data = pd.read_csv(file_path, header=None, names=['exam1', 'exam2', 'admitted'])
    return data

def deal_data(data):
    """
    Extract features X and labels y
    """
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values.reshape(-1, 1)

    # Add a column of ones (the bias/intercept term) to X
    X = np.c_[np.ones(X.shape[0]), X]
    return X, y

def sigmoid(z):
    """
    Sigmoid activation function
    """
    return 1 / (1 + np.exp(-z))

def gradient_descent(X_b, y, learning_rate, num_iterations):
    """
    Train logistic regression with batch gradient descent
    """
    # Initialize the parameters
    theta = np.zeros((X_b.shape[1], 1))

    # Record the cost of every iteration
    costs = []

    for _ in range(num_iterations):
        # Compute predictions, clipping them to avoid log(0)
        predictions = sigmoid(np.dot(X_b, theta))
        epsilon = 1e-15
        predictions = np.clip(predictions, epsilon, 1 - epsilon)

        # Compute the cross-entropy cost
        cost = -np.mean(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))
        costs.append(cost)

        # Compute the gradient
        gradient = np.dot(X_b.T, (predictions - y)) / X_b.shape[0]

        # Update the parameters
        theta -= learning_rate * gradient

    return theta, costs

def predict(theta, X):
    """
    Predict class labels (1 if the probability is at least 0.5)
    """
    probability = sigmoid(X.dot(theta))
    return [1 if x >= 0.5 else 0 for x in probability]

def evaluate_logistic_regression(X, y, theta):
    """
    Plot the data and the learned decision boundary
    """
    pos = y[:, 0] == 1
    neg = y[:, 0] == 0
    plt.scatter(X[pos, 1], X[pos, 2], marker='+', c='black', label='Admitted')
    plt.scatter(X[neg, 1], X[neg, 2], marker='o', c='yellow', label='Not admitted')
    plt.xticks(np.arange(30, 110, 10))
    plt.yticks(np.arange(30, 110, 10))
    plt.xlabel('Exam 1 score')
    plt.ylabel('Exam 2 score')
    plt.legend()
    # Decision boundary: theta0 + theta1*x1 + theta2*x2 = 0
    minX = np.min(X[:, 1])
    maxX = np.max(X[:, 1])
    xx = np.linspace(minX, maxX, 100)
    yy = -(theta[0, 0] + theta[1, 0] * xx) / theta[2, 0]
    plt.plot(xx, yy)
    plt.show()

if __name__ == '__main__':
    path = './data/ex2data1.txt'
    data = load_data(path)
    X, y = deal_data(data)

    num_iterations = 200000
    learning_rate = 0.001
    theta, costs = gradient_descent(X, y, learning_rate, num_iterations)

    # Print the results
    print(f"Theta: {theta}")
    predictions = predict(theta, X)
    accuracy = np.mean(np.array(predictions) == y.flatten()) * 100
    print(f'Accuracy: {accuracy}%')

    # Plot the cost curve
    plt.plot(costs)
    plt.xlabel('Iterations')
    plt.ylabel('Cost')
    plt.title('Cost Function')
    plt.show()

    evaluate_logistic_regression(X, y, theta)
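
As a quick sanity check (an addition, not part of the original assignment), the hand-rolled model can be compared against scikit-learn, assuming it is installed; a very large C effectively disables sklearn's built-in L2 regularization, making the result comparable to the unregularized gradient-descent model above:

from sklearn.linear_model import LogisticRegression

# A minimal sketch, assuming scikit-learn is available; sklearn fits its
# own intercept, so the bias column X[:, 0] is dropped here.
clf = LogisticRegression(C=1e6, max_iter=10000)
clf.fit(X[:, 1:], y.ravel())
print(f"sklearn accuracy: {clf.score(X[:, 1:], y.ravel()) * 100:.1f}%")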

Results

Notes

2. Regularized Logistic Regression

Problem Statement

Suppose you are the product manager of a factory and you have test results for some microchips from two different tests. From these two tests, you want to determine whether each microchip should be accepted or rejected. The file ex2data2.txt contains a dataset of test results on past microchips. You will implement regularized logistic regression to predict whether microchips from a fabrication plant pass quality assurance (QA).
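
For reference (again an addition, stating the standard definitions the code below implements), regularized logistic regression adds an L2 penalty on all parameters except the bias $\theta_0$:

$$J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log h_\theta(x^{(i)}) + \left(1 - y^{(i)}\right) \log\left(1 - h_\theta(x^{(i)})\right) \right] + \frac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2$$

so the gradient gains an extra term $\frac{\lambda}{m} \theta_j$ for $j \geq 1$, while $\theta_0$ is updated exactly as before.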

Code

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def load_data(file_path):
    """
    Load the data
    """
    data = pd.read_csv(file_path, header=None, names=['test1', 'test2', 'accepted'])
    return data

def deal_data(degree, data):
    """
    Extract features X and labels y from the input data, then apply
    polynomial feature mapping and normalization
    """
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values.reshape(-1, 1)

    # Map the two raw features to all polynomial terms x1^(j-i) * x2^i up to
    # the given degree (the degree-1 terms x1 and x2 are already in X)
    for j in range(2, degree + 1):
        for i in range(0, j + 1):
            # Create a new polynomial feature and append it to X
            new_feature = np.power(X[:, 0], (j - i)) * np.power(X[:, 1], i)
            X = np.c_[X, new_feature]

    # Feature scaling - standardization
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

    # Add a column of ones (the bias term) to X
    X = np.c_[np.ones(X.shape[0]), X]
    return X, y

def sigmoid(z):
    """
    Sigmoid activation function
    """
    return 1 / (1 + np.exp(-z))

def gradient_descent(X_b, y, learning_rate, num_iterations, lamda):
    """
    Train regularized logistic regression with gradient descent
    """
    num_samples, num_features = X_b.shape
    # Record the cost of every iteration
    costs = []
    theta = np.zeros((num_features, 1))
    for _ in range(num_iterations):
        # Compute predictions, clipping them to avoid log(0)
        predictions = sigmoid(np.dot(X_b, theta))
        epsilon = 1e-15
        predictions = np.clip(predictions, epsilon, 1 - epsilon)

        # Cross-entropy cost plus the L2 regularization term
        # (the bias theta[0] is not regularized)
        cost = -np.mean(y * np.log(predictions) + (1 - y) * np.log(1 - predictions)) + \
               (lamda / (2 * num_samples)) * np.sum(np.square(theta[1:]))
        costs.append(cost)

        # Gradient of the cost, including the regularization term
        gradient = np.dot(X_b.T, (predictions - y)) / num_samples
        gradient[1:] += (lamda / num_samples) * theta[1:]

        # Update the parameters
        theta -= learning_rate * gradient

    return theta, costs

# Predict class labels (1 if the probability is at least 0.5)
def predict(theta, X):
    probability = sigmoid(X.dot(theta))
    return [1 if x >= 0.5 else 0 for x in probability]

# Compute the classifier's accuracy (%)
def compute_accuracy(predictions, y):
    correct = [1 if a == b else 0 for (a, b) in zip(predictions, y.flatten())]
    accuracy = (sum(correct) / len(correct)) * 100
    return accuracy


if __name__ == '__main__':
    path = './data/ex2data2.txt'
    data = load_data(path)

    lamda = 1
    num_iterations = 3000
    learning_rate = 0.1

    # Plot the cost curve for each polynomial degree
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    for degree in range(1, 7):
        X, y = deal_data(degree, data)
        theta, costs = gradient_descent(X, y, learning_rate, num_iterations, lamda)

        # Plot the cost curve
        plt.plot(costs, label=f'Degree {degree}')

    plt.xlabel('Iterations')
    plt.ylabel('Cost')
    plt.title('Cost Functions for Different Degrees')
    plt.legend()

    # Plot a bar chart of training accuracy for each degree
    plt.subplot(1, 2, 2)
    accuracy_list = []
    thetas = []
    for degree in range(1, 7):
        X, y = deal_data(degree, data)
        theta, _ = gradient_descent(X, y, learning_rate, num_iterations, lamda)
        thetas.append(theta)
        predictions = predict(theta, X)
        accuracy = compute_accuracy(predictions, y)
        accuracy_list.append(accuracy)

    plt.bar(range(1, 7), accuracy_list, color='skyblue')
    plt.xlabel('Degree')
    plt.ylabel('Accuracy (%)')
    plt.title('Accuracy for Different Degrees')

    plt.tight_layout()
    plt.show()
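
The original post does not draw the decision boundary for this dataset, but it can be visualized by evaluating theta^T phi(x) on a grid over the raw feature space and tracing its zero contour. Below is a minimal sketch of this idea; map_features is a hypothetical helper that mirrors the polynomial mapping in deal_data, and the same normalization statistics are reused so the grid and the training data live in the same feature space:

def map_features(x1, x2, degree):
    # Hypothetical helper: same polynomial terms as the loop in deal_data
    feats = [x1, x2]
    for j in range(2, degree + 1):
        for i in range(0, j + 1):
            feats.append(np.power(x1, j - i) * np.power(x2, i))
    return np.column_stack(feats)

def plot_decision_boundary(theta, data, degree):
    raw = data.iloc[:, :-1].values
    labels = data.iloc[:, -1].values

    # Recompute the training-set normalization statistics
    mapped = map_features(raw[:, 0], raw[:, 1], degree)
    mu, sigma = np.mean(mapped, axis=0), np.std(mapped, axis=0)

    # Evaluate theta^T phi(x) on a grid over the raw feature space
    u = np.linspace(raw[:, 0].min(), raw[:, 0].max(), 200)
    v = np.linspace(raw[:, 1].min(), raw[:, 1].max(), 200)
    U, V = np.meshgrid(u, v)
    grid = (map_features(U.ravel(), V.ravel(), degree) - mu) / sigma
    grid = np.c_[np.ones(grid.shape[0]), grid]
    Z = grid.dot(theta).reshape(U.shape)

    # Scatter the data and draw the zero-level contour (the boundary)
    plt.scatter(raw[labels == 1, 0], raw[labels == 1, 1], marker='+', c='black', label='Accepted')
    plt.scatter(raw[labels == 0, 0], raw[labels == 0, 1], marker='o', c='yellow', label='Rejected')
    plt.contour(U, V, Z, levels=[0], colors='blue')
    plt.xlabel('Microchip Test 1')
    plt.ylabel('Microchip Test 2')
    plt.legend()
    plt.show()

For example, plot_decision_boundary(thetas[5], data, 6) would show the degree-6 boundary learned in the run above.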



Results

Notes
