1. Logistic回归分析
分析对象
假设您是一个大学部门的管理员,您想根据申请人在两次考试中的成绩来确定他们的入学机会。文件 ex2data1.txt 包含申请人在两门考试中的成绩和录取决定。您的任务是构建一个逻辑回归模型,根据这两门考试的分数估计申请人被录取的概率。
计算程序
from collections import OrderedDict
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def load_data(file_path):
    """
    Read the header-less CSV at ``file_path`` into a pandas DataFrame.

    The columns are labelled ``test1``, ``test2`` and ``scores``, in that
    order. The resulting DataFrame is returned unchanged.
    """
    column_names = ['test1', 'test2', 'scores']
    frame = pd.read_csv(file_path, names=column_names, header=None)
    return frame
def deal_data(data):
    """
    Split a DataFrame into a design matrix ``X`` and a label vector ``y``.

    Every column except the last is treated as a feature; the last column
    is the label and is reshaped to ``(n_samples, 1)``. A leading column of
    ones is prepended to the features to act as the bias term.

    Returns the tuple ``(X, y)``.
    """
    feature_values = data.iloc[:, :-1].values
    label_values = data.iloc[:, -1].values
    y = label_values.reshape(-1, 1)
    # Bias column first, then the raw features.
    bias_column = np.ones((feature_values.shape[0], 1))
    X = np.hstack((bias_column, feature_values))
    return X, y
def sigmoid(z):
    """
    Logistic (sigmoid) function, applied element-wise.

    Computes ``1 / (1 + exp(-z))`` for a scalar or NumPy array ``z``.
    """
    exp_of_negated = np.exp(-z)
    return 1 / (1 + exp_of_negated)
def gradient_descent(X_b, y, learning_rate, num_iterations):
    """
    Fit logistic-regression parameters with batch gradient descent.

    Starts from an all-zero parameter column vector (one row per column of
    ``X_b``) and performs ``num_iterations`` updates with step size
    ``learning_rate``.

    Returns ``(theta, costs)``: the final parameter vector and a list with
    the cross-entropy cost of each iteration, measured before that
    iteration's parameter update.
    """
    sample_count = X_b.shape[0]
    # Probabilities are clipped into [eps, 1 - eps] so log() never sees 0.
    eps = 1e-15
    weights = np.zeros((X_b.shape[1], 1))
    cost_history = []
    for _ in range(num_iterations):
        # Forward pass: predicted probabilities, clipped for the logs below.
        probs = np.clip(sigmoid(X_b.dot(weights)), eps, 1 - eps)
        # Mean cross-entropy over all samples.
        log_likelihood = y * np.log(probs) + (1 - y) * np.log(1 - probs)
        cost_history.append(-np.mean(log_likelihood))
        # Gradient of the cost with respect to the parameters.
        residual = probs - y
        grad = X_b.T.dot(residual) / sample_count
        # Step against the gradient.
        weights -= learning_rate * grad
    return weights, cost_history
def predict(theta, X):
    """
    Predict a binary class label for every row of ``X``.

    Parameters:
        theta: parameter vector; ``X.dot(theta)`` must be defined.
        X: design matrix, one sample per row.

    Returns:
        A list of Python ints, one per sample: 1 where the sigmoid
        probability is at least 0.5, otherwise 0.
    """
    probability = sigmoid(X.dot(theta))
    # Threshold the whole array in one vectorized step instead of looping
    # over single-element rows in Python; ravel() flattens the (n, 1)
    # column so the result is a flat list.
    return (np.ravel(probability) >= 0.5).astype(int).tolist()
def evaluate_logistic_regression(X, y, theta):
    """
    Plot the samples by class together with the linear decision boundary.

    Parameters:
        X: design matrix; column 1 is drawn on the x-axis and column 2 on
           the y-axis (column 0 is not plotted).
        y: label array indexed as ``y[i, 0]``; 1 is drawn as 'Admitted',
           0 as 'Not admitted'.
        theta: parameters indexed as ``theta[k][0]`` for k = 0, 1, 2.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    labels = y[:, 0]
    admitted = labels == 1
    rejected = labels == 0

    # One scatter call per class (instead of one per sample) gives exactly
    # one legend entry per class without any de-duplication step. A class
    # with no samples is skipped so it does not appear in the legend.
    if admitted.any():
        plt.scatter(X[admitted, 1], X[admitted, 2], marker='+', c='black', label='Admitted')
    if rejected.any():
        plt.scatter(X[rejected, 1], X[rejected, 2], marker='o', c='yellow', label='Not admitted')

    plt.xticks(np.arange(30, 110, 10))
    plt.yticks(np.arange(30, 110, 10))
    plt.xlabel('Exam 1 score')
    plt.ylabel('Exam 2 score')
    if admitted.any() or rejected.any():
        plt.legend()

    # Decision boundary: theta0 + theta1 * x1 + theta2 * x2 = 0.
    bias, w1, w2 = theta[0][0], theta[1][0], theta[2][0]
    if w2 != 0:
        xx = np.linspace(np.min(X[:, 1]), np.max(X[:, 1]), 100)
        yy = (bias + w1 * xx) / (-w2)
        plt.plot(xx, yy)
    elif w1 != 0:
        # theta2 == 0 makes the boundary vertical; solving for x2 would
        # divide by zero, so draw the line x1 = -theta0 / theta1 instead.
        plt.axvline(-bias / w1)
    plt.show()
if __name__ == '__main__':
    # Forward slashes are accepted on Windows as well as Linux/macOS; the
    # previous backslash-separated path only resolved on Windows.
    path = './data/ex2data1.txt'
    data = load_data(path)
    X, y = deal_data(data)

    num_iterations = 200000
    learning_rate = 0.001
    theta, costs = gradient_descent(X, y, learning_rate, num_iterations)

    # Report the fitted parameters and the accuracy on the training data.
    print(f"Theta: {theta}")
    predictions = predict(theta, X)
    accuracy = np.mean(np.asarray(predictions) == y.flatten()) * 100
    print(f'Accuracy: {accuracy}%')

    # Plot the cost recorded at every iteration.
    plt.plot(costs)
    plt.xlabel('Iterations')
    plt.ylabel('Cost')
    plt.title('Cost Function')
    plt.show()

    # Plot the samples and the decision boundary.
    evaluate_logistic_regression(X, y, theta)
运行结果
说明
2. Logistic回归分析-正则化
分析对象
假设您是工厂的产品经理,在两个不同的测试中有一些微芯片的测试结果。从这两项测试中,您想确定该微芯片是否应该被接受或拒绝。文件 ex2data2.txt 中有一个关于过去微芯片测试结果的数据集。您将实现正则化逻辑回归来预测来自制造厂的微芯片是否通过质量保证(QA)。
计算程序
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def load_data(file_path):
    """
    Load a CSV file that has no header row.

    The columns are named ``test1``, ``test2`` and ``scores``; the
    resulting pandas DataFrame is returned as-is.
    """
    return pd.read_csv(
        file_path,
        names=['test1', 'test2', 'scores'],
        header=None,
    )
def deal_data(degree, data):
    """
    Build a standardized polynomial design matrix and the label vector.

    Parameters:
        degree: highest total degree of the polynomial terms. With
            ``degree <= 1`` only the original feature columns are used.
        data: DataFrame whose last column is the label and whose first two
            columns are the features x1 and x2 used for the mapping.

    Returns:
        ``(X, y)``. ``X`` holds a leading column of ones followed by the
        standardized original features and, for every total degree
        ``d`` in ``2..degree``, the terms ``x1**(d - i) * x2**i`` for
        ``i = 0..d``. ``y`` is the label column shaped ``(n_samples, 1)``.
    """
    features = data.iloc[:, :-1].values.astype(float)
    y = data.iloc[:, -1].values.reshape(-1, 1)

    x1 = features[:, 0]
    x2 = features[:, 1]
    # The degree-1 terms are the original columns, so mapping starts at 2.
    # Running i up to and including d keeps the pure x2**d term, and
    # starting at 2 avoids adding a second copy of x1.
    mapped = [
        np.power(x1, total - i) * np.power(x2, i)
        for total in range(2, degree + 1)
        for i in range(total + 1)
    ]
    X = np.column_stack([features] + mapped)

    # Feature scaling: zero mean, unit standard deviation per column.
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant column has zero spread; dividing by 1 leaves it at 0
    # instead of producing NaN.
    std[std == 0] = 1.0
    X = (X - mean) / std

    # Prepend the bias column of ones after scaling so it stays at 1.
    X = np.c_[np.ones(X.shape[0]), X]
    return X, y
def sigmoid(z):
    """
    Element-wise logistic function ``1 / (1 + exp(-z))``.

    Accepts a scalar or a NumPy array.
    """
    return np.reciprocal(1.0 + np.exp(-z))
def gradient_descent(X_b, y, learning_rate, num_iterations, lamda):
    """
    Fit L2-regularized logistic regression with batch gradient descent.

    Parameters:
        X_b: design matrix, one sample per row; its first column is
            treated as the bias and is not regularized.
        y: labels as a column vector matching the rows of ``X_b``.
        learning_rate: step size applied to the full gradient.
        num_iterations: number of parameter updates to perform.
        lamda: regularization strength.

    Returns:
        ``(theta, costs)``: the final parameter column vector (starting
        from zeros) and a list with the regularized cost of each
        iteration, measured before that iteration's update.
    """
    num_samples = X_b.shape[0]
    # Probabilities are clipped into [epsilon, 1 - epsilon] so that log()
    # is never evaluated at exactly 0.
    epsilon = 1e-15
    costs = []
    theta = np.zeros((X_b.shape[1], 1))
    for _ in range(num_iterations):
        predictions = np.clip(sigmoid(np.dot(X_b, theta)), epsilon, 1 - epsilon)

        # Cross-entropy plus the L2 penalty on every parameter but the bias.
        cross_entropy = -np.mean(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))
        penalty = (lamda / (2 * num_samples)) * np.sum(np.square(theta[1:]))
        costs.append(cross_entropy + penalty)

        # Gradient of the data term.
        gradient = np.dot(X_b.T, (predictions - y)) / num_samples
        # Gradient of the penalty; the bias row is zeroed out.
        reg_gradient = (lamda / num_samples) * theta
        reg_gradient[:1] = 0.0

        # The learning rate must scale the *whole* gradient. Previously the
        # penalty term sat outside the multiplication and was applied with
        # an implicit step size of 1.
        theta -= learning_rate * (gradient + reg_gradient)
    return theta, costs
def predict(theta, X):
    """
    Predict a binary class label for every row of ``X``.

    Parameters:
        theta: parameter vector; ``X.dot(theta)`` must be defined.
        X: design matrix, one sample per row.

    Returns:
        A list of Python ints, one per sample: 1 where the sigmoid
        probability is at least 0.5, otherwise 0.
    """
    probability = sigmoid(X.dot(theta))
    # Vectorized threshold instead of a Python-level loop over
    # single-element rows; ravel() flattens the (n, 1) column so the
    # result is a flat list.
    return (np.ravel(probability) >= 0.5).astype(int).tolist()
def compute_accuracy(predictions, y):
    """
    Return the classifier accuracy as a percentage.

    ``predictions`` and ``y`` are walked in lockstep. A pair counts as
    correct only when both values equal 1 or both equal 0; the number of
    correct pairs is divided by the number of pairs and multiplied by 100.
    """
    pairs = list(zip(predictions, y))
    hits = sum(
        1
        for predicted, actual in pairs
        if (predicted == 1 and actual == 1) or (predicted == 0 and actual == 0)
    )
    return (hits / len(pairs)) * 100
if __name__ == '__main__':
    # Forward slashes are accepted on Windows as well as Linux/macOS; the
    # previous backslash-separated path only resolved on Windows.
    path = './data/ex2data2.txt'
    data = load_data(path)

    # Hyperparameters shared by every run. They are set once up front
    # rather than inside a loop body that later code silently relied on.
    lamda = 1
    num_iterations = 3000
    learning_rate = 0.1
    degrees = range(1, 7)

    plt.figure(figsize=(15, 6))

    # Left panel: cost curve for each polynomial degree. Each model is
    # trained exactly once; its training accuracy is recorded in the same
    # pass for the bar chart on the right.
    plt.subplot(1, 2, 1)
    accuracy_list = []
    for degree in degrees:
        X, y = deal_data(degree, data)
        theta, costs = gradient_descent(X, y, learning_rate, num_iterations, lamda)
        plt.plot(costs, label=f'Degree {degree}')
        predictions = predict(theta, X)
        accuracy_list.append(compute_accuracy(predictions, y))
    plt.xlabel('Iterations')
    plt.ylabel('Cost')
    plt.title('Cost Functions for Different Degrees')
    plt.legend()

    # Right panel: training accuracy for each polynomial degree.
    plt.subplot(1, 2, 2)
    plt.bar(degrees, accuracy_list, color='skyblue')
    plt.xlabel('Degree')
    plt.ylabel('Accuracy (%)')
    plt.title('Accuracy for Different Degrees')
    plt.tight_layout()
    plt.show()