吴恩达机器学习系列作业
逻辑回归
- 题目描述
在这部分的练习中,你将建立一个逻辑回归模型来预测一个学生是否能进入大学。假设你是一所大学的行政管理人员,你想根据两门考试的结果,来决定每个申请人是否被录取。你有以前申请人的历史数据,可以将其用作逻辑回归训练集。对于每一个训练样本,你有申请人两次测评的分数以及录取的结果。为了完成这个预测任务,我们准备构建一个可以基于两次测试评分来评估录取可能性的分类模型。
- 代码
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn.metrics import classification_report # 这个包是评价报告
def sigmoid(theta, x):
    """Return the logistic (S-shaped) function of the linear score.

    Args:
        theta: Parameter vector, forwarded unchanged to ``hypothesis``.
        x: Feature matrix, forwarded unchanged to ``hypothesis``.

    Returns:
        ``1 / (1 + exp(-z))`` evaluated element-wise, where
        ``z = hypothesis(theta, x)``.
    """
    z = hypothesis(theta, x)
    # exp(-z) overflows to inf for strongly negative scores. The quotient then
    # saturates to 0.0, which is the correct limit, so silence only the
    # overflow warning instead of letting it leak to (or abort) the caller.
    with np.errstate(over="ignore"):
        return 1.0 / (1 + np.exp(-z))
def hypothesis(theta, x):
    """Linear score of the model: the dot product of ``x`` with ``theta``."""
    linear_score = np.dot(x, theta)
    return linear_score
def cost_function(theta, x, y):
    """Mean cross-entropy cost of logistic regression.

    Args:
        theta: Parameter vector, shape ``(n,)`` or ``(n, 1)``.
        x: Feature matrix of shape ``(m, n)``.
        y: 0/1 labels, shape ``(m,)`` or ``(m, 1)``.

    Returns:
        Scalar ``-(1/m) * sum(y * log(h) + (1 - y) * log(1 - h))`` with
        ``h = 1 / (1 + exp(-z))`` and ``z = x @ theta``.
    """
    labels = np.ravel(y)
    z = np.dot(x, np.ravel(theta))
    # log(h) = -log(1 + exp(-z)) and log(1 - h) = -log(1 + exp(z)).
    # Going through logaddexp keeps both terms finite when the sigmoid
    # saturates; log(sigmoid(z)) would produce -inf there and the
    # product 0 * -inf would turn the whole cost into nan.
    log_h = -np.logaddexp(0, -z)
    log_one_minus_h = -np.logaddexp(0, z)
    return (-1 / len(x)) * (labels @ log_h + (1 - labels) @ log_one_minus_h)
def gradient(theta, x, y):
    """Gradient of the logistic-regression cost with respect to ``theta``.

    Evaluates ``(1/m) * (h - y)^T @ x`` where ``h = sigmoid(theta, x)`` and
    ``m = len(x)``. This is a single gradient evaluation, not a descent loop.
    """
    sample_count = len(x)
    residual = sigmoid(theta, x).T - y.T
    return ((1 / sample_count) * residual) @ x
def standardize_data(x):
    """Feature scaling: shift each column to zero mean and scale by its std.

    Args:
        x: Data whose statistics are taken along axis 0 (one column per
            feature).

    Returns:
        Tuple ``(scaled, average_value, standard_deviation)`` where the last
        two are the per-column mean and standard deviation of the input.
        Columns whose standard deviation is 0 are only centred (they come out
        as all zeros) instead of being divided by zero.
    """
    average_value = np.mean(x, axis=0)  # per-column mean
    standard_deviation = np.std(x, axis=0)  # per-column standard deviation
    # A constant column has std 0; dividing by it would yield nan/inf, so use
    # a divisor of 1 for such columns while still reporting the true std.
    safe_deviation = np.where(standard_deviation == 0, 1.0, standard_deviation)
    x = (x - average_value) / safe_deviation
    return x, average_value, standard_deviation
def predict(x, theta):
    """Label each row of ``x`` 1 if ``sigmoid(theta, x)`` is >= 0.5, else 0."""
    probabilities = sigmoid(theta, x)
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)
if __name__ == '__main__':
    # Load the training set: two exam scores and the admission result per row.
    data = pd.read_csv("ex2data1.txt", sep=',', header=None,
                       names=["first", "second", "result"])
    print(data.describe())

    # Scatter plot of admitted versus not-admitted applicants.
    positive = data[data.result.isin([1])]
    negative = data[data.result.isin([0])]
    plt.figure(1, figsize=(8, 6))
    plt.scatter(positive.iloc[:, 0:1], positive.iloc[:, 1:2],
                color='r', marker='o', label='Admitted')
    plt.scatter(negative.iloc[:, 0:1], negative.iloc[:, 1:2],
                color='g', marker='*', label='Not admitted')
    plt.xlabel("Exam 1 score")
    plt.ylabel("Exam 2 score")
    plt.legend(loc='best')

    theta = np.array([[0, 0, 0]]).T
    x = np.array(data.iloc[0:, 0:2])
    # standardize_data(x) is deliberately not applied here so that the
    # decision boundary can be drawn directly on the raw exam scores.
    o = np.ones(data.shape[0])
    xi = np.insert(x, 0, o, axis=1)  # prepend the column of ones
    yi = np.array(data.iloc[0:, 2:3])

    # fmin_tnc returns a tuple; its first element is the fitted parameters.
    final_theta = opt.fmin_tnc(func=cost_function, x0=theta, fprime=gradient,
                               args=(xi, yi))
    print(final_theta)

    # Decision boundary: theta0 + theta1 * x1 + theta2 * x2 = 0, solved for x2.
    xx = np.arange(30, 110, 10)
    yy = (-final_theta[0][0] - final_theta[0][1] * xx) / final_theta[0][2]
    plt.plot(xx, yy)
    plt.show()

    y_pred = predict(xi, final_theta[0].T)
    print(classification_report(yi, y_pred))
- 结果
正则化逻辑回归
- 题目描述
在训练的第二部分,我们将要通过加入正则项提升逻辑回归算法。简而言之,正则化是成本函数中的一项,它使算法更倾向于“更简单”的模型(在这种情况下,模型将具有更小的系数)。这有助于减少过拟合,提高模型的泛化能力。这样,我们开始吧。
设想你是工厂的生产主管,你有一些芯片在两次测试中的测试结果。对于这两次测试,你想决定是否芯片要被接受或抛弃。为了帮助你做出艰难的决定,你拥有过去芯片的测试数据集,从其中你可以构建一个逻辑回归模型。
- 代码
'''题目描述:预测来自制造工厂的微芯片是否通过质量保证(QA)'''
# 注:此文件为使用 正则化 技术得到的结果
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn.metrics import classification_report # 这个包是评价报告
def plot_data():
    """Draw a scatter plot of the microchip data, accepted versus rejected.

    Reads ``ex2data2.txt`` (no header row; columns named Test1, Test2 and
    Accepted) and finishes by calling ``plt.show()``.
    """
    chips = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])
    plt.figure(1, figsize=(8, 6))
    # One scatter series per class: (label value, colour, marker, legend text).
    series = (
        (1, 'g', 'o', 'Accepted'),
        (0, 'r', 'x', 'Rejected'),
    )
    for flag, colour, marker, legend_text in series:
        subset = chips[chips.Accepted.isin([flag])]
        plt.scatter(subset.Test1, subset.Test2, c=colour, marker=marker, label=legend_text)
    plt.legend(loc='best')
    plt.xlabel('Test 1 Score')
    plt.ylabel('Test 2 Score')
    plt.show()
def init_data(degree, path='ex2data2.txt'):
    """Load the data set and expand its two features into polynomial terms.

    Args:
        degree: Polynomial degree forwarded to ``feature_handler``.
        path: CSV file to read (no header row; the first two columns are the
            features and the third is the label). Defaults to the file name
            that was previously hard-coded.

    Returns:
        Tuple ``(xx, label)``: the expanded feature matrix returned by
        ``feature_handler`` and the labels as an ``(m, 1)`` array.
    """
    data = pd.read_csv(path, header=None, names=['Test1', 'Test2', 'Accepted'])
    x = np.array(data.iloc[0:, 0:2])
    label = np.array(data.iloc[0:, 2:3])
    # No explicit column of ones is inserted here; only the expanded features
    # from feature_handler are returned.
    xx = feature_handler(x, degree)
    return xx, label
def feature_handler(data, degree):
    """Expand two input features into all monomials up to ``degree``.

    Each output column holds ``x1**i * x2**j`` for one pair with
    ``i + j <= degree``. Pairs are ordered with ``i`` as the outer index and
    ``j`` as the inner one, so column 0 (``i = j = 0``) is all ones.

    Args:
        data: 2-D array; column 0 is ``x1`` and column 1 is ``x2``.
        degree: Highest total degree of the generated monomials.

    Returns:
        Float array with one row per input row and
        ``(degree + 1) * (degree + 2) // 2`` columns.
    """
    # Work in floating point from the start: integer input would otherwise be
    # raised to powers in integer arithmetic and silently wrap around before
    # being stored in the float result.
    data = np.asarray(data, dtype=float)
    x1 = data[:, 0:1]
    x2 = data[:, 1:2]
    exponents = [(i, j) for i in range(degree + 1) for j in range(degree + 1 - i)]
    dt = np.zeros((data.shape[0], len(exponents)))
    for k, (i, j) in enumerate(exponents):
        dt[:, k:k + 1] = np.power(x1, i) * np.power(x2, j)
    return dt
def sigmoid(theta, x):
    """Return the logistic (S-shaped) function of the linear score.

    Args:
        theta: Parameter vector, forwarded unchanged to ``hypothesis``.
        x: Feature matrix, forwarded unchanged to ``hypothesis``.

    Returns:
        ``1 / (1 + exp(-z))`` evaluated element-wise, where
        ``z = hypothesis(theta, x)``.
    """
    z = hypothesis(theta, x)
    # For strongly negative z, exp(-z) overflows to inf and the quotient
    # becomes 0.0, the correct limit. Suppress just the overflow warning so
    # it neither clutters the output nor raises under strict error settings.
    with np.errstate(over="ignore"):
        return 1.0 / (1 + np.exp(-z))
def hypothesis(theta, x):
    """Linear score of the model: the dot product of ``x`` with ``theta``."""
    linear_score = np.dot(x, theta)
    return linear_score
def cost_function(theta, x, y):
    """Mean cross-entropy cost of logistic regression (no regularisation).

    Args:
        theta: Parameter vector, shape ``(n,)`` or ``(n, 1)``.
        x: Feature matrix of shape ``(m, n)``.
        y: 0/1 labels, shape ``(m,)`` or ``(m, 1)``.

    Returns:
        Scalar ``-(1/m) * sum(y * log(h) + (1 - y) * log(1 - h))`` with
        ``h = 1 / (1 + exp(-z))`` and ``z = x @ theta``.
    """
    labels = np.ravel(y)
    z = np.dot(x, np.ravel(theta))
    # log(h) = -log(1 + exp(-z)) and log(1 - h) = -log(1 + exp(z)).
    # logaddexp evaluates these without ever forming log(0), so a saturated
    # sigmoid no longer produces -inf (and 0 * -inf = nan) in the cost.
    log_h = -np.logaddexp(0, -z)
    log_one_minus_h = -np.logaddexp(0, z)
    return (-1 / len(x)) * (labels @ log_h + (1 - labels) @ log_one_minus_h)
def regularized_cost(theta, x, y, lam=0.5):
    """Logistic cost with an L2 penalty on every parameter except the first.

    Adds ``lam / (2 * m) * sum(theta[1:] ** 2)`` to ``cost_function``, where
    ``m = len(x)``.
    """
    penalised_weights = theta[1:]  # theta[0] is left out of the penalty
    squared_norm = np.power(penalised_weights, 2).sum()
    penalty = (lam / (2 * len(x))) * squared_norm
    return cost_function(theta, x, y) + penalty
def gradient(theta, x, y):
    """Gradient of the unregularised logistic cost with respect to ``theta``.

    Evaluates ``(1/m) * (h - y)^T @ x`` where ``h = sigmoid(theta, x)`` and
    ``m = len(x)``. This is a single gradient evaluation, not a descent loop.
    """
    sample_count = len(x)
    residual = sigmoid(theta, x).T - y.T
    return ((1 / sample_count) * residual) @ x
def regularized_gradient(theta, x, y, lam=0.5):
    """Gradient of the L2-regularised logistic cost.

    Args:
        theta: Parameter vector, shape ``(n,)`` or ``(n, 1)``.
        x: Feature matrix; ``len(x)`` is used as the sample count ``m``.
        y: Labels, forwarded to ``gradient``.
        lam: Regularisation strength.

    Returns:
        ``gradient(theta, x, y)`` plus ``(lam / m) * theta`` with the first
        entry replaced by 0, so the first parameter is not penalised.
    """
    # Flatten first: for a column-vector theta, theta[1:] is 2-D and cannot be
    # concatenated with the 1-D zero placeholder below.
    theta_j1_to_n = np.ravel(theta)[1:]
    regularized_theta = (lam / len(x)) * theta_j1_to_n
    # Leading 0 keeps the first parameter out of the penalty.
    regularized_term = np.concatenate([np.array([0]), regularized_theta])
    return gradient(theta, x, y) + regularized_term
# Prediction helper used to evaluate the fitted model.
def predict(x, theta):
    """Label each row of ``x`` 1 if ``sigmoid(theta, x)`` is >= 0.5, else 0."""
    probabilities = sigmoid(theta, x)
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)
if __name__ == '__main__':
    plot_data()

    # Degree of the polynomial feature expansion.
    degree = 6
    # Number of expanded features = (degree + 1) * (degree + 2) / 2.
    feature_numbers = (degree + 1) * (degree + 2) / 2

    xi, yi = init_data(degree)
    theta = np.zeros((int(feature_numbers), 1))

    # fmin_tnc returns a tuple; only its first element (the fitted
    # parameters) is used from here on.
    final_theta = opt.fmin_tnc(func=regularized_cost, x0=theta,
                               fprime=regularized_gradient, args=(xi, yi))[0]
    print(final_theta)

    y_pred = predict(xi, final_theta)
    print(classification_report(yi, y_pred))
- 结果