回顾1
在分类问题中,你要预测的变量 𝑦 是离散的值,我们将学习一种叫做逻辑回归 (Logistic Regression) 的算法,这是目前最流行使用最广泛的一种学习算法。
与之前的线性回归不同,这里的 y 是离散值 0 或 1;模型输出 h(x) 表示 y=1 的概率。
边界曲线:
作业
题目概述:在训练的初始阶段,我们将要构建一个逻辑回归模型来预测,某个学生是否被大学录取。 设想你是大学相关部门的管理者,想通过申请学生两次测试的评分,来决定他们是否被录取。 现在你拥有之前申请学生的可以用于训练逻辑回归的训练样本集。对于每一个训练样本,你有他们两次测试的评分,以及最终是否被录取的结果。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
# Load the part-1 training set: two exam scores per applicant plus the
# admission outcome (column 'Admitted', 0 or 1).
path = 'ex2data1.txt'
data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])

# Split the samples by label so each class gets its own marker/colour.
positive = data[data['Admitted'] == 1]
negative = data[data['Admitted'] == 0]

# One 12x8-inch figure; `fig` is the window, `ax` the axes drawn on it.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
# Hypothesis and cost for (unregularized) logistic regression.
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)); maps any real z into (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))

def cost(theta, X, y):
    """Unregularized logistic-regression cost.

    J(theta) = -(1/m) * sum( y*log(h) + (1-y)*log(1-h) ),  h = sigmoid(X @ theta)

    Accepts theta as a 1-D vector (or 1xN matrix/array), X as an (m, n)
    design matrix and y as an (m,) or (m, 1) label vector; returns a scalar.
    Uses plain ndarrays instead of the deprecated np.matrix of the original.
    """
    theta = np.asarray(theta).ravel()
    X = np.asarray(X)
    y = np.asarray(y).ravel()  # flatten (m, 1) labels to avoid broadcasting to (m, m)
    h = sigmoid(X @ theta)
    return -np.mean(y * np.log(h) + (1.0 - y) * np.log(1.0 - h))
# Prepend a bias column of ones so theta_0 acts as the intercept term.
data.insert(0, 'Ones', 1)

# Split into feature matrix X (all but the last column) and label vector y
# (the last column), then convert both to plain NumPy arrays.
cols = data.shape[1]
X = np.array(data.iloc[:, :cols - 1].values)
y = np.array(data.iloc[:, cols - 1:].values)
theta = np.zeros(3)
# Gradient of the cost (a single evaluation; the optimizer below iterates).
def gradient(theta, X, y):
    """Batch gradient of `cost`: (1/m) * X^T (sigmoid(X theta) - y).

    Same input conventions as `cost`; returns a 1-D array with one entry
    per parameter. Replaces the original per-parameter Python loop (and
    deprecated np.matrix) with a single vectorized matrix product.
    """
    theta = np.asarray(theta).ravel()
    X = np.asarray(X)
    y = np.asarray(y).ravel()
    error = sigmoid(X @ theta) - y
    return X.T @ error / len(X)
#print(gradient(theta, X, y))
# In the linear-regression exercise we hand-rolled gradient descent
# (cost function + gradient + the descent loop). Here we instead hand the
# cost and its gradient to SciPy's truncated-Newton optimizer, which
# chooses step sizes and iteration counts itself and returns the optimum.
#   func:   objective to minimize
#   x0:     initial parameter vector
#   fprime: gradient function of `func`
#   args:   extra positional arguments forwarded to func and fprime
# (The original comment block was split across lines without '#', which
# was a syntax error; fixed here.)
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
#print(result)
#print(cost(result[0], X, y))
# Decision boundary: theta0 + theta1*x1 + theta2*x2 = 0, i.e.
# x2 = -(theta0 + theta1*x1) / theta2, drawn over the score range.
theta_opt = result[0]
plotting_x1 = np.linspace(30, 100, 100)
plotting_h1 = -(theta_opt[0] + theta_opt[1] * plotting_x1) / theta_opt[2]

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(plotting_x1, plotting_h1, 'y', label='Prediction')
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
正则化
在训练的第二部分,我们将实现加入正则项的逻辑回归算法,以改善模型的泛化能力。 设想你是工厂的生产主管,你有一些芯片在两次测试中的测试结果,测试结果决定是否芯片要被接受或抛弃。你有一些历史数据,帮助你构建一个逻辑回归模型。
回顾2
作业
边界线实现方法:由于边界是高次函数,取一些散点(x1,x2,各1000个)代入公式,判断其绝对值是否约等于 0(小于某个阈值),满足条件的点即近似落在边界上。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
# Part-2 copies of the hypothesis, cost and gradient (same contracts as
# part 1), rewritten with plain ndarrays instead of deprecated np.matrix.
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)); maps any real z into (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))

def cost(theta, X, y):
    """Unregularized logistic cost J(theta) = -(1/m) sum(y log h + (1-y) log(1-h)).

    theta: 1-D vector (or 1xN matrix/array); X: (m, n); y: (m,) or (m, 1).
    Returns a scalar.
    """
    theta = np.asarray(theta).ravel()
    X = np.asarray(X)
    y = np.asarray(y).ravel()  # flatten so the elementwise products stay (m,)
    h = sigmoid(X @ theta)
    return -np.mean(y * np.log(h) + (1.0 - y) * np.log(1.0 - h))

def gradient(theta, X, y):
    """Batch gradient of `cost`: (1/m) * X^T (h - y), as a 1-D array.

    Vectorized replacement for the original per-parameter loop.
    """
    theta = np.asarray(theta).ravel()
    X = np.asarray(X)
    y = np.asarray(y).ravel()
    error = sigmoid(X @ theta) - y
    return X.T @ error / len(X)
# Part-2 data: two chip test scores plus the accept/reject label.
path = 'ex2data2.txt'
data_init = pd.read_csv(path, header=None, names=['Test 1', 'Test 2', 'Accepted'])

# Scatter the two classes with distinct markers.
positive2 = data_init[data_init['Accepted'] == 1]
negative2 = data_init[data_init['Accepted'] == 0]
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.legend()
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
# Feature mapping: expand (x1, x2) into all monomials x1^(i-p) * x2^p with
# total degree i <= 6, giving 28 polynomial features including the constant
# term f00 = 1 (which doubles as the bias column).
power = 6
data2 = data_init  # NOTE: alias, not a copy — the drops below also mutate data_init
x1 = data2['Test 1']
x2 = data2['Test 2']
for i in range(power + 1):
    for p in range(i + 1):
        data2[f'f{i - p}{p}'] = np.power(x1, i - p) * np.power(x2, p)

# The raw scores are now redundant; remove them in place so data2 keeps
# only the label plus the 28 mapped features.
data2.drop(columns=['Test 1', 'Test 2'], inplace=True)
# Regularized cost function.
def costReg(theta, X, y, learningRate):
    """Regularized cost: cost(theta, X, y) + (lambda / 2m) * sum_{j>=1} theta_j^2.

    `learningRate` is the regularization strength lambda (name kept from
    the original course code). The bias theta_0 is not penalized.
    """
    theta = np.matrix(theta)
    m = len(X)
    # Slice from column 1 onward so the intercept stays unregularized.
    reg = (learningRate / (2 * m)) * np.sum(np.power(theta[:, 1:], 2))
    return cost(theta, X, y) + reg
# Regularized gradient function.
def gradientReg(theta, X, y, learningRate):
    """Gradient of the regularized cost.

    grad = (1/m) * X^T (sigmoid(X theta) - y), with (lambda/m) * theta_j
    added for every j >= 1 (the intercept grad[0] is left unregularized).
    Returns a 1-D array. Vectorized replacement for the original
    per-parameter loop with its i == 0 special case.
    """
    theta = np.asarray(theta).ravel()
    X = np.asarray(X)
    y = np.asarray(y).ravel()
    m = len(X)
    error = sigmoid(X @ theta) - y
    grad = X.T @ error / m
    grad[1:] += (learningRate / m) * theta[1:]
    return grad
# Initialisation: column 0 of data2 is the label, the remaining 27 columns
# are the mapped polynomial features (f00 supplies the bias).
cols = data2.shape[1]
X2 = np.array(data2.iloc[:, 1:cols].values)
y2 = np.array(data2.iloc[:, 0:1].values)
theta2 = np.zeros(cols - 1)

# Regularization strength lambda = 1.
learningRate = 1

# Solve with SciPy's truncated-Newton optimizer, as in part 1.
result2 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate))
# Prediction and training-set accuracy.
def predict(theta, X):
    """Return a list of 0/1 predictions: 1 where sigmoid(X @ theta.T) >= 0.5."""
    probability = sigmoid(X * theta.T)
    return [1 if x >= 0.5 else 0 for x in probability]

theta_min = np.matrix(result2[0])
predictions = predict(theta_min, X2)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y2)]
# BUG FIX: the original computed `sum(...) % len(correct)` — a modulo, not
# an accuracy. Report the fraction of correct predictions as a percentage,
# matching the 'accuracy = {0}%' format string below.
accuracy = sum(correct) / len(correct) * 100
#print ('accuracy = {0}%'.format(accuracy))
# Decision boundary for the degree-6 polynomial hypothesis.
degree = 6
def hfunc2(theta, x1, x2):
    """Evaluate theta^T * mapFeature(x1, x2) for the degree-6 expansion.

    `theta` is the fmin_tnc result tuple: theta[0] holds the 28 fitted
    parameters, ordered constant first, then degree 1..6 monomials.
    """
    total = theta[0][0]
    idx = 1  # parameter index; advances in the same monomial order as the mapping
    for i in range(1, degree + 1):
        for j in range(i + 1):
            total += np.power(x1, i - j) * np.power(x2, j) * theta[0][idx]
            idx += 1
    return total

def find_decision_boundary(theta):
    """Return (x1, x2) coordinates of grid points where h(x) is ~0.

    Samples a 1000x1000 grid over [-1, 1.5]^2 and keeps the points whose
    hypothesis value has absolute value below 2e-3 — those approximate the
    curve h(x) = 0.
    """
    t1 = np.linspace(-1, 1.5, 1000)
    t2 = np.linspace(-1, 1.5, 1000)
    # Grid in the same x-major order as the original nested comprehension.
    g1, g2 = np.meshgrid(t1, t2, indexing='ij')
    h_val = pd.DataFrame({'x1': g1.ravel(), 'x2': g2.ravel()})
    h_val['hval'] = hfunc2(theta, h_val['x1'], h_val['x2'])
    decision = h_val[np.abs(h_val['hval']) < 2 * 10**-3]
    return decision.x1, decision.x2
# Final plot: scatter of both classes plus the non-linear decision
# boundary drawn as small yellow dots.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
x, y = find_decision_boundary(result2)
plt.scatter(x, y, c='y', s=10, label='Prediction')
ax.legend()
# plt.show() returns None, so the original print(plt.show()) also echoed
# "None" to stdout; just display the figure.
plt.show()