- 逻辑回归
给的数据集有两个特征变量。
设想你是大学相关部门的管理者,想通过申请学生两次测试的评分,来决定他们是否被录取。现在你拥有以往申请学生的数据,可以用作训练逻辑回归的训练样本集。对于每一个训练样本,你有他们两次测试的评分和最后是否被录取的结果。由此建立逻辑回归分类器。
注: https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes/tree/master/code
github上并没有使用梯度下降来优化参数,使用的应该是其他优化算法(SciPy's truncated newton(TNC))。
相比于其他优化算法,用梯度下降法优化参数并不能很好地降低代价函数。可能是因为对于这个数据集来说,需要迭代相当多的次数。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
# Load the training set; the path is relative to the working directory.
path = 'ex2data1.txt'
data = pd.read_csv(path, header=None, names=['First', 'Second', 'Admission'])

# --- Inspect the data (optional scatter plot, disabled) ---
#positive = data[data['Admission'].isin([1])]
#negative = data[data['Admission'].isin([0])]
#fig, ax = plt.subplots(figsize=(12, 8))
#ax.scatter(positive['First'], positive['Second'], s=50, c='b', marker='o', label='Admission')
#ax.scatter(negative['First'], negative['Second'], s=50, c='r', marker='x', label='Not Admission')
#ax.legend()
#ax.set_xlabel('First Score')
#ax.set_ylabel('Second Score')
#plt.show()

# --- Prepare the data for the training code below ---
# Optional feature scaling (disabled):
#data['First'] = (data['First'] - data['First'].mean())/data['First'].std()
#data['Second'] = (data['Second'] - data['Second'].mean())/data['Second'].std()

# Prepend a constant column of ones so theta[0] acts as the intercept term.
data.insert(0, 'Ones', 1)

# Number of columns in `data` (features plus the label).
cols = data.shape[1]

# Every column except the last is a feature; the last column is the target.
# Both are converted to np.matrix so that `*` means matrix multiplication.
X = np.matrix(data.iloc[:, :cols - 1].values)
y = np.matrix(data.iloc[:, cols - 1:].values)

# Initial parameters: a 1 x n row of integer zeros, one entry per column of X.
theta = np.matrix(np.zeros(X.shape[1], dtype=int))
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of x.

    The computation goes through ``np.exp``, so x may be a scalar or any
    NumPy array/matrix; the result has the shape NumPy gives ``np.exp(-x)``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def computeCost(X, y, theta):
    """Return the mean logistic-regression (cross-entropy) cost.

    The expressions below use ``*`` as a matrix product, so X, y and theta
    are expected to be ``np.matrix`` objects, as built by the script above:
    X with one row per sample, y a column of labels, theta a row vector.
    """
    predicted = sigmoid(X * theta.T)
    # Each product collapses to a 1x1 matrix: the sum over all samples.
    positive_term = y.T * np.log(predicted)
    negative_term = (1 - y).T * np.log(1 - predicted)
    total = np.sum(positive_term + negative_term)
    return -(total / X.shape[0])
# Hyper-parameters passed to gradienDescent below.
alpha = 1e-3     # step size of each parameter update
iters = 150_000  # number of gradient-descent iterations
def gradienDescent(X, y, theta, alpha, iters):
    """Run batch gradient descent for logistic regression.

    X, y and theta are used with ``*`` as a matrix product, so they are
    expected to be ``np.matrix`` objects (theta as a row vector).

    Returns a tuple ``(theta, cost)``: the parameters after ``iters``
    updates and an array holding the cost recorded after every update.
    """
    step = alpha / len(X)  # learning rate divided by the number of samples
    history = np.zeros(iters)
    for epoch in range(iters):
        residual = sigmoid(X * theta.T) - y
        # X.T * residual is a column; transpose it to match theta's row layout.
        theta = theta - step * (X.T * residual).T
        history[epoch] = computeCost(X, y, theta)
    return theta, history
# Fit the parameters with batch gradient descent; `cost` holds the cost
# recorded after every iteration.
theta, cost = gradienDescent(X, y, theta, alpha, iters)
#print(cost[-1])

# --- Inspect how the cost evolves over the iterations (optional, disabled) ---
#fig, ax = plt.subplots(figsize=(12, 8))
#ax.plot(np.arange(iters), cost, 'r')
#ax.set_xlabel('Iterations')
#ax.set_ylabel('Cost')
#ax.set_title('Error vs. Training Epoch')
#plt.show()

# --- Measure the classifier's accuracy on the training set ---
probability = sigmoid(X * theta.T)
# Flatten to plain 1-D arrays so each element is a scalar rather than a
# 1x1 matrix row.
predictions = [1 if p >= 0.5 else 0 for p in np.asarray(probability).ravel()]
correct = [1 if a == b else 0 for a, b in zip(predictions, np.asarray(y).ravel())]
# Accuracy is the share of correct predictions expressed as a percentage.
# The previous `sum % len` (modulo) only looked right when there were
# exactly 100 samples, and it reported 0 when every sample was correct.
accuracy = 100.0 * sum(correct) / len(correct)
print('accuracy = {0}%'.format(accuracy))
- 逻辑回归的正则化
import numpy as np
import pandas as pd
import scipy.optimize as opt
import matplotlib.pyplot as plt
# Load the second data set: two test scores and an accept/reject label.
path = 'ex2data2.txt'
data2 = pd.read_csv(path, header=None, names=['Test1', 'Test2', 'Accepted'])

# Polynomial feature mapping: build product terms x1^(i-j) * x2^j of the two
# raw scores for 1 <= i < degree and 0 <= j < i.
degree = 5
x1 = data2['Test1']
x2 = data2['Test2']

# Constant intercept column, placed right after the label column.
data2.insert(3, 'Ones', 1)

# NOTE(review): range(i) stops at j = i - 1, so the pure x2^i terms are never
# generated. Confirm this is intended before changing it: the parameter
# vector further down is sized for the resulting 11 columns.
for i in range(1, degree):
    for j in range(i):
        column = 'F' + str(i) + str(j)
        data2[column] = np.power(x1, i - j) * np.power(x2, j)

# The raw scores are now represented by the mapped features, so drop them.
data2.drop(['Test1', 'Test2'], axis=1, inplace=True)
def sigmoid(z):
    """Return the logistic function of z, i.e. 1 / (1 + exp(-z)).

    Uses ``np.exp``, so z may be a scalar or a NumPy array/matrix.
    """
    decay = np.exp(-z)
    return 1 / (1 + decay)
def cost(theta, X, y, learningRate):
    """Return the L2-regularised logistic-regression cost.

    Parameters
    ----------
    theta : array-like
        Parameter vector; converted to a 1 x n ``np.matrix``.
    X : array-like
        Design matrix, one row per sample; converted to ``np.matrix``.
    y : array-like
        Labels; converted to ``np.matrix``.
        (assumes a column with one row per sample - TODO confirm with callers)
    learningRate : number
        Despite its name, this is the weight of the regularisation penalty.

    The penalty covers every parameter except theta[0], the intercept.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    m = len(X)
    h = sigmoid(X * theta.T)

    # Element-wise cross-entropy contributions of each sample.
    positive_part = np.multiply(-y, np.log(h))
    negative_part = np.multiply((1 - y), np.log(1 - h))
    data_loss = np.sum(positive_part - negative_part) / m

    # Penalise all weights except the first column (the intercept).
    penalised = theta[:, 1:]
    reg = (learningRate / (2 * m)) * np.sum(np.power(penalised, 2))
    return data_loss + reg
def gradientReg(theta, X, y, learningRate):
    """Return the gradient of the L2-regularised logistic-regression cost.

    Parameters
    ----------
    theta : array-like
        Parameter vector; converted to a 1 x n ``np.matrix``.
    X : array-like
        Design matrix, one row per sample; converted to ``np.matrix``.
    y : array-like
        Labels; converted to ``np.matrix``.
        (assumes a column with one row per sample - TODO confirm with callers)
    learningRate : number
        Despite its name, this is the weight of the regularisation penalty.

    Returns
    -------
    numpy.ndarray
        1-D float array with one partial derivative per parameter.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    m = X.shape[0]
    error = sigmoid(X * theta.T) - y

    # X.T * error sums error * feature over all samples for every parameter
    # at once. Flattening to a plain 1-D array avoids writing 1x1 matrices
    # into scalar slots, which newer NumPy releases deprecate.
    grad = np.asarray(X.T * error, dtype=float).ravel() / m

    # Add the penalty term to every parameter except index 0 (the intercept).
    grad[1:] += (learningRate / m) * np.asarray(theta, dtype=float).ravel()[1:]
    return grad
# Split features from labels. After the feature-mapping step above, the
# 'Accepted' label sits in column 0 and every other column is a feature.
cols = data2.shape[1]
X2 = data2.iloc[:, 1:cols]
y2 = data2.iloc[:, 0:1]

# Convert to NumPy arrays and initialise the parameter vector with one zero
# per feature column (instead of a hard-coded length).
X2 = np.array(X2.values)
y2 = np.array(y2.values)
theta2 = np.zeros(X2.shape[1])

# Passed to cost/gradientReg, where it is the regularisation weight.
learningRate = 1

# Minimise the regularised cost with SciPy's truncated Newton (TNC) solver;
# the first element of the returned tuple is the solution vector.
result2 = opt.fmin_tnc(func=cost, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate))
theta = np.matrix(result2[0])

# --- Measure the classifier's accuracy on the training set ---
probability = sigmoid(np.matrix(X2) * theta.T)
# Flatten so each element is a scalar rather than a 1x1 matrix row.
predictions = [1 if p >= 0.5 else 0 for p in np.asarray(probability).ravel()]
correct = [1 if a == b else 0 for a, b in zip(predictions, np.asarray(y2).ravel())]
# Accuracy is the share of correct predictions expressed as a percentage.
# The previous `sum % len` (modulo) printed the raw count of correct
# predictions whenever the sample count was not exactly 100.
accuracy = 100.0 * sum(correct) / len(correct)
print('accuracy = {0}%'.format(accuracy))