逻辑回归 总结
概述
逻辑回归 (Logistic Regression) 本质上是在线性回归的输出上再套一层 Sigmoid 函数. 虽然逻辑回归被称为回归, 但实际上是一个分类模型, 常用于二分类问题. 借助多项式等非线性特征, 逻辑回归的决策边界也可以是非线性的.
Sigmoid 函数
Sigmoid 函数 $g(z) = \frac{1}{1 + e^{-z}}$ 将任意实数输入映射到 (0, 1) 区间. 我们在线性回归中可以得到一个预测值, 再将该值代入 Sigmoid 函数. 这样我们就完成了由值到概率的转换, 即分类任务.
预测函数: $h_\theta(x) = \frac{1}{1 + e^{-\theta^T x}}$
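分类任务: 记 $h_\theta(x)$ 为预测函数, 则 $P(y = 1 \mid x; \theta) = h_\theta(x)$, $P(y = 0 \mid x; \theta) = 1 - h_\theta(x)$; 合并为 $P(y \mid x; \theta) = h_\theta(x)^{y} \, (1 - h_\theta(x))^{1 - y}$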
逻辑回归实现
import numpy as np
import numpy.random
import pandas as pd
import time
from matplotlib import pyplot as plt
# Load the exam-score dataset. header=None tells pandas the file has no
# header row, so the column names are supplied explicitly.
data = pd.read_csv(
    "LogiReg_data.txt",
    names=["Exam 1", "Exam 2", "Admitted"],
    header=None,
)
print(data.head())
# Sigmoid function
def sigmoid(z):
    """Apply the logistic function ``1 / (1 + exp(-z))`` element-wise.

    The value is computed as ``exp(-logaddexp(0, -z))``, which is
    mathematically identical but never evaluates ``exp`` on a large
    positive number, so very negative inputs do not trigger overflow
    warnings.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        The logistic function of ``z``, with the same shape as ``z``.
    """
    z = np.asarray(z)
    # logaddexp(0, -z) == log(1 + exp(-z)), evaluated without overflow.
    return np.exp(-np.logaddexp(0.0, -z))
# Plot the sigmoid curve over the integers -10..9.
nums = np.arange(-10, 10)
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(nums, sigmoid(nums), "r")
# plt.show()
def model(x, theta):
    """Return the sigmoid of the linear score ``x . theta.T``.

    Args:
        x: Feature matrix, one sample per row.
        theta: Parameter array; its transpose is multiplied with ``x``.

    Returns:
        ``sigmoid(np.dot(x, theta.T))``.
    """
    linear_score = np.dot(x, theta.T)
    return sigmoid(linear_score)
# Prepend a constant column of ones so the first parameter acts as the intercept.
data.insert(0, "Ones", 1)
# Work on the underlying ndarray from here on.
orig_data = data.values
print(orig_data)
cols = orig_data.shape[1]  # 4 columns: Ones, Exam 1, Exam 2, Admitted
# Every column except the last is a feature; the last column is the label.
x = orig_data[:, 0:cols - 1]
y = orig_data[:, -1:]
# One parameter per feature column (derived instead of hard-coding 3).
theta = np.zeros([1, cols - 1])
print(x[:5], y[:5])
# Loss function
def cost(x, y, theta):
    """Return the mean negative log-likelihood of ``theta`` on ``(x, y)``.

    The prediction is computed once and clipped away from exactly 0 and 1
    so that a saturated sigmoid yields a large finite loss instead of
    ``inf``/``nan`` from ``log(0)``.

    Args:
        x: Feature matrix, one sample per row.
        y: Labels (0 or 1), broadcastable against ``model(x, theta)``.
        theta: Parameter array passed through to ``model``.

    Returns:
        The summed per-sample loss divided by ``len(x)``.
    """
    eps = np.finfo(float).eps
    prob = np.clip(model(x, theta), eps, 1 - eps)
    left = np.multiply(-y, np.log(prob))
    right = np.multiply(1 - y, np.log(1 - prob))
    return np.sum(left - right) / len(x)
# Loss of the initial parameters, before any training. With theta all zeros
# every prediction is sigmoid(0) = 0.5, so this should be about ln(2) ≈ 0.693.
cost_value = cost(x, y, theta)
print("平均损失: ", cost_value)
# Compute the gradient
def gradient(x, y, theta):
    """Return the gradient of the mean loss with respect to ``theta``.

    Component ``j`` is ``sum(error * x[:, j]) / len(x)`` where
    ``error = model(x, theta) - y``. All components are computed with a
    single dot product instead of a Python loop over columns.

    Args:
        x: Feature matrix, one sample per row.
        y: Labels with the same shape as ``model(x, theta)``.
        theta: Parameter array; the result has the same shape.

    Returns:
        ``numpy.ndarray`` of shape ``theta.shape``.
    """
    error = (model(x, theta) - y).ravel()
    # error (m,) dot x (m, n) sums error * x[:, j] over the samples for every j.
    grad = np.dot(error, x) / len(x)
    return grad.reshape(theta.shape)
# Stop-criterion identifiers, used to compare three gradient-descent strategies
STOP_ITER = 0  # stop once the iteration count exceeds the threshold
STOP_COST = 1  # stop once the change between the last two costs is below the threshold
STOP_GRAD = 2  # stop once the gradient norm is below the threshold
def stopCriterion(type, value, threshold):
    """Decide whether gradient descent should stop.

    Args:
        type: One of ``STOP_ITER``, ``STOP_COST`` or ``STOP_GRAD``. (The
            name shadows the builtin but is kept so keyword callers still
            work.)
        value: The iteration count for ``STOP_ITER``, the sequence of costs
            so far for ``STOP_COST`` (at least two entries), or the
            gradient for ``STOP_GRAD``.
        threshold: Limit that ``value`` is compared against.

    Returns:
        A truthy value when the selected criterion is met.

    Raises:
        ValueError: If ``type`` is not a known criterion. Previously this
            returned ``None``, which callers treat as "keep going" forever.
    """
    if type == STOP_ITER:
        return value > threshold
    if type == STOP_COST:
        return abs(value[-1] - value[-2]) < threshold
    if type == STOP_GRAD:
        return np.linalg.norm(value) < threshold
    raise ValueError("unknown stop criterion: {!r}".format(type))
# Shuffle
def shuffleData(data):
    """Shuffle the rows of ``data`` in place and split features from labels.

    Args:
        data: 2-D array whose last column holds the labels. It is modified
            in place by ``np.random.shuffle``.

    Returns:
        Tuple ``(x, y)``: every column but the last, and the last column
        kept 2-D. Both are slices of ``data``.
    """
    np.random.shuffle(data)
    features = data[:, :-1]
    labels = data[:, -1:]
    return features, labels
def descent(data, theta, batchSize, stopType, thresh, alpha):
    """Fit ``theta`` by batch, mini-batch or stochastic gradient descent.

    Args:
        data: 2-D array of samples, label in the last column. It is
            shuffled in place by ``shuffleData``.
        theta: Initial parameter array.
        batchSize: Number of rows used for each gradient evaluation.
        stopType: One of ``STOP_ITER``, ``STOP_COST`` or ``STOP_GRAD``.
        thresh: Threshold handed to ``stopCriterion``.
        alpha: Learning rate.

    Returns:
        Tuple ``(theta, i - 1, costs, grad, duration)`` where ``i`` is the
        number of parameter updates performed, ``costs`` holds the cost
        before training and after every update, ``grad`` is the last
        gradient and ``duration`` is the elapsed wall-clock time in seconds.

    Raises:
        ValueError: If ``stopType`` is not a known stop criterion.
    """
    if stopType not in (STOP_ITER, STOP_COST, STOP_GRAD):
        raise ValueError("unknown stop criterion: {!r}".format(stopType))

    init_time = time.time()
    # Take the sample count from the data rather than a module-level global.
    n_samples = data.shape[0]
    i = 0  # number of parameter updates
    k = 0  # start offset of the current batch
    x, y = shuffleData(data)
    costs = [cost(x, y, theta)]
    while True:
        grad = gradient(x[k:k + batchSize], y[k:k + batchSize], theta)
        k += batchSize  # advance to the next batch
        if k >= n_samples:
            # Every sample has been used: start over with a fresh shuffle.
            k = 0
            x, y = shuffleData(data)
        theta = theta - alpha * grad  # parameter update
        costs.append(cost(x, y, theta))  # cost after the update
        i += 1
        if stopType == STOP_ITER:
            value = i
        elif stopType == STOP_COST:
            value = costs
        else:
            value = grad
        if stopCriterion(stopType, value, thresh):
            break
    return theta, i - 1, costs, grad, time.time() - init_time
def runExpe(data, theta, batchSize, stopType, thresh, alpha):
    """Run one gradient-descent experiment, print a summary and plot the cost curve.

    Args:
        data: 2-D array of samples, label in the last column.
        theta: Initial parameter array.
        batchSize: Rows per gradient evaluation. A value equal to the number
            of rows in ``data`` is reported as full-batch "Gradient" descent,
            1 as "Stochastic", anything else as "Mini-batch".
        stopType: One of ``STOP_ITER``, ``STOP_COST`` or ``STOP_GRAD``.
        thresh: Threshold for the stop criterion.
        alpha: Learning rate.

    Returns:
        The fitted ``theta``.
    """
    theta, n_iter, costs, _, dur = descent(data, theta, batchSize, stopType, thresh, alpha)

    # Heuristic label: more than one value above 2 in column 1 counts as unscaled data.
    name = "Original" if (data[:, 1] > 2).sum() > 1 else "Scaled"
    name += " data - learning rate: {} - ".format(alpha)

    # Compare against the actual sample count, not a module-level global.
    if batchSize == data.shape[0]:
        strDescType = "Gradient"
    elif batchSize == 1:
        strDescType = "Stochastic"
    else:
        strDescType = "Mini-batch ({})".format(batchSize)
    name += strDescType + " descent - Stop: "

    if stopType == STOP_ITER:
        strStop = "{} iterations".format(thresh)
    elif stopType == STOP_COST:
        strStop = "costs change < {}".format(thresh)
    else:
        strStop = "gradient norm < {}".format(thresh)
    name += strStop

    print("***{}\nTheta: {} - Iter: {} - Last cost: {:03.2f} - Duration: {:03.2f}s".format(
        name, theta, n_iter, costs[-1], dur))

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(np.arange(len(costs)), costs, 'r')
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Cost')
    ax.set_title(name.upper() + ' - Error vs. Iteration')
    return theta
# Full-batch gradient descent: every sample is used per update, stopping
# after 5000 iterations. The sample count is taken from the data instead of
# being hard-coded.
n = orig_data.shape[0]
runExpe(orig_data, theta, n, STOP_ITER, thresh=5000, alpha=0.000001)
案例一
sklearn 中 digits 手写数据集预测.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
digit = load_digits()
# The L1 penalty is not supported by the default lbfgs solver, so a solver
# that supports it is selected explicitly.
# NOTE(review): confirm liblinear is still accepted for multiclass targets in
# the installed scikit-learn version.
lr = LogisticRegression(C=1.0, penalty="l1", tol=0.01, solver="liblinear")
# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(digit.data, digit.target, test_size=0.33, random_state=42)
lr.fit(x_train, y_train)
y_predict = lr.predict(x_test)
# Prediction accuracy
print("预测的值为: ", y_predict)
print("在测试集上的准确率: ", lr.score(x_test, y_test))
输出结果:
预测的值为: [6 9 3 7 2 1 5 2 5 2 1 8 4 0 4 2 3 7 8 8 4 3 9 7 5 6 3 5 6 3 4 9 1 4 4 6 9
4 7 6 6 9 1 3 6 1 3 0 6 5 5 1 9 5 6 0 9 0 0 1 0 4 5 2 4 5 7 0 7 5 9 5 5 4
7 0 4 5 5 9 9 0 2 3 8 0 6 4 4 9 1 2 8 3 5 2 9 0 4 4 4 3 5 3 1 3 5 9 4 2 7
7 4 4 1 9 2 7 8 7 2 6 9 4 0 7 2 7 5 8 7 5 7 9 0 6 6 4 2 8 0 9 4 6 9 9 6 9
0 5 5 6 6 0 6 4 2 9 3 8 7 2 9 0 4 5 8 6 5 8 9 8 4 2 1 3 7 7 2 2 3 9 8 0 3
2 2 5 6 9 9 4 1 5 4 2 3 6 4 8 5 9 5 7 8 9 4 8 1 5 4 4 9 6 1 8 6 0 4 5 2 7
1 6 4 5 6 0 3 2 3 6 7 1 5 1 4 7 6 5 1 5 5 1 4 2 8 8 9 8 7 6 2 2 2 3 4 8 8
3 6 0 9 7 7 0 1 0 4 5 1 5 3 6 0 4 1 0 0 3 6 5 9 7 3 5 5 9 9 8 5 3 3 2 0 5
8 3 4 0 2 4 6 4 3 4 5 0 5 2 1 3 1 4 1 1 7 0 1 5 2 1 2 8 7 0 6 4 8 8 5 1 8
4 5 8 7 9 8 6 0 6 2 0 7 9 8 9 5 2 7 7 1 8 7 4 3 8 3 5 6 0 0 3 0 5 0 0 4 1
2 8 4 5 9 6 3 1 8 8 4 2 3 8 9 8 8 5 0 6 3 3 7 1 6 4 1 2 1 1 6 4 7 4 8 3 4
0 5 1 9 4 5 7 6 3 7 0 5 9 7 5 9 7 4 2 1 9 0 7 5 8 3 6 3 9 6 9 5 0 1 5 5 8
3 3 6 2 6 5 5 2 0 8 7 3 7 0 2 2 3 5 8 7 3 6 5 9 9 2 1 6 3 0 7 1 1 9 6 1 1
0 0 2 9 3 8 9 3 7 7 1 3 5 4 6 8 2 1 8 8 7 6 9 2 0 4 4 8 8 7 1 3 1 7 1 3 5
1 7 0 0 2 2 6 9 4 8 9 0 6 7 7 9 5 4 7 0 7 6 8 7 1 4 6 2 8 7 5 9 0 3 9 6 6
1 9 1 2 9 8 9 7 4 8 5 5 9 7 7 6 8 1 3 5 7 9 5 5 2 1 1 2 2 4 8 7 5 8 8 9 4
9 0]
在测试集上的准确率: 0.9646464646464646