1 什么是逻辑回归
逻辑回归:解决分类问题
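回归方法怎么解决分类问题?
逻辑回归将样本的特征和样本发生的概率联系起来:先计算线性组合 $\theta^T x_b$,再用 sigmoid 函数 $\sigma(t) = \frac{1}{1 + e^{-t}}$ 将其映射为 (0, 1) 之间的概率 $\hat{p}$(概率是一个数);当 $\hat{p} \ge 0.5$ 时预测为类别1,否则预测为类别0.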
2 逻辑回归的损失函数
3 实现逻辑回归算法
在playML包(文件夹)中添加LogisticRegression.py文件,实现逻辑回归算法:
import numpy as np
from .metrics import accuracy_score
class LogisticRegression:
    """Binary logistic-regression classifier trained with batch gradient descent.

    After ``fit`` the learned parameters are available as:

    * ``interception_`` -- the bias term (``theta[0]``)
    * ``coef_`` -- one weight per input feature (``theta[1:]``)
    """

    def __init__(self):
        """Create an unfitted model; every parameter is None until fit() runs."""
        self.coef_ = None
        self.interception_ = None
        self._theta = None

    def _sigmoid(self, t):
        """Apply the logistic function 1 / (1 + e^-t) element-wise."""
        return 1 / (1 + np.exp(-t))

    def fit(self, x_train, y_train, eta=0.01, n_iter=1e3):
        """Train the model on x_train / y_train with gradient descent.

        Args:
            x_train: 2-D array, one row per sample.
            y_train: 1-D array of 0/1 labels, one per row of x_train.
            eta: learning rate (step size).
            n_iter: maximum number of gradient steps.  Training stops
                earlier once the loss changes by less than 1e-8 between
                two consecutive steps.

        Returns:
            self, so calls can be chained.

        Raises:
            AssertionError: if x_train and y_train differ in sample count.
        """
        assert x_train.shape[0] == y_train.shape[0], \
            'the size of x_train must be equal to y_train'

        def J(theta, x_b, y):
            """Cross-entropy loss averaged over the samples."""
            y_hat = self._sigmoid(x_b.dot(theta))
            # np.log(0) does not raise -- it returns -inf and 0 * -inf is nan,
            # which would make the convergence test below always False.
            # Keeping the probabilities strictly inside (0, 1) avoids that.
            y_hat = np.clip(y_hat, 1e-15, 1 - 1e-15)
            return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)

        def dJ(theta, x_b, y):
            """Gradient of the loss with respect to theta."""
            return x_b.T.dot(self._sigmoid(x_b.dot(theta)) - y) / len(x_b)

        def gradient_descent(x_b, y, initial_theta, eta, n_iter, epsilon=1e-8):
            """Run at most n_iter steps, stopping once the loss has converged."""
            theta = initial_theta
            last_loss = J(theta, x_b, y)
            # The cap guarantees termination even when the loss never
            # settles (e.g. eta too large and the iterates oscillate).
            for _ in range(int(n_iter)):
                theta = theta - eta * dJ(theta, x_b, y)
                loss = J(theta, x_b, y)
                if abs(loss - last_loss) < epsilon:
                    break
                last_loss = loss
            return theta

        # Prepend a column of ones so theta[0] acts as the intercept.
        x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        initial_theta = np.zeros(x_b.shape[1])
        self._theta = gradient_descent(x_b, y_train, initial_theta, eta, n_iter)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict_proba(self, x_predict):
        """Return, for each row of x_predict, the estimated probability of class 1.

        Raises:
            AssertionError: if the model is unfitted or the number of
                columns of x_predict differs from the number of coefficients.
        """
        assert self.interception_ is not None and self.coef_ is not None, \
            'must fit before predict'
        assert x_predict.shape[1] == len(self.coef_), \
            'the size of x_predict must be equal to self.coef_'
        x_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
        return self._sigmoid(x_b.dot(self._theta))

    def predict(self, x_predict):
        """Return the predicted 0/1 label for each row of x_predict.

        Input validation is done by predict_proba, which raises
        AssertionError for an unfitted model or a column-count mismatch.
        """
        proba = self.predict_proba(x_predict)
        # Boolean mask -> integers: True becomes 1, False becomes 0.
        return np.array(proba >= 0.5, dtype='int')

    def score(self, x_test, y_test):
        """Return accuracy_score(y_test, predictions for x_test)."""
        y_predict = self.predict(x_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return 'LogisticRegression()'
在jupyter notebook中输入:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
iris = datasets.load_iris()
# The model above is a binary classifier, so keep only the samples of
# classes 0 and 1; keep only the first two feature columns so the data
# can be drawn on a 2-D plot.
keep = iris.target < 2
x = iris.data[keep, :2]
y = iris.target[keep]
for arr in (x, y):
    print(arr.shape)
>>>(100, 2)
>>>(100,)
# Scatter-plot the data, one colour per class (class 0 blue, class 1 red).
for label, colour in ((0, 'b'), (1, 'r')):
    members = x[y == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)
plt.show()
输出:
使用逻辑回归:
# Split x / y into a training part and a test part with playML's splitter.
from playML.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)
# Train the hand-written model with its default eta and n_iter.
from playML.LogisticRegression import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
>>>LogisticRegression()
# Accuracy of the fitted model on the test part of the data.
log_reg.score(x_test, y_test)
>>>1.0
# Inspect the predicted class-1 probability for every sample in x_test
log_reg.predict_proba(x_test)
>>>array([1.16344909e-04, 9.95353580e-01, 3.69126011e-02, 9.99847407e-01,
9.62006327e-01, 9.99764277e-01, 9.98174748e-01, 3.39415385e-02,
5.84672286e-03, 9.83403330e-01, 4.17805392e-05, 6.37474205e-03,
9.99999802e-01, 9.99999537e-01, 5.83783203e-04, 8.21559266e-05,
9.92345362e-01, 9.98593427e-01, 9.99972132e-01, 6.36809282e-04])
4 在逻辑回归中使用多项式特征
import numpy as np
import matplotlib.pyplot as plt
# 200 random 2-D points, each coordinate drawn from N(0, 1).
x = np.random.normal(0, 1, size=(200, 2))
# Label 1 when the squared distance from the origin is below 1.5, so the
# two classes are separated by a circle rather than by a straight line.
inside = x[:, 0] ** 2 + x[:, 1] ** 2 < 1.5
y = inside.astype('int')
for label in (0, 1):
    plt.scatter(x[y == label, 0], x[y == label, 1])
plt.show()
输出:
使用逻辑回归:
# Fit the plain model (no polynomial features) on the circular data.
from playML.LogisticRegression import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(x, y)
>>>LogisticRegression()
# Accuracy measured on the same x, y the model was fitted on.
log_reg.score(x, y)
>>>0.585
在逻辑回归中添加多项式特征:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
def PolynomialLogisticRegression(degree):
    """Build a pipeline: polynomial features -> standard scaling -> logistic regression.

    Args:
        degree: passed to PolynomialFeatures as its ``degree`` argument.

    Returns:
        An unfitted Pipeline with the steps 'poly', 'std_scaler' and 'log_reg'.
    """
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('log_reg', LogisticRegression()),
    ]
    return Pipeline(steps)
# Degree-2 features include the squared terms that the circular labelling rule is built from.
poly_log_reg = PolynomialLogisticRegression(degree = 2)
poly_log_reg.fit(x, y)
>>>Pipeline(memory=None,
steps=[('poly',
PolynomialFeatures(degree=2, include_bias=True,
interaction_only=False, order='C')),
('std_scaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('log_reg', LogisticRegression())],
verbose=False)
# Accuracy of the pipeline on the same x, y it was fitted on.
poly_log_reg.score(x, y)
>>>1.0
5 scikit-learn中的逻辑回归
6 OvR与OvO