目录
一、决策边界
逻辑回归本质上是在我们的特征平面中找到一条直线,用这条直线将所有的样本分成两个类别,而这条分界线也被称之为决策边界。
代码示例:
import numpy as np
import matplotlib.pyplot as plt
# Generate random samples: 200 points with 2 standard-normal features each
np.random.seed(666)
X = np.random.normal(0, 1, size=(200, 2))
# Label is 1 inside the circle x0^2 + x1^2 < 1.5, else 0 -> a non-linear boundary
y = np.array((X[:,0]**2+X[:,1]**2)<1.5, dtype='int')
# Fit the (project-local) logistic regression; a plain linear model cannot
# separate this circular data well
from mySklearn.LogisticRegression import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X,y)
#log_reg.score(X, y)
#绘制决策边界的函数
def plot_decision_boundary(model, axis):
    """Plot the decision boundary of a fitted classifier over a 2-D feature plane.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` on an (n, 2) array.
    axis : sequence [x_min, x_max, y_min, y_max] — the plotting window.
    """
    # Build a dense grid over the window (~100 points per unit on each axis)
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    # Classify every grid point and reshape back onto the grid for contouring
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)
    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A','#FFF59D','#90CAF9'])
    # FIX: the original passed linewidth=5, but contourf fills regions and does
    # not accept a 'linewidth' kwarg (recent matplotlib raises on it) — removed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
# Draw the (linear) decision boundary, then overlay both classes of samples
plot_decision_boundary(log_reg, axis=[-4, 4, -4, 4])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
运行结果:
可以看出有非常多的错误分类,这也是我们分类准确度很低的原因。
代码示例:
#添加逻辑回归的多项式项
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#管道
def PolynomialLogisticRegression(degree):
    """Build a pipeline: polynomial features -> standardization -> logistic regression."""
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('log_reg', LogisticRegression()),
    ]
    return Pipeline(steps)
# Degree-2 polynomial features let the linear classifier learn a circular boundary
poly_log_reg = PolynomialLogisticRegression(degree=2)
poly_log_reg.fit(X, y)
# Visualize the (now circular) decision boundary together with the samples
plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
运行结果:
现在的决策边界是一个圆形,能够很好的对非线性数据进行划分。
二、逻辑回归中使用正则化
import numpy as np
import matplotlib.pyplot as plt
# Generate random samples (the same circular data as in section one)
np.random.seed(666)
X = np.random.normal(0, 1, size=(200, 2))
y = np.array((X[:,0]**2+X[:,1]**2)<1.5, dtype='int')
# Inject label noise: flip up to 20 randomly chosen points to class 1
for _ in range(20):
    y[np.random.randint(200)] = 1
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#正则化管道
def PolynomialLogisticRegression(degree, C, penalty,solver):
    """Pipeline of polynomial features, standardization, and a regularized logistic regression.

    C is the inverse regularization strength; penalty selects L1/L2; the solver
    must support the chosen penalty (e.g. 'liblinear' for L1).
    """
    regularized_clf = LogisticRegression(C=C, penalty=penalty, solver=solver)
    return Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('std_scaler', StandardScaler()),
        ('log_reg', regularized_clf),
    ])
# Small C = strong regularization; L1 penalty requires the liblinear solver
poly_log_reg = PolynomialLogisticRegression(degree=20, C=0.1, penalty='l1',solver='liblinear')
poly_log_reg.fit(X_train, y_train)
# L1 regularization keeps the degree-20 boundary from overfitting the noisy labels
plot_decision_boundary(poly_log_reg, axis=[-4, 4, -4, 4])
plt.scatter(X[y==0,0], X[y==0,1])
plt.scatter(X[y==1,0], X[y==1,1])
plt.show()
运行结果:
三、逻辑回归解决多分类问题
1、OvR
假设现在样本一共有四个类别,那么我们将红色的点作为一个类别(one),剩下其他颜色的点统称为其他类别(rest),即将四分类问题转换成二分类问题。以此类推,借助逻辑回归的二分类算法,对n个类别进行n次二分类,最终选择分类得分(概率)最高的那个类别作为预测结果。
2、OvO
同样,如果我们将上面四个类别的样本两两配对,就形成了C(n,2)=6个二分类问题;在这C(n,2)个二分类结果中,选择胜出次数最多的类别,进而判定该样本属于哪个类别。
代码示例:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Iris has 3 classes -> a genuine multi-class problem (all 4 features used)
iris = datasets.load_iris()
X = iris.data
y = iris.target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
from sklearn.linear_model import LogisticRegression
# multinomial + newton-cg = softmax regression: all classes handled jointly.
# NOTE(review): the original comment called this "OvO" — it is not; OvO is
# demonstrated below with OneVsOneClassifier.
log_reg = LogisticRegression(multi_class="multinomial", solver="newton-cg")
log_reg.fit(X_train, y_train)
#log_reg.score(X_test, y_test)
# OvR: fit one binary classifier per class (that class vs. all the rest)
from sklearn.multiclass import OneVsRestClassifier
ovr = OneVsRestClassifier(log_reg)
ovr.fit(X_train, y_train)
#ovr.score(X_test, y_test)
# OvO: fit one binary classifier per pair of classes, C(n,2) in total
from sklearn.multiclass import OneVsOneClassifier
ovo = OneVsOneClassifier(log_reg)
ovo.fit(X_train, y_train)
#ovo.score(X_test, y_test)