前言
在上一篇博客中,逻辑回归的最佳回归系数做了说明,也通过代码来实现了怎么求解最佳回归系数。下面希望用可视化的方式来进一步理解这个算法。
画决策边界
因为逻辑回归常常是解决二分类的问题,所以我们画的决策边界,说得通俗一点就是用一条直线来将不同的类别分割开来。
- 画样本点及逻辑回归拟合直线
def plot_best_fit(weights):
    """Scatter-plot both classes and draw the fitted decision boundary.

    Args:
        weights: array-like of three regression coefficients [w0, w1, w2]
                 (w0 is the intercept term paired with the constant 1.0
                 column prepended by load_dataset).
    """
    samples, labels = load_dataset()
    data = array(samples)
    pos_x, pos_y, neg_x, neg_y = [], [], [], []
    # Split the points by class label so each class gets its own marker.
    for row, label in zip(data, labels):
        if int(label) == 1:
            pos_x.append(row[1])
            pos_y.append(row[2])
        else:
            neg_x.append(row[1])
            neg_y.append(row[2])
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    # Class 1: red squares.
    ax.scatter(pos_x, pos_y, s=60, c="red", marker='s')
    # Class 0: green circles.
    ax.scatter(neg_x, neg_y, s=60, c="green", marker="o")
    # Decision boundary: w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2.
    x = arange(-3.0, 3.0, 0.1)
    ax.plot(x, (-weights[0] - weights[1] * x) / weights[2])
    pyplot.xlabel('X1')
    pyplot.ylabel('X2')
    pyplot.show()
- 完整代码
from numpy import *
from matplotlib import pyplot
def load_dataset():
    """Load the logistic-regression sample file 'lr-testSet.txt'.

    Each valid line holds two feature values and an integer class label,
    whitespace-separated.  A constant 1.0 is prepended to every sample so
    that the first regression coefficient acts as the intercept term.

    Returns:
        (dataSet, labelSet): list of [1.0, x1, x2] feature rows and the
        parallel list of integer labels.
    """
    dataSet = []   # feature rows
    labelSet = []  # class labels
    # Context manager guarantees the file handle is closed even on error
    # (the original opened it and never closed it).
    with open('lr-testSet.txt') as fr:
        for line in fr:
            lineArr = line.strip().split()
            # Skip blank or malformed lines; the original's `== 1` check
            # let empty lines through and crashed on lineArr[2].
            if len(lineArr) < 3:
                continue
            # Prepend the bias input x0 = 1.0 for the intercept weight.
            dataSet.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelSet.append(int(lineArr[2]))
    return dataSet, labelSet
def sigmoid(inX):
    """Return the logistic sigmoid 1 / (1 + e^(-x)) of inX.

    Works element-wise on numpy arrays/matrices as well as scalars,
    since numpy's exp is vectorized.
    """
    return 1.0 / (exp(-inX) + 1)
def grad_ascent(dataSet, labelSet):
    """Fit logistic-regression coefficients by batch gradient ascent.

    Args:
        dataSet:  m samples, each a row of n feature values (the first
                  being the constant 1.0 bias input).
        labelSet: m integer class labels (0 or 1).

    Returns:
        n x 1 numpy matrix of fitted regression coefficients.
    """
    X = mat(dataSet)                    # m x n design matrix
    y = mat(labelSet).transpose()       # labels as an m x 1 column
    m, n = shape(X)
    print("样本数and特征数:", m, n)
    alpha = 0.001        # step size toward the gradient direction
    maxCycles = 500      # fixed iteration budget
    weights = ones((n, 1))  # start from all-ones coefficients
    for _ in range(maxCycles):
        # m x 1 vector of prediction errors (label minus sigmoid output).
        residual = y - sigmoid(X * weights)
        # X^T * residual is the gradient of the log-likelihood; ascend it.
        weights = weights + alpha * X.transpose() * residual
    return weights
def plot_best_fit(weights):
    """Scatter-plot both classes and draw the fitted decision boundary.

    Args:
        weights: array-like of three regression coefficients [w0, w1, w2]
                 (w0 is the intercept term paired with the constant 1.0
                 column prepended by load_dataset).
    """
    samples, labels = load_dataset()
    data = array(samples)
    pos_x, pos_y, neg_x, neg_y = [], [], [], []
    # Split the points by class label so each class gets its own marker.
    for row, label in zip(data, labels):
        if int(label) == 1:
            pos_x.append(row[1])
            pos_y.append(row[2])
        else:
            neg_x.append(row[1])
            neg_y.append(row[2])
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    # Class 1: red squares.
    ax.scatter(pos_x, pos_y, s=60, c="red", marker='s')
    # Class 0: green circles.
    ax.scatter(neg_x, neg_y, s=60, c="green", marker="o")
    # Decision boundary: w0 + w1*x + w2*y = 0  =>  y = (-w0 - w1*x) / w2.
    x = arange(-3.0, 3.0, 0.1)
    ax.plot(x, (-weights[0] - weights[1] * x) / weights[2])
    pyplot.xlabel('X1')
    pyplot.ylabel('X2')
    pyplot.show()
# Test driver: load the data, fit the weights, then visualize the boundary.
dataSet, labelSet = load_dataset()
lr_w = grad_ascent(dataSet, labelSet)
print("最佳回归系数:\n", lr_w)
# .getA() converts the numpy matrix to a plain ndarray so that
# weights[i] indexing inside plot_best_fit behaves element-wise.
plot_best_fit(lr_w.getA())
对于 y = (-weights[0] - weights[1]*x) / weights[2] 的说明:

我们拟合直线的表达式为 $f(x) = w_0 x_0 + w_1 x_1 + w_2 x_2$,其中 $x_0$ 被设置为 1。决策边界对应 sigmoid 函数的输入为 0,因此令 $0 = w_0 x_0 + w_1 x_1 + w_2 x_2$。由于 $x_2$ 对应画图时的纵坐标 y,即 $w_0 \cdot 1 + w_1 x_1 + w_2 y = 0$,所以求解出 $y = (-w_0 - w_1 x) / w_2$。
代码中其他函数的实现说明见上篇博客
可视化结果:
根据结果可以看出,只分错了个别点,效果还是不错。