1. 逻辑斯谛回归模型
1.1 逻辑斯谛分布
分布函数:
F
(
x
)
=
P
(
X
≤
x
)
=
1
1
+
e
1
(
x
−
μ
)
/
γ
F(x)=P(X\leq x)=\frac{1}{1+e^{1(x-\mu)/\gamma}}
F(x)=P(X≤x)=1+e1(x−μ)/γ1
密度函数:
f
(
x
)
=
F
′
(
x
)
=
e
−
(
x
−
μ
)
/
γ
γ
(
x
+
e
(
−
(
x
−
μ
)
/
γ
)
2
f(x)=F'(x)=\frac{e^{-(x-\mu)/\gamma}}{\gamma(x+e^(-(x-\mu)/\gamma)^2}
f(x)=F′(x)=γ(x+e(−(x−μ)/γ)2e−(x−μ)/γ
1.2 二项逻辑斯谛回归模型
P ( Y = 1 ∣ x ) = e x p ( w ⋅ x ) 1 + e x p ( w ⋅ x ) P(Y=1|x)=\frac{exp(w\cdot x)}{1+exp(w\cdot x)} P(Y=1∣x)=1+exp(w⋅x)exp(w⋅x)
P ( Y = 0 ∣ x ) = 1 1 + e x p ( w ⋅ x ) P(Y=0|x)=\frac{1}{1+exp(w\cdot x)} P(Y=0∣x)=1+exp(w⋅x)1
逻辑斯谛回归模型输出 Y = 1 Y=1 Y=1的对数几率是输入 x x x的线性函数,通过sigmoid函数将线性函数转换成0到1之间的概率
1.3 模型参数估计
逻辑斯谛回归使用极大似然估计法估计参数模型,对数似然函数是 L ( w ) = ∑ i = 1 N [ y i ( w ⋅ x i ) − log ( 1 + e x p ( w ⋅ x i ) ] L(w)=\sum_{i=1}^{N}[y_i(w\cdot x_i)-\log(1+exp(w\cdot x_i)] L(w)=∑i=1N[yi(w⋅xi)−log(1+exp(w⋅xi)],可以采用梯度下降法和拟牛顿法求解。
2. python实现
2.1 逻辑斯谛回归
# -*- encoding: utf-8 -*-
'''
@File : Chapter5_logistic.py
@Contact : zzldouglas97@gmail.com
@Modify Time @Author @Version @Desciption
------------ ------- -------- -----------
2019/5/6 23:37 Douglas.Z 1.0 None
'''
# import lib
import numpy as np
import matplotlib.pyplot as plt
import random
from matplotlib.font_manager import FontProperties
def loadDataSet():
"""
加载数据集
:return:
"""
dataMat = []
labelMat = []
# 打开文件
fr = open('testSet.txt')
# 遍历每一行
for line in fr.readlines():
# 切分每一行生成array
lineArr = line.strip().split()
# 将第一二个放到dataMat
dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
# 将第三个放到labelMat
labelMat.append(int(lineArr[2]))
return dataMat, labelMat
def sigmoid(inX):
"""
sigmoid阶跃函数
:param inX:
:return:
"""
return 1.0 / (1 + np.exp(-inX))
def grandAscent(dataMatIn, classLabels):
"""
梯度上升算法
:param dataMatIn:
:param classLabels:
:return:
"""
dataMatrix = np.mat(dataMatIn)
labelMatrix = np.mat(classLabels).transpose()
m, n = dataMatrix.shape
alpha = 0.001
maxIter = 500
weights = np.ones((n, 1))
weight_array = np.array([])
for k in range(maxIter):
# sig(X * W)
h = sigmoid(dataMatrix * weights)
# error = Y - sig(X * W)
error = (labelMatrix - h)
# W1 = W0 + a * XT * (Y - sig(X * W)
weights = weights + alpha * dataMatrix.transpose() * error
weight_array = np.append(weight_array, weights)
weight_array = weight_array.reshape(maxIter, n)
return weights.getA(), weight_array
def SGA(dataMatIn, classLabels, numIter=150):
"""
随机梯度上升算法
:param dataMatIn:
:param classLabels:
:return:
"""
m, n = np.shape(dataMatIn)
weights = np.ones(n)
weight_array = np.array([])
# 设定迭代次数
for j in range(numIter):
dataIndex = list(range(m))
for i in range(m):
# 学习率递减
alpha = 4 / (1.0 + j + i) + 0.01
# 随机获取样本进行迭代
randIndex = int(random.uniform(0, len(dataIndex)))
h = sigmoid(np.sum(dataMatIn[randIndex] * weights))
error = (classLabels[randIndex] - h)
weights = weights + alpha * dataMatIn[randIndex] * error
weight_array = np.append(weight_array, weights, axis=0)
# 删除使用过的样本
del (dataIndex[randIndex])
weight_array = weight_array.reshape(numIter * m, n)
return weights, weight_array
def plotFitLine(dataSet, labelSet, weights):
"""
绘制决策边界
:param dataSet:
:param labelSet:
:param weights:
:return:
"""
dataArr = np.array(dataSet)
labelMat = labelSet
n = np.shape(dataSet)[0]
xcord1 = []
ycord1 = []
xcord2 = []
ycord2 = []
# 如果是第一类则放在cord1,否则放在cord2
for i in range(n):
if int(labelSet[i]) == 1:
xcord1.append(dataArr[i, 1])
ycord1.append(dataArr[i, 2])
else:
xcord2.append(dataArr[i, 1])
ycord2.append(dataArr[i, 2])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
ax.scatter(xcord2, ycord2, s=30, c='green')
# 利用x1,x2关系求出x2
x = np.arange(-3.0, 3.0, 0.1)
y = (-weights[0] - weights[1] * x) / weights[2]
ax.plot(x, y)
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
def plotWeights(weights_array1, weights_array2):
# 设置汉字格式
font = FontProperties(fname=r"C:\Windows\Fonts\simsun.ttc", size=14)
# 将fig画布分隔成1行1列,不共享x轴和y轴,fig画布的大小为(13,8)
# 当nrow=3,nclos=2时,代表fig画布被分为六个区域,axs[0][0]表示第一行第一列
fig, axs = plt.subplots(nrows=3, ncols=2, sharex=False, sharey=False, figsize=(20, 10))
x1 = np.arange(0, len(weights_array1), 1)
# 绘制w0与迭代次数的关系
axs[0][0].plot(x1, weights_array1[:, 0])
axs0_title_text = axs[0][0].set_title(u'梯度上升算法:回归系数与迭代次数关系', FontProperties=font)
axs0_ylabel_text = axs[0][0].set_ylabel(u'W0', FontProperties=font)
plt.setp(axs0_title_text, size=20, weight='bold', color='black')
plt.setp(axs0_ylabel_text, size=20, weight='bold', color='black')
# 绘制w1与迭代次数的关系
axs[1][0].plot(x1, weights_array1[:, 1])
axs1_ylabel_text = axs[1][0].set_ylabel(u'W1', FontProperties=font)
plt.setp(axs1_ylabel_text, size=20, weight='bold', color='black')
# 绘制w2与迭代次数的关系
axs[2][0].plot(x1, weights_array1[:, 2])
axs2_xlabel_text = axs[2][0].set_xlabel(u'迭代次数', FontProperties=font)
axs2_ylabel_text = axs[2][0].set_ylabel(u'W2', FontProperties=font)
plt.setp(axs2_xlabel_text, size=20, weight='bold', color='black')
plt.setp(axs2_ylabel_text, size=20, weight='bold', color='black')
x2 = np.arange(0, len(weights_array2), 1)
# 绘制w0与迭代次数的关系
axs[0][1].plot(x2, weights_array2[:, 0])
axs0_title_text = axs[0][1].set_title(u'改进的随机梯度上升算法:回归系数与迭代次数关系', FontProperties=font)
axs0_ylabel_text = axs[0][1].set_ylabel(u'W0', FontProperties=font)
plt.setp(axs0_title_text, size=20, weight='bold', color='black')
plt.setp(axs0_ylabel_text, size=20, weight='bold', color='black')
# 绘制w1与迭代次数的关系
axs[1][1].plot(x2, weights_array2[:, 1])
axs1_ylabel_text = axs[1][1].set_ylabel(u'W1', FontProperties=font)
plt.setp(axs1_ylabel_text, size=20, weight='bold', color='black')
# 绘制w2与迭代次数的关系
axs[2][1].plot(x2, weights_array2[:, 2])
axs2_xlabel_text = axs[2][1].set_xlabel(u'迭代次数', FontProperties=font)
axs2_ylabel_text = axs[2][1].set_ylabel(u'W1', FontProperties=font)
plt.setp(axs2_xlabel_text, size=20, weight='bold', color='black')
plt.setp(axs2_ylabel_text, size=20, weight='bold', color='black')
plt.show()
if __name__ == '__main__':
dataset, labelset = loadDataSet()
weight1, weightArray1 = grandAscent(dataset, labelset)
weight2, weightArray2 = SGA(np.array(dataset), labelset)
plotFitLine(dataset, labelset, weight1)
print(weight1)
print(weight2)
plotWeights(weightArray1, weightArray2)
2.2 病马预测
# -*- encoding: utf-8 -*-
'''
@File : Chapter5_Horse.py
@Contact : zzldouglas97@gmail.com
@Modify Time @Author @Version @Desciption
------------ ------- -------- -----------
2019/5/7 12:36 Douglas.Z 1.0 None
'''
# import lib
import numpy as np
import random
def sigmoid(inX):
"""
sigmoid阶跃函数
:param inX:
:return:
"""
# 优化方法
if inX >= 0:
return 1.0/(1+np.exp(-inX))
else:
return np.exp(inX)/(1+np.exp(inX))
# return 1.0 / (1 + np.exp(-inX))
def grandAscent(dataMatIn, classLabels, maxIter):
"""
梯度上升算法
:param dataMatIn:
:param classLabels:
:return:
"""
dataMatrix = np.mat(dataMatIn)
labelMatrix = np.mat(classLabels).transpose()
m, n = dataMatrix.shape
alpha = 0.001
weights = np.ones((n, 1))
weight_array = np.array([])
for k in range(maxIter):
# sig(X * W)
h = sigmoid(dataMatrix * weights)
# error = Y - sig(X * W)
error = (labelMatrix - h)
# W1 = W0 + a * XT * (Y - sig(X * W)
weights = weights + alpha * dataMatrix.transpose() * error
weight_array = np.append(weight_array, weights)
weight_array = weight_array.reshape(maxIter, n)
return weights.getA(), weight_array
def SGA(dataMatIn, classLabels, numIter=150):
"""
随机梯度上升算法
:param dataMatIn:
:param classLabels:
:return:
"""
m, n = np.shape(dataMatIn)
weights = np.ones(n)
weight_array = np.array([])
# 设定迭代次数
for j in range(numIter):
dataIndex = list(range(m))
for i in range(m):
# 学习率递减
alpha = 4 / (1.0 + j + i) + 0.01
# 随机获取样本进行迭代
randIndex = int(random.uniform(0, len(dataIndex)))
h = sigmoid(np.sum(dataMatIn[randIndex] * weights))
error = (classLabels[randIndex] - h)
weights = weights + alpha * dataMatIn[randIndex] * error
weight_array = np.append(weight_array, weights, axis=0)
# 删除使用过的样本
del (dataIndex[randIndex])
weight_array = weight_array.reshape(numIter * m, n)
return weights, weight_array
def classifyVector(inX, weights):
"""
分类函数
:param inX:
:param weights:
:return:
"""
prob = sigmoid(sum(inX * weights))
if prob > 0.5:
return 1.0
else:
return 0.0
def colicTest():
"""
测试
:return:
"""
frTrain = open('horseColicTraining.txt')
frTest = open('horseColicTest.txt')
traningSet = []
traningLabels = []
for line in frTrain.readlines():
currLine = line.strip().split('\t')
lineArr = []
for i in range(len(currLine)-1):
lineArr.append(float(currLine[i]))
traningSet.append(lineArr)
traningLabels.append(float(currLine[-1]))
traningWeights = SGA(np.array(traningSet), traningLabels, 500)
errorCount = 0.0
numTestVec = 0.0
for line in frTest.readlines():
numTestVec += 1.0
currLine = line.strip().split('\t')
lineArr = []
for i in range(21):
lineArr.append(float(currLine[i]))
if int(classifyVector(np.array(lineArr), traningWeights)) != int(currLine[21]):
errorCount += 1
erroRate = (float(errorCount) / numTestVec)
return erroRate
if __name__ == '__main__':
numTest = 10
errorSum = 0.0
for k in range(numTest):
errorSum += colicTest()
print("after {}d iterations the average error rate is: {}".format(numTest, errorSum / float(numTests)))
# print(errorSum / float(numTest))
3. 参考资料
4. 总结
优点:计算代价不高,易于实现
缺点:容易欠拟合,分类精度可能不高
适用数据类型:数值型和标称型数据