一、梯度上升求最佳回归系数
1.1 实例示例及注释
# coding: utf-8
from numpy import *
import matplotlib.pyplot as plt
import sys
def loadDataSet():
    """Load the 2-D test set from ``testSet.txt``.

    Each line holds two feature values and an integer class label,
    whitespace-separated. A constant 1.0 is prepended to every sample so
    the intercept can be learned as weights[0].

    Returns:
        (dataMat, labelMat): list of ``[1.0, x1, x2]`` rows and the
        parallel list of int labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open("testSet.txt") as fr:
        for line in fr:
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def sigmoid(inX):
    """Logistic function: map a real scalar (or numpy array) into (0, 1)."""
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
def gradAscent(dataMatIn, classLabels):
    """Batch gradient ascent for logistic regression.

    Args:
        dataMatIn: m x n samples (first column is the 1.0 bias term).
        classLabels: length-m sequence of 0/1 labels.

    Returns:
        n x 1 numpy matrix of fitted weights.
    """
    X = mat(dataMatIn)
    y = mat(classLabels).transpose()  # labels as a column vector
    m, n = shape(X)
    stepSize = 0.001       # learning rate
    numIterations = 500    # fixed number of full-batch updates
    w = ones((n, 1))       # start from all-ones weights
    for _ in range(numIterations):
        predictions = sigmoid(X * w)
        residual = y - predictions
        # Gradient of the log-likelihood over the whole batch: X^T (y - h)
        w = w + stepSize * X.transpose() * residual
    return w
def stocGradAscent0(dataMatrix, classLabels):
    """One-pass stochastic gradient ascent with a fixed step size.

    Visits each sample exactly once, in order, applying a single
    gradient update per sample. Expects ``dataMatrix`` as a numpy
    array; returns a length-n numpy array of weights.
    """
    numSamples, numFeatures = shape(dataMatrix)
    stepSize = 0.01
    w = ones(numFeatures)
    for idx in range(numSamples):
        prediction = sigmoid(sum(dataMatrix[idx] * w))
        residual = classLabels[idx] - prediction
        w = w + stepSize * residual * dataMatrix[idx]
    return w
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    Improvements over stocGradAscent0:
      * the step size alpha decays over time, damping oscillation,
        while the 0.01 floor keeps late updates from vanishing;
      * within each pass, samples are drawn in random order without
        replacement, avoiding periodic weight oscillation.

    Bug fixes versus the original listing: the prediction ``h`` and the
    weight update now use the SAME randomly chosen sample (the original
    predicted with ``dataMatrix[i]`` but updated with
    ``dataMatrix[randIndex]``), and the chosen index is actually removed
    from the candidate pool (the ``del`` was commented out, and a bare
    ``range`` cannot be deleted from in Python 3 anyway).

    Args:
        dataMatrix: m x n numpy array of samples.
        classLabels: length-m sequence of 0/1 labels.
        numIter: number of full passes over the data.

    Returns:
        length-n numpy array of fitted weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))  # candidate sample pool for this pass
        for i in range(m):
            # Decaying step size; the additive constant keeps it > 0.
            alpha = 4 / (1.0 + j + i) + 0.01
            # Pick a random position in the *remaining* pool.
            randPos = int(random.uniform(0, len(dataIndex)))
            sampleIdx = dataIndex[randPos]
            h = sigmoid(sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            del dataIndex[randPos]  # sample without replacement
    return weights
def plotBestFit(weights):
    """Scatter the two classes of testSet.txt and draw the decision line.

    The boundary is where w0 + w1*x1 + w2*x2 = 0, i.e.
    x2 = -(w0 + w1*x1) / w2.
    """
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    positiveX, positiveY = [], []
    negativeX, negativeY = [], []
    for i in range(shape(dataArr)[0]):
        x1, x2 = dataArr[i, 1], dataArr[i, 2]
        if int(labelMat[i]) == 1:
            positiveX.append(x1)
            positiveY.append(x2)
        else:
            negativeX.append(x1)
            negativeY.append(x2)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(positiveX, positiveY, s=30, c='red', marker='s')
    ax.scatter(negativeX, negativeY, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def main():
    """Train on testSet.txt with the improved SGD variant and plot the fit."""
    dataArr, labelArr = loadDataSet()
    # Alternative (full-batch gradient ascent):
    #   weights = gradAscent(dataArr, labelArr)
    #   plotBestFit(weights.getA())
    weights = stocGradAscent1(array(dataArr), labelArr)
    plotBestFit(weights)


if __name__ == "__main__":
    main()
1.2 结果展示
回归梯度上升优化算法:
随机梯度上升算法:
改进的随机梯度上升算法:
二、使用Logistic回归估计马疝病的死亡率
2.1 程序示例及注释
# coding: utf-8
from numpy import *
import matplotlib.pyplot as plt
import sys
def loadDataSet():
    """Load the 2-D test set from ``testSet.txt``.

    Each line holds two feature values and an integer class label,
    whitespace-separated. A constant 1.0 is prepended to every sample so
    the intercept can be learned as weights[0].

    Returns:
        (dataMat, labelMat): list of ``[1.0, x1, x2]`` rows and the
        parallel list of int labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open("testSet.txt") as fr:
        for line in fr:
            lineArr = line.strip().split()
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def sigmoid(inX):
    """Logistic function: map a real scalar (or numpy array) into (0, 1)."""
    denominator = 1 + exp(-inX)
    return 1.0 / denominator
def gradAscent(dataMatIn, classLabels):
    """Batch gradient ascent for logistic regression.

    Args:
        dataMatIn: m x n samples (first column is the 1.0 bias term).
        classLabels: length-m sequence of 0/1 labels.

    Returns:
        n x 1 numpy matrix of fitted weights.
    """
    X = mat(dataMatIn)
    y = mat(classLabels).transpose()  # labels as a column vector
    m, n = shape(X)
    stepSize = 0.001       # learning rate
    numIterations = 500    # fixed number of full-batch updates
    w = ones((n, 1))       # start from all-ones weights
    for _ in range(numIterations):
        predictions = sigmoid(X * w)
        residual = y - predictions
        # Gradient of the log-likelihood over the whole batch: X^T (y - h)
        w = w + stepSize * X.transpose() * residual
    return w
def stocGradAscent0(dataMatrix, classLabels):
    """One-pass stochastic gradient ascent with a fixed step size.

    Visits each sample exactly once, in order, applying a single
    gradient update per sample. Expects ``dataMatrix`` as a numpy
    array; returns a length-n numpy array of weights.
    """
    numSamples, numFeatures = shape(dataMatrix)
    stepSize = 0.01
    w = ones(numFeatures)
    for idx in range(numSamples):
        prediction = sigmoid(sum(dataMatrix[idx] * w))
        residual = classLabels[idx] - prediction
        w = w + stepSize * residual * dataMatrix[idx]
    return w
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    Improvements over stocGradAscent0:
      * the step size alpha decays over time, damping oscillation,
        while the 0.01 floor keeps late updates from vanishing;
      * within each pass, samples are drawn in random order without
        replacement, avoiding periodic weight oscillation.

    Bug fixes versus the original listing: the prediction ``h`` and the
    weight update now use the SAME randomly chosen sample (the original
    predicted with ``dataMatrix[i]`` but updated with
    ``dataMatrix[randIndex]``), and the chosen index is actually removed
    from the candidate pool (the ``del`` was commented out, and a bare
    ``range`` cannot be deleted from in Python 3 anyway).

    Args:
        dataMatrix: m x n numpy array of samples.
        classLabels: length-m sequence of 0/1 labels.
        numIter: number of full passes over the data.

    Returns:
        length-n numpy array of fitted weights.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))  # candidate sample pool for this pass
        for i in range(m):
            # Decaying step size; the additive constant keeps it > 0.
            alpha = 4 / (1.0 + j + i) + 0.01
            # Pick a random position in the *remaining* pool.
            randPos = int(random.uniform(0, len(dataIndex)))
            sampleIdx = dataIndex[randPos]
            h = sigmoid(sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            del dataIndex[randPos]  # sample without replacement
    return weights
def plotBestFit(weights):
    """Scatter the two classes of testSet.txt and draw the decision line.

    The boundary is where w0 + w1*x1 + w2*x2 = 0, i.e.
    x2 = -(w0 + w1*x1) / w2.
    """
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    positiveX, positiveY = [], []
    negativeX, negativeY = [], []
    for i in range(shape(dataArr)[0]):
        x1, x2 = dataArr[i, 1], dataArr[i, 2]
        if int(labelMat[i]) == 1:
            positiveX.append(x1)
            positiveY.append(x2)
        else:
            negativeX.append(x1)
            negativeY.append(x2)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(positiveX, positiveY, s=30, c='red', marker='s')
    ax.scatter(negativeX, negativeY, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def classifyVector(inX, weights):
    """Classify one feature vector: 1.0 if sigmoid(w . x) > 0.5, else 0.0."""
    prob = sigmoid(sum(inX * weights))
    return 1.0 if prob > 0.5 else 0.0
def colicTest():
    """Train on the horse-colic training file and score the test file.

    Each line of both files is 21 tab-separated features followed by the
    class label in column index 21.

    Bug fix versus the original listing: training labels are read from
    column 21 (the actual label column, the same one the test loop
    compares against), not column 2, which is an ordinary feature.
    Files are opened with ``with`` so the handles are always closed.

    Returns:
        The error rate (fraction of misclassified test vectors).
    """
    trainingSet = []
    trainingLabels = []
    with open("horseColicTraining.txt") as frTrain:
        for line in frTrain:
            currLine = line.strip().split('\t')
            lineArr = [float(currLine[i]) for i in range(21)]
            trainingSet.append(lineArr)
            trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0
    numTestVec = 0.0
    with open("horseColicTest.txt") as frTest:
        for line in frTest:
            numTestVec += 1.0
            currLine = line.strip().split('\t')
            lineArr = [float(currLine[i]) for i in range(21)]
            if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    # Parenthesized single-argument print behaves identically on
    # Python 2 and Python 3.
    print("The error rate of this test is: %f" % errorRate)
    return errorRate
def multiTest():
    """Run colicTest 10 times and report the average error rate.

    Repetition matters because stocGradAscent1 selects samples randomly,
    so individual runs vary.
    """
    numTests = 10
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    # Parenthesized single-argument print behaves identically on
    # Python 2 and Python 3 (the original used the py2-only statement).
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))
def main():
    """Entry point: evaluate logistic regression on the horse-colic data."""
    dataArr, labelArr = loadDataSet()
    # Earlier experiments, kept for reference:
    #   weights = gradAscent(dataArr, labelArr)
    #   plotBestFit(weights.getA())
    # or:
    #   weights = stocGradAscent1(array(dataArr), labelArr)
    #   plotBestFit(weights)
    multiTest()


if __name__ == "__main__":
    main()
2.2 结果展示
由于有缺失数据,错误率有将近30%,不是很准确,可以修改参数来提高准确率,但提高的不是很多。