Following the presentation in Machine Learning in Action (《机器学习实战》), the training set consists of 299 records of horses with colic. Each record has 22 tab-separated fields: the first 21 are the horse's features, and the 22nd says whether the horse ultimately survived.
The test set has 67 records; averaged over 10 runs, the error rate comes out around 38%.
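The listings below assume NumPy's star import and a sigmoid() helper, both of which the book defines earlier in its logistic regression code; a minimal sketch:

from numpy import *

def sigmoid(inX):
    # logistic function: a smooth approximation of the unit step function
    return 1.0 / (1.0 + exp(-inX))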
def stocGradAscent1(dataMatrix, classLabels, numIter=150):  # stochastic gradient ascent
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))  # list(), so del works under Python 3
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01  # step size decays with iterations but never reaches 0
            randIndex = int(random.uniform(0, len(dataIndex)))
            sample = dataIndex[randIndex]  # index through dataIndex so each record is used once per pass (the book's listing indexes randIndex directly)
            h = sigmoid(sum(dataMatrix[sample] * weights))
            error = classLabels[sample] - h
            weights = weights + alpha * error * dataMatrix[sample]
            del(dataIndex[randIndex])
    return weights
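As a quick sanity check (not from the book; all data here is synthetic), the routine can be run on fabricated linearly separable points, and the learned weights should roughly line up with the generating direction:

random.seed(0)
X = random.randn(200, 3)                                 # 200 fake samples, 3 features
y = (X.dot(array([2.0, -1.0, 0.5])) > 0).astype(float)   # labels from a known direction
print(stocGradAscent1(X, y, numIter=50))                 # roughly proportional to [2, -1, 0.5]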
def classifyVector(inX, weights):  # predict survival from the learned weight vector
    prob = sigmoid(sum(inX * weights))  # estimated probability that the label is 1
    if prob > 0.5:
        return 1.0
    else:
        return 0.0
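For example (toy numbers), classifyVector(array([1.0, -2.0]), array([0.5, 0.5])) computes sigmoid(-0.5) ≈ 0.38 and therefore returns 0.0.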
def colicTest():  # train on the training file, then measure the error rate on the test file
    frTrain = open("horseColicTraining.txt")
    frTest = open("horseColicTest.txt")
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):  # first 21 columns: features
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))  # column 22: survival label
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(float(currLine[21])):
            errorCount += 1
    errorRate = errorCount / numTestVec  # numTestVec is a float, so this is true division
    print("the error rate of this test is: {0}".format(errorRate))
    return errorRate
def multiTest():  # average over several runs, since stochastic gradient ascent gives a different result each time
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is %f" % (numTests, errorSum / float(numTests)))

if __name__ == "__main__":
    multiTest()
The final screenshot of the run:
Summary: the most important function in logistic regression is the one that iteratively solves for the weight vector weights. For the gradient we lean on the sigmoid function, a smooth approximation of the unit step function, and update the weights by gradient ascent: w = w + α·∇f(w), where α is the step size.
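To make one update concrete (all numbers invented for illustration): with sample x = [1, 2], label y = 1, weights w = [0, 0], and α = 0.1, we get h = sigmoid(0) = 0.5, error = y − h = 0.5, and the new weights are w + α·error·x = [0.05, 0.10].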