逻辑回归数学理论推导:
从疝气病症预测病马死亡率:
import numpy as np
import random
# Load the data set
def loadDataSet(filename):
    """Load a whitespace-separated data file.

    Each line holds the feature values followed by the class label in
    the last column.

    Args:
        filename: path to the data file.

    Returns:
        (dataSet, labelSet): list of float feature rows and the
        corresponding list of float labels.
    """
    dataSet = []
    labelSet = []
    # 'with' guarantees the handle is closed; the original leaked it
    with open(filename) as fr:
        for line in fr:
            currLine = line.strip().split()
            if not currLine:
                continue  # tolerate blank/trailing lines
            dataSet.append([float(v) for v in currLine[:-1]])
            labelSet.append(float(currLine[-1]))
    return dataSet, labelSet
def sigmoid(inX):
#return np.longfloat(1.0 / (1.0 + np.exp(-inX)))
return 1.0/(1.0+np.exp(-inX))
# Stochastic gradient ascent: compute the best regression coefficients
def stocGradAscent(dataMatrix, classLabels, numIter=150):
    """Fit logistic-regression weights by stochastic gradient ascent.

    Each pass draws samples without replacement from the rows not yet
    used in that pass.

    Args:
        dataMatrix: 2-D sequence of feature rows.
        classLabels: sequence of 0.0/1.0 labels, one per row.
        numIter: number of full passes over the data (default 150).

    Returns:
        numpy array of fitted weights (one per feature).

    Fixes vs the original:
    - dataIndex must be a list: ``del`` on a ``range`` object raises
      TypeError on Python 3.
    - the sampled row is ``dataMat[dataIndex[randIndex]]``; the
      original indexed ``dataMat[randIndex]`` directly, so deleting
      from dataIndex never changed which rows could be drawn.
    """
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    dataMat = np.array(dataMatrix)
    for j in range(numIter):
        dataIndex = list(range(m))  # rows not yet used in this pass
        for i in range(m):
            # step size decays over iterations but never reaches 0
            alpha = 4 / (1.0 + j + i) + 0.01
            randIndex = int(random.uniform(0, len(dataIndex)))
            sample = dataIndex[randIndex]
            h = sigmoid(sum(dataMat[sample] * weights))
            error = classLabels[sample] - h
            weights = weights + alpha * error * dataMat[sample]
            del dataIndex[randIndex]  # sample without replacement
    return weights
# Error rate for predicting horse mortality from colic symptoms
def colicTest():
    """Train on the horse-colic training file and report the error
    rate on the test file.

    Reads 'horseColicTraining.txt' / 'horseColicTest.txt' from the
    working directory.

    Returns:
        float: misclassification rate on the test set.
    """
    trainSet, trainLabels = loadDataSet('horseColicTraining.txt')
    trainWeights = stocGradAscent(trainSet, trainLabels, 500)
    testSet, testLabel = loadDataSet('horseColicTest.txt')
    dataArr = np.array(testSet)
    m, n = np.shape(dataArr)
    errorCount = 0
    for i in range(m):
        if int(classifyVector(dataArr[i], trainWeights)) != int(testLabel[i]):
            errorCount += 1
    # m is the test-set size; guard an empty file instead of dividing by 0
    errorRate = float(errorCount) / m if m else 0.0
    # print() form works on both Python 2 and 3 (original used the
    # py2-only print statement)
    print("测试集的错误率为 %f" % errorRate)
    return errorRate
# Average the error rate over several runs
def multiTest(numTests=10):
    """Run colicTest repeatedly and print the mean error rate.

    Args:
        numTests: number of repetitions (default 10, matching the
            original hard-coded value).
    """
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    # print() form works on both Python 2 and 3
    print("%d 次测试结果的平均错误率:%f" % (numTests, errorSum / float(numTests)))
# Classify a sample with the logistic-regression model
def classifyVector(inX, weights):
    """Return the predicted class (1.0 or 0.0) for one sample.

    inX should be a numpy array of features; weights are the trained
    regression coefficients.  The sample is positive when the modeled
    probability exceeds 0.5.
    """
    prob = sigmoid(sum(inX * weights))
    return 1.0 if prob > 0.5 else 0.0
# Script entry point: estimate the average test error
# (removed the commented-out scratch code that was here)
if __name__ == "__main__":
    multiTest()
计算F1值
|  |  | 预测值 | 预测值 |
| --- | --- | --- | --- |
|  |  | 正例 | 负例 |
| 真实值 | 正例 | 真正例(TP) | 假负例(FN) |
| 真实值 | 负例 | 假正例(FP) | 真负例(TN) |
准确率:Accuracy = (TP+TN)/(TP+FN+FP+TN)
召回率:Recall = TP/(TP+FN)
精准率:Precision = TP/(TP+FP)
F1为召回率与精准率的调和平均值
F1 = 2*Recall*Precision / (Recall+Precision)
# Compute the regression coefficients
def calcWeights():
    """Load the training file and fit logistic-regression weights.

    Reads 'Datas/ch05/horseColicTraining.txt' (tab-separated, label in
    the last column).

    Returns:
        numpy array of fitted regression coefficients.
    """
    trainSet = []
    trainLabels = []
    # 'with' closes the handle; the original opened the file twice
    # (once just to count columns) and never closed either handle
    with open('Datas/ch05/horseColicTraining.txt') as frTrain:
        for line in frTrain:
            curline = line.strip().split('\t')
            trainSet.append([float(v) for v in curline[:-1]])
            trainLabels.append(float(curline[-1]))
    # BUG FIX: the original called the undefined name stocGradAscent1,
    # which raises NameError; the trainer in this file is stocGradAscent
    trainWeights = stocGradAscent(np.array(trainSet), trainLabels, 500)
    return trainWeights
# Predict the test data and score the result
def calcResult(trainWeights):
    """Predict the test set and compute the F1 score.

    Reads 'Datas/ch05/horseColicTest.txt' (tab-separated, label in the
    last column).

    Args:
        trainWeights: regression coefficients from calcWeights().

    Returns:
        (predictLabel, F1): list of predicted labels (1.0/0.0) and the
        F1 score of the positive class.
    """
    testData = []
    testLabel = []
    # 'with' closes the handle; the original opened the file twice and
    # never closed either handle
    with open('Datas/ch05/horseColicTest.txt') as frTest:
        for line in frTest:
            curline = line.strip().split('\t')
            testData.append([float(v) for v in curline[:-1]])
            testLabel.append(float(curline[-1]))
    predictLabel = [classifyVector(np.array(row), trainWeights)
                    for row in testData]
    # Confusion-matrix counts (positive class = 1.0)
    TP = FN = FP = TN = 0.0
    for actual, predicted in zip(testLabel, predictLabel):
        if actual == 1.0:
            if predicted == 1.0:
                TP += 1
            else:
                FN += 1
        elif predicted == 0.0:
            TN += 1
        else:
            FP += 1
    # Guard empty denominators (e.g. no positive labels or predictions)
    # instead of raising ZeroDivisionError
    Recall = TP / (TP + FN) if TP + FN else 0.0
    Precision = TP / (TP + FP) if TP + FP else 0.0
    F1 = (2 * Precision * Recall / (Precision + Recall)
          if Precision + Recall else 0.0)
    return predictLabel, F1
# Script section: train, evaluate, and report the F1 score
trainWeights = calcWeights()
predictLabels, F1 = calcResult(trainWeights)
print(F1)  # py3-compatible print (original used the py2-only statement)