Logistic回归数学公式推导及疝气病症预测病马死亡率

逻辑回归数学理论推导:

从疝气病症预测病马死亡率:

import numpy as np
import random

#Load a whitespace-delimited data file
def loadDataSet(filename):
    """Read a whitespace-delimited numeric file into features and labels.

    Every non-blank line is split on whitespace; all fields but the last
    are parsed as float features and the last field as the float label.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(dataSet, labelSet)`` where ``dataSet`` is a list of
        feature lists and ``labelSet`` is the list of labels, in file order.

    Raises:
        ValueError: if a field cannot be parsed as a float.
    """
    dataSet = []
    labelSet = []
    # The context manager guarantees the handle is closed even on a parse error.
    with open(filename) as fr:
        for line in fr:
            currLine = line.strip().split()
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the missing label column.
            if not currLine:
                continue
            values = [float(field) for field in currLine]
            dataSet.append(values[:-1])
            labelSet.append(values[-1])
    return dataSet,labelSet

def sigmoid(inX):
    """Return the logistic function 1 / (1 + exp(-inX)) without overflow.

    Args:
        inX: a scalar or array-like of real numbers.

    Returns:
        A NumPy float for scalar input, or an ndarray of the same shape
        for array input, with every value in [0, 1].
    """
    x = np.asarray(inX, dtype=float)
    # exp(-|x|) lies in (0, 1], so it can never overflow; the naive
    # exp(-x) overflows once x drops below roughly -709.
    z = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 it is the
    # algebraically equal exp(x) / (1 + exp(x)).
    result = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    if result.ndim == 0:
        # Unwrap the 0-d array so scalar callers still get a scalar back.
        return result[()]
    return result

#Stochastic gradient ascent for the best regression coefficients
def stocGradAscent(dataMatrix,classLabels,numIter=150):
    """Fit logistic-regression weights by stochastic gradient ascent.

    Each pass visits every training sample exactly once in random order,
    with a step size that decays as the pass and step counters grow.

    Args:
        dataMatrix: 2-D array-like of features, one row per sample.
        classLabels: sequence of numeric labels, one per row.
        numIter: number of passes over the data.

    Returns:
        A 1-D ndarray of weights, one per feature column.
    """
    dataMat = np.array(dataMatrix, dtype=float)
    m,n = np.shape(dataMat)
    weights = np.ones(n)
    for j in range(numIter):
        # A real list is required: a Python 3 range object is immutable,
        # so entries could not be removed from it.
        dataIndex = list(range(m))
        for i in range(m):
            # The step shrinks over time; the 0.01 floor keeps late
            # samples from being ignored entirely.
            alpha = 4/(1.0+j+i)+0.01
            # randrange never returns the upper bound, unlike
            # int(random.uniform(0, len)), which can hit it.
            pos = random.randrange(len(dataIndex))
            # Map the position back to the row that is still unused, and
            # drop it so it is not drawn again in this pass.
            sample = dataIndex.pop(pos)
            h = sigmoid(np.dot(dataMat[sample], weights))
            error = classLabels[sample] - h
            weights = weights + alpha*error*dataMat[sample]
    return weights

#Error rate of predicting horse death from colic symptoms
def colicTest():
    """Train on the colic training file and report the test error rate.

    Reads 'horseColicTraining.txt' and 'horseColicTest.txt' from the
    current directory, fits weights with 500 passes of stocGradAscent,
    classifies every test row, prints the error rate and returns it.

    Returns:
        The fraction of test rows whose predicted label differs from the
        recorded label.

    Raises:
        ValueError: if the test file contains no samples.
    """
    frTrain = 'horseColicTraining.txt'
    frTest = 'horseColicTest.txt'
    trainSet,trainLabels = loadDataSet(frTrain)
    trainWeights = stocGradAscent(trainSet,trainLabels,500)
    testSet,testLabel = loadDataSet(frTest)
    numTestVec = len(testSet)
    if numTestVec == 0:
        # An error rate over zero samples is undefined; fail clearly
        # rather than with an opaque unpacking or division error.
        raise ValueError("test set %r contains no samples" % frTest)
    errorCount = 0
    for features, label in zip(testSet, testLabel):
        # classifyVector expects a NumPy array for the feature vector.
        if int(classifyVector(np.array(features), trainWeights)) != int(label):
            errorCount += 1
    errorRate = float(errorCount)/numTestVec
    # The parenthesised single-argument form prints the same on Python 2
    # and is valid syntax on Python 3.
    print("测试集的错误率为 %f" % errorRate)
    return errorRate

#Average the error rate over several runs
def multiTest():
    """Run colicTest ten times and print the mean error rate.

    Training is randomised, so individual runs differ; the printed
    average smooths that variation out.
    """
    numTests = 10
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    # The parenthesised single-argument form prints the same on Python 2
    # and is valid syntax on Python 3.
    print("%d 次测试结果的平均错误率:%f" % (numTests,errorSum/float(numTests)))

#Classify one sample with the logistic-regression model
def classifyVector(inX,weights):
    """Return the predicted class (1.0 or 0.0) for one feature vector.

    Args:
        inX: feature vector; should be a NumPy array.
        weights: regression coefficients, one per feature.

    Returns:
        1.0 when the sigmoid of the weighted feature sum exceeds 0.5,
        otherwise 0.0.
    """
    weighted = inX*weights
    probability = sigmoid(sum(weighted))
    return 1.0 if probability > 0.5 else 0.0

# filename = 'horseColicTraining.txt'
# dataMatrix,classLabels = loadDataSet(filename)
# weights = stocGradAscent(dataMatrix,classLabels)
# print weights

# Run the repeated train/evaluate experiment only when executed as a script.
if __name__ == "__main__":
    multiTest()

 

计算F1值

| | 预测为正例 | 预测为负例 |
|---|---|---|
| 真实为正例 | 真正例(TP) | 假负例(FN) |
| 真实为负例 | 假正例(FP) | 真负例(TN) |

准确率:Accuracy = (TP+TN)/(TP+FN+FP+TN)

召回率:Recall = TP/(TP+FN)

精准率:Precision = TP/(TP+FP)

F1为召回率与精准率的调和平均值

F1 = 2*Recall*Precision / (Recall+Precision)

#Compute the regression coefficients
def calcWeights():
    """Fit logistic-regression weights on the colic training file.

    Reads 'Datas/ch05/horseColicTraining.txt' with loadDataSet and runs
    500 passes of stochastic gradient ascent over it.

    Returns:
        The weight vector returned by stocGradAscent.
    """
    # Reuse the shared loader so the file is read once, rather than
    # repeating the parsing loop here.
    trainSet,trainLabels = loadDataSet('Datas/ch05/horseColicTraining.txt')
    # stocGradAscent is the gradient-ascent routine defined in this file;
    # the previously referenced stocGradAscent1 is not defined here.
    trainWeights = stocGradAscent(np.array(trainSet),trainLabels,500)
    return trainWeights

#Predict the test data and score the predictions
def calcResult(trainWeights):
    """Classify the colic test file and compute the F1 score.

    Reads 'Datas/ch05/horseColicTest.txt' with loadDataSet, predicts a
    label for every row with classifyVector, and tallies the confusion
    matrix with 1.0 treated as the positive class.

    Args:
        trainWeights: regression coefficients, one per feature.

    Returns:
        A tuple ``(predictLabel, F1)``: the list of predicted labels in
        file order and the F1 score. Recall, precision and F1 are taken
        as 0.0 when their denominator is zero.
    """
    testData,testLabel = loadDataSet('Datas/ch05/horseColicTest.txt')
    # classifyVector expects a NumPy array for the feature vector.
    predictLabel = [classifyVector(np.array(row),trainWeights) for row in testData]
    #Tally TP, FN, FP, TN
    TP = FN = FP = TN = 0.0
    for actual, predicted in zip(testLabel, predictLabel):
        if actual == 1.0 and predicted == actual:
            TP += 1
        elif actual == 1.0 and predicted != actual:
            FN += 1
        elif actual == 0.0 and predicted == actual:
            TN += 1
        else:
            FP += 1
    # Guard each ratio: with no positive samples or no positive
    # predictions the denominator is zero and the metric is undefined.
    Recall = TP/(TP+FN) if (TP+FN) else 0.0
    Precision = TP/(TP+FP) if (TP+FP) else 0.0
    if Precision + Recall:
        F1 = 2*Precision*Recall / (Precision + Recall)
    else:
        F1 = 0.0
    return predictLabel,F1

# Train on the colic training set, then score the test set.
trainWeights = calcWeights()
predictLabels,F1 = calcResult(trainWeights)

# The parenthesised form prints the same on Python 2 and is valid on Python 3.
print(F1)

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值