The basic idea of logistic regression
Logistic regression is a classification method for two-class (binary) problems. Its basic idea is:
a. Find a suitable hypothesis function, i.e. the classification function, used to predict the outcome for a given input;
b. Construct a cost function, i.e. the loss function, which measures the deviation between the predicted output and the actual class labels of the training data;
c. Minimize the cost function to obtain the optimal model parameters (the standard formulation is sketched right after this list).
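Concretely, the standard choices behind these three steps, which the code below implements, are the sigmoid hypothesis, the log-likelihood objective (its negation serves as the cost), and gradient ascent on that objective:

$$h_w(x) = \sigma(w^\top x) = \frac{1}{1 + e^{-w^\top x}}$$

$$\ell(w) = \sum_{i=1}^{m}\left[\,y_i \log h_w(x_i) + (1 - y_i)\log\bigl(1 - h_w(x_i)\bigr)\right]$$

$$w \leftarrow w + \alpha\,\nabla_w \ell(w) = w + \alpha\,X^\top\bigl(y - h_w(X)\bigr)$$

The vectorized update in the last line is exactly the weights = weights + alpha * dataMatrix.transpose() * error step in gradAscent below, where error is y − h_w(X).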
import numpy
from numpy import *
import matplotlib.pyplot as plt
import random

def loadDataSet(filename):
    fr = open(filename)
    dataMat = []
    labelMat = []
    for line in fr.readlines():
        lineArr = line.strip().split()
        # prepend the constant feature x0 = 1.0, then the two real features
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

# sigmoid function (a smooth alternative to the unit step)
def sigmoid(inX):
    return 1.0 / (1 + numpy.exp(-inX))

# logistic regression classifier trained by batch gradient ascent
def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)
    labelMatrix = mat(classLabels).transpose()
    m, n = shape(dataMatrix)
    alpha = 0.001  # step size
    maxCycles = 500
    weights = ones((n, 1))
    # apply maxCycles gradient-ascent updates to the regression coefficients
    for i in range(maxCycles):
        h = sigmoid(dataMatrix * weights)
        error = labelMatrix - h
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights

# analyze the data: plot the points and the decision boundary
def plotBestFit(weights):
    dataMat, labelMat = loadDataSet('test.txt')
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # best-fit line: solve w0 + w1*x1 + w2*x2 = 0 for x2
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()

# stochastic gradient ascent: a single pass, one sample per update
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01  # step size
    weights = ones(n)
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights

# improved stochastic gradient ascent: annealed step size and
# random sampling without replacement within each pass
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))  # reset the candidate pool on every pass
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.1  # alpha is adjusted on every update
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]  # index into the data, not the shrinking pool
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del dataIndex[randIndex]
    return weights

if __name__ == '__main__':
    dataArr, labelMat = loadDataSet('test.txt')
    weights = stocGradAscent1(array(dataArr), labelMat)
    # weights = array(gradAscent(dataArr, labelMat)).ravel()  # gradAscent returns an (n,1) matrix; flatten it for plotting
    plotBestFit(weights)
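The straight line drawn by plotBestFit is the decision boundary: the set of points where the sigmoid argument is zero, so the predicted probability is exactly 0.5. Since loadDataSet prepends the constant feature x_0 = 1, solving for x_2 gives

$$0 = w_0 + w_1 x_1 + w_2 x_2 \;\Longrightarrow\; x_2 = \frac{-w_0 - w_1 x_1}{w_2}$$

which is precisely the y = (-weights[0] - weights[1] * x) / weights[2] line in the plotting code.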
Application: predicting the mortality of horses with colic. The classifier is trained on horseColicTraining.txt and evaluated on horseColicTest.txt; each line holds 21 tab-separated features followed by a 0/1 label.
import numpy
from numpy import *
import random

# sigmoid function
def sigmoid(inX):
    return 1.0 / (1 + numpy.exp(-inX))

# classify by thresholding the predicted probability at 0.5
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0
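On the colic data the features are not normalized, so the sigmoid argument can be a large negative number and numpy.exp(-inX) may emit overflow warnings (the result still saturates to 0.0, so classification is unaffected). If the warnings matter, a minimal sketch of a clipped variant, assuming a clip range of ±500 is acceptable:

# hypothetical clipped variant; sigmoid is numerically 0 or 1 far outside this band anyway
def sigmoidStable(inX):
    return 1.0 / (1 + numpy.exp(-numpy.clip(inX, -500, 500)))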
# improved stochastic gradient ascent (same fixes as in the listing above)
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))  # reset the candidate pool on every pass
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.1  # alpha is adjusted on every update
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]  # sample without replacement within a pass
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del dataIndex[randIndex]
    return weights
# train on the training file, then return the error rate on the test file
def colicTest():
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        curLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(curLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(curLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0
    numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        curLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(curLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(curLine[21]):
            errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("error rate:", errorRate)
    return errorRate
# average the error rate over several independent runs
def multiTest():
    numTests = 10
    errorSum = 0.0
    for i in range(numTests):
        errorSum += colicTest()
    print("after %d runs, the average error rate is %f" % (numTests, errorSum / float(numTests)))

if __name__ == '__main__':
    multiTest()
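Because stocGradAscent1 draws training samples at random, every colicTest run yields slightly different weights and a slightly different error rate; multiTest averages ten runs to obtain a stabler estimate. For repeatable experiments one could seed the random module before calling it, e.g. (the seed value here is an arbitrary choice):

random.seed(42)  # any fixed seed makes the sampled indices repeatable
multiTest()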