logistic回归
import feedparser
from numpy import *
def loadDateSet():
    """Load the 2-D logistic-regression test set from testSet.txt.

    Each line of the file is 'x1<ws>x2<ws>label'.

    Returns:
        dataMat  -- list of [1.0, x1, x2]; the leading 1.0 is the bias
                    feature x0 = 1 so the intercept folds into the weights
                    (the original comment wrongly said x0 = 0)
        labelMat -- list of int class labels (0 or 1)
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file handle is closed; the original leaked it
    with open('F:\\learn\\machinelearninginaction\\Ch05\\testSet.txt') as fr:
        for line in fr:
            # strip leading/trailing whitespace, then split on whitespace
            lineArr = line.strip().split()
            # first two fields are the features x1, x2; prepend bias x0 = 1.0
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            # third field is the class label
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def sigmid(inx):
    """Sigmoid function: maps any real input into (0, 1).

    Works element-wise on numpy arrays/matrices as well as on scalars.
    NOTE(review): keeps the original's 'sigmid' spelling because the
    rest of the file calls it by that name.
    """
    denom = 1.0 + exp(-inx)
    return 1.0 / denom
def gradAscent(dataMat, classLabelMat):
    """Train logistic-regression weights with batch gradient ascent.

    dataMat       -- 2-D list of samples, each row [1.0, x1, x2]
    classLabelMat -- list of 0/1 labels, one per sample
    Returns an (n x 1) numpy matrix of weights after 500 iterations.
    """
    # m x n matrix of samples, and the labels reshaped into an m x 1 column
    samples = mat(dataMat)
    labels = mat(classLabelMat).transpose()
    m, n = shape(samples)
    step = 0.001      # learning rate / step size
    cycles = 500      # fixed iteration budget
    w = ones((n, 1))  # weights start at all ones (n x 1)
    for _ in range(cycles):
        # predicted probability for every sample at once (m x 1 column)
        predictions = sigmid(samples * w)
        # residual between true labels and predictions
        residual = labels - predictions
        # gradient of the log-likelihood is X^T (y - h); step along it
        w = w + step * samples.transpose() * residual
    return w
def stocGradAscent0(dataMatrix, classLabels):
    """Plain stochastic gradient ascent: a single in-order sweep.

    dataMatrix  -- 2-D list/array of samples, each row [1.0, x1, ...]
    classLabels -- list of 0/1 labels, one per sample
    Returns a length-n numpy array of weights after one pass.
    """
    dataMatrix = array(dataMatrix)
    n = shape(dataMatrix)[1]
    alpha = 0.01            # fixed step size
    weights = ones(n)
    # visit every sample once, in file order, updating after each one
    for sample, label in zip(dataMatrix, classLabels):
        # scalar sigmoid of this sample's weighted feature sum
        h = sigmid(sum(sample * weights))
        err = label - h
        weights = weights + alpha * err * sample
    return weights
# Improved stochastic gradient ascent (random visit order, decaying alpha)
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent for logistic regression.

    dataMatrix  -- 2-D list/array of samples, each row [1.0, x1, ...]
    classLabels -- list of 0/1 labels, one per sample
    numIter     -- number of full passes over the data (default 150)
    Returns a length-n numpy array of learned weights.

    Improvements over stocGradAscent0: alpha decays with the iteration
    count (never reaching 0 thanks to the +0.0001 term), and each pass
    visits every sample exactly once in random order.

    BUGFIX: the original drew a random *position* in the shrinking
    dataIndex list but used it directly as a row index into dataMatrix,
    so draws were biased toward low-numbered rows and were not truly
    without replacement. We now map the position through dataIndex.
    """
    dataMatrix = array(dataMatrix)
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))  # samples not yet visited this pass
        for i in range(m):
            # step size shrinks as training progresses
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick a position among the remaining samples ...
            randPos = int(random.uniform(0, len(dataIndex)))
            # ... and map it to the actual sample index
            randIndex = dataIndex[randPos]
            # sigmoid of the sample's weighted feature sum (inlined)
            h = 1.0 / (1.0 + exp(-sum(dataMatrix[randIndex] * weights)))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del dataIndex[randPos]  # drawn without replacement
    return weights
def plotBestFit(weights):
    """Scatter the two classes of the test set and draw the decision line.

    weights -- indexable of three regression coefficients [w0, w1, w2].
    Displays the figure with matplotlib; returns nothing.
    """
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDateSet()
    dataArr = array(dataMat)
    num_points = shape(dataArr)[0]
    # collect (x1, x2) coordinates separately for each class
    xcord1, ycord1 = [], []  # class 1
    xcord2, ycord2 = [], []  # class 0
    for idx in range(num_points):
        if int(labelMat[idx]) == 1:
            xcord1.append(dataArr[idx, 1])
            ycord1.append(dataArr[idx, 2])
        else:
            xcord2.append(dataArr[idx, 1])
            ycord2.append(dataArr[idx, 2])
    fig = plt.figure()
    # one row, one column, first (only) subplot
    ax = fig.add_subplot(111)
    # class-1 points as red squares, class-0 points as green dots
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    # decision boundary: w0 + w1*x1 + w2*x2 = 0  =>  x2 = (-w0 - w1*x1)/w2
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('x2')
    plt.show()
# Demo: load the test set, train weights with the improved stochastic
# gradient ascent, and plot the resulting decision boundary.
# NOTE(review): runs at import time — there is no __main__ guard.
dataArr,labelMat=loadDateSet()
weights=stocGradAscent1(dataArr,labelMat)
plotBestFit(weights)
值得注意的是 plotBestFit 中计算 y 的那一行。有了最佳回归系数之后，就可以列出最佳拟合直线（即决策边界）的方程。根据 sigmoid 函数的性质，输入为 0 时输出恰为 0.5，正是两个分类（0 和 1）的分界点，所以令 sigmoid 函数的输入为 0，就得到了分界线的方程：
w0*x0 + w1*x1 + w2*x2 = 0
其中 x0 = 1，解出 x2 = (-w0 - w1*x1) / w2
使用logistic回归预测病马的死亡率
import feedparser
from numpy import *
def sigmid(inx):
    """Logistic (sigmoid) squashing function, element-wise on arrays.

    NOTE(review): keeps the original's 'sigmid' spelling because the
    rest of the file calls it by that name.
    """
    return 1.0 / (exp(-inx) + 1.0)
def classifyVector(inx, weights):
    """Classify one sample: 1.0 if sigmoid(w . x) > 0.5, else 0.0.

    inx     -- feature vector (numpy array or list of floats)
    weights -- learned weight vector of matching length
    """
    prob = sigmid(sum(inx * weights))
    return 1.0 if prob > 0.5 else 0.0
# Improved stochastic gradient ascent (random visit order, decaying alpha)
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent for logistic regression.

    dataMatrix  -- 2-D list/array of samples, each row [1.0, x1, ...]
    classLabels -- list of 0/1 labels, one per sample
    numIter     -- number of full passes over the data (default 150)
    Returns a length-n numpy array of learned weights.

    BUGFIX: the original drew a random *position* in the shrinking
    dataIndex list but used it directly as a row index into dataMatrix,
    so draws were biased toward low-numbered rows and were not truly
    without replacement. We now map the position through dataIndex.
    """
    dataMatrix = array(dataMatrix)
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))  # samples not yet visited this pass
        for i in range(m):
            # alpha decays over time but the +0.0001 keeps it above zero
            alpha = 4 / (1.0 + j + i) + 0.0001
            # random position among remaining samples -> real sample index
            randPos = int(random.uniform(0, len(dataIndex)))
            randIndex = dataIndex[randPos]
            # sigmoid of the sample's weighted feature sum (inlined)
            h = 1.0 / (1.0 + exp(-sum(dataMatrix[randIndex] * weights)))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del dataIndex[randPos]  # drawn without replacement
    return weights
def colicTest():
    """Train on the horse-colic training file, report test error rate.

    Each file line holds 21 tab-separated features followed by a label.
    Trains with stocGradAscent1 (500 passes), then classifies each test
    line with classifyVector.

    Returns the error rate (float) on the test file.
    """
    trainingSet = []
    trainingLabels = []
    # 'with' closes the handle even on error; the original leaked both files
    with open('F:\\learn\\machinelearninginaction\\Ch05\\horseColicTraining.txt') as frTrain:
        for line in frTrain:
            currLine = line.strip().split('\t')
            # first 21 fields are features, field 21 is the label
            lineArr = [float(currLine[i]) for i in range(21)]
            trainingSet.append(lineArr)
            trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(trainingSet, trainingLabels, 500)
    errorCount = 0
    numTestVec = 0.0
    with open('F:\\learn\\machinelearninginaction\\Ch05\\horseColicTest.txt') as frTest:
        for line in frTest:
            numTestVec += 1.0
            currLine = line.strip().split('\t')
            lineArr = [float(currLine[i]) for i in range(21)]
            if int(classifyVector(lineArr, trainWeights)) != int(currLine[21]):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print('错误率是:%f' % errorRate)
    return errorRate
def multiTest():
    """Run colicTest ten times and print the average error rate."""
    numTests = 10
    # accumulate error rates over repeated independent train/test runs
    errorSum = sum(colicTest() for _ in range(numTests))
    print('在%d的迭代过后平均错误率等于%f'%(numTests,errorSum/float(numTests)))
# Entry point: run the horse-colic experiment and print the mean error
# rate (executes on import too, since there is no __main__ guard).
multiTest()