#coding=utf-8
from numpy import *
#----------加载数据-------------
def loadDataSet():
    """Load the sample file 'testSet.txt'.

    Each line holds two whitespace-separated float features followed by an
    integer class label. A constant 1.0 is prepended to every feature row so
    the intercept can be learned as weights[0].

    Returns:
        (dataMat, labelMat): list of [1.0, x1, x2] feature rows and the
        matching list of int labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the file handle is closed (original leaked it).
    with open('testSet.txt') as fr:
        for line in fr:
            lineArr = line.strip().split()  # strip leading/trailing whitespace, split on blanks
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
#----------sigmod函数----------
def sigmod(inX):
    """Sigmoid activation: maps a real scalar or numpy array elementwise
    into the open interval (0, 1)."""
    denom = 1.0 + exp(-inX)
    return 1.0 / denom
#---------梯度上升算法------------
def gradAscent(dataMatIn, classLabels):
    """Batch gradient ascent for logistic regression.

    Args:
        dataMatIn: 2-D list/array of samples, one row per sample.
        classLabels: flat sequence of 0/1 labels.

    Returns:
        (n, 1) numpy matrix of fitted regression coefficients.
    """
    X = mat(dataMatIn)                   # m x n numpy matrix of samples
    y = mat(classLabels).transpose()     # labels as a column vector
    m, n = shape(X)
    alpha = 0.001                        # step size
    maxCycles = 500                      # fixed iteration count
    weights = ones((n, 1))               # coefficients start at all-ones
    # Update rule: w := w + alpha * X^T * (y - sigmod(X * w))
    for _ in range(maxCycles):
        predictions = sigmod(X * weights)      # predicted probabilities
        residual = y - predictions             # true label minus prediction
        weights = weights + alpha * X.transpose() * residual
    return weights
#------------画出数据集和Logistic回归最佳拟合直线函数---------
def plotBestFit(weights):
    """Scatter both classes of 'testSet.txt' and draw the decision boundary
    implied by the regression weights.

    The boundary is the line w0 + w1*x1 + w2*x2 = 0, i.e.
    x2 = (-w0 - w1*x1) / w2.
    """
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    # Split the sample coordinates by class label.
    xcord1 = []; ycord1 = []   # class 1 points
    xcord2 = []; ycord2 = []   # class 0 points
    for row, label in zip(dataArr, labelMat):
        if int(label) == 1:
            xcord1.append(row[1])
            ycord1.append(row[2])
        else:
            xcord2.append(row[1])
            ycord2.append(row[2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('x2')
    plt.show()
#随机梯度上升算法
def stocGradAscent0(dataMatrix, classLabel):
    """Stochastic gradient ascent: a single pass over the data, updating
    the coefficients from one sample at a time (online learning).

    Args:
        dataMatrix: numpy array of samples, one row per sample.
        classLabel: flat sequence of 0/1 labels.

    Returns:
        1-D numpy array of regression coefficients.
    """
    m, n = shape(dataMatrix)
    alpha = 0.01                 # constant step size
    weights = ones(n)
    for idx in range(m):
        # Predict from this single sample, then take a gradient step on it.
        prediction = sigmod(sum(dataMatrix[idx] * weights))
        delta = classLabel[idx] - prediction
        weights = weights + alpha * delta * dataMatrix[idx]
    return weights
#改进的随机梯度上升算法,避免系数发生来回波动
def stocGradAscent1(dataMatrix, classLabels, numIter = 150):
    """Improved stochastic gradient ascent.

    Two refinements over stocGradAscent0:
      * alpha decays with every update (but never below 0.01), which damps
        the oscillation of the coefficients as training progresses;
      * samples are visited in random order within each pass, which breaks
        up periodic fluctuations caused by a fixed visit order.

    Args:
        dataMatrix: numpy array of samples, one row per sample.
        classLabels: flat sequence of 0/1 labels.
        numIter: number of full passes over the data (default 150).

    Returns:
        1-D numpy array of regression coefficients.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        # Bug fixes vs. the original:
        #  * range(m) must be materialized as a list — 'del' on a range
        #    object raises TypeError on Python 3;
        #  * the randomly drawn position must be mapped through dataIndex
        #    to the actual sample index; the original used it directly, so
        #    deleting from dataIndex never actually prevented re-using a
        #    sample within one pass.
        dataIndex = list(range(m))
        for i in range(m):
            alpha = 4/(1.0 + j + i) + 0.01      # decaying, non-vanishing step size
            randPos = int(random.uniform(0, len(dataIndex)))
            sampleIdx = dataIndex[randPos]      # actual index of the chosen sample
            h = sigmod(sum(dataMatrix[sampleIdx] * weights))
            error = classLabels[sampleIdx] - h
            weights = weights + alpha * error * dataMatrix[sampleIdx]
            del dataIndex[randPos]              # each sample used once per pass
    return weights
#-----------病马生死预测--------------
#以回归系数和特征向量作为输入计算对应的sigmod值
def classifyVector(inX, weights):
    """Classify one feature vector with the trained coefficients.

    Returns 1.0 when sigmod(inX . weights) exceeds 0.5, otherwise 0.0.
    """
    prob = sigmod(sum(inX * weights))
    return 1.0 if prob > 0.5 else 0.0
#logsitic回归分类器训练函数与测试函数
def colicTest():
    """Train a logistic-regression classifier on the horse-colic training
    set and report its error rate on the test set.

    Each data line holds 21 tab-separated float features followed by the
    class label. Returns the error rate (float in [0, 1]) of one run.
    """
    def _parseLine(line):
        # Split one record into its 21 float features and the raw label field.
        currLine = line.strip().split('\t')
        return [float(currLine[i]) for i in range(21)], currLine[21]

    trainningSet = []
    trainningLabel = []
    # Bug fix: the original opened 'horseColicTest.txt' here as well, i.e.
    # it trained and tested on the same file. 'with' also closes the handles.
    with open('horseColicTraining.txt') as frTrain:
        for line in frTrain:
            features, label = _parseLine(line)
            trainningSet.append(features)
            trainningLabel.append(float(label))
    # Train with 500 passes of improved stochastic gradient ascent.
    trainWeights = stocGradAscent1(array(trainningSet), trainningLabel, 500)
    errorCount = 0
    numTestVec = 0.0
    with open('horseColicTest.txt') as frTest:
        for line in frTest:
            numTestVec += 1.0
            features, label = _parseLine(line)
            if int(classifyVector(array(features), trainWeights)) != int(label):
                errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print('the error rate of this test is: %f' % errorRate)
    return errorRate
#分类器测试函数,求10次测试结果的平均值
def multiTest():
    """Run colicTest() ten times and print the average error rate."""
    numTests = 10
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    # Python 3 print function; also fixes the 'eror' typo in the message.
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))
# 总结
# - Logistic回归的目的是寻找一个非线性函数sigmoid的最佳拟合参数,求解过程中可以由最优化算法来完成。在最优化算法中,最常用的就是梯度上升算法,而梯度上升算法又可以简化为随机梯度上升算法。
# - 随机梯度上升算法与梯度上升算法效果相当,但是占用更少的计算资源。此外,随机梯度上升是一个在线算法,它可以在新数据到来时完成参数更新,而不需要重新读取整个数据集来进行批处理。
# - 机器学习的一个重要问题就是如何处理缺失数据。这个问题没有标准答案,取决于实际应用中的需求。现有的解决方案中,每个方案都有优缺点。