The following code comes from Machine Learning in Action; detailed comments have been added after each line to aid understanding.
The goal of logistic regression is to find the best-fit parameters for the nonlinear sigmoid function; the fitting itself is carried out by an optimization algorithm. The most common choice is gradient ascent, which can in turn be simplified to stochastic gradient ascent.
Pros: computationally cheap, easy to understand and implement. Cons: prone to underfitting, so classification accuracy may be low.
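Concretely, the model estimates p(y=1 | x) = sigmoid(w0*x0 + w1*x1 + ... + wn*xn) and predicts class 1 whenever that probability exceeds 0.5; the optimization searches for the weight vector w that best fits the training labels.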
Load the data from a local file and return the feature matrix and the label vector:
from numpy import *

def loadDataSet():
    dataMat = []
    labelMat = []
    fr = open('./testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])  # x0 is fixed at 1.0 (bias term)
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
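The parsing above assumes each line of testSet.txt holds two feature values followed by a 0/1 label, separated by whitespace; something like the following (the numbers here are illustrative only):

    -0.017612  14.053064  0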
The sigmoid function:
# the sigmoid function
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))
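One practical note: for inputs far below zero, exp(-inX) overflows and NumPy prints a RuntimeWarning (the returned value still comes out as 0.0, so the algorithms below keep working). A minimal sketch of a clipped variant that avoids the warning; stableSigmoid and the clipping bound are my own additions, not from the book:

def stableSigmoid(inX):
    # clamp the argument so exp() stays within floating-point range
    return 1.0 / (1 + exp(-clip(inX, -500, 500)))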
The gradient ascent algorithm:
# batch gradient ascent
def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)                # feature matrix of the training set
    labelMat = mat(classLabels).transpose()    # true label vector y, as a column
    m, n = shape(dataMatrix)                   # m samples, n features
    alpha = 0.001                              # step size
    maxCycles = 500                            # maximum number of iterations
    weights = ones((n, 1))                     # initialize every weight to 1
    for k in range(maxCycles):
        h = sigmoid(dataMatrix * weights)      # predicted values h for all samples
        error = (labelMat - h)                 # gap between true labels and predictions
        weights = weights + alpha * dataMatrix.transpose() * error  # gradient ascent step
    return weights                             # the weights after 500 iterations
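Each cycle updates all weights at once via weights := weights + alpha * X^T * (y - sigmoid(X * weights)), a step along the gradient of the log-likelihood of the training data. A quick usage sketch (it assumes testSet.txt is in the working directory):

dataArr, labelMat = loadDataSet()
weights = gradAscent(dataArr, labelMat)
print(weights)   # a 3x1 numpy matrix: [w0, w1, w2]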
The stochastic gradient ascent algorithm:
# stochastic gradient ascent
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)        # m samples, n features
    alpha = 0.01
    weights = ones(n)               # initialize one weight per feature to 1
    for i in range(m):              # for each sample in turn
        h = sigmoid(sum(dataMatrix[i] * weights))   # h is now a single number
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
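Unlike gradAscent, this version touches one sample per update, so h and error are scalars rather than vectors and no matrix multiplication is needed. Note that it expects a NumPy array, not a plain list, so that dataMatrix[i] * weights multiplies elementwise:

dataArr, labelMat = loadDataSet()
weights = stocGradAscent0(array(dataArr), labelMat)   # pass an array, not a list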
The improved stochastic gradient ascent:
# improved stochastic gradient ascent
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)        # m samples, n features
    weights = ones(n)               # initialize one weight per feature to 1
    for j in range(numIter):        # full passes over the data (150 by default)
        dataIndex = list(range(m))  # indices of the samples not yet used in this pass
        for i in range(m):
            # alpha shrinks as j and i grow but never reaches 0
            alpha = 4 / (1.0 + j + i) + 0.01
            # pick a random position instead of taking the i-th sample in order
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]  # map the position to an actual sample index
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del dataIndex[randIndex]  # drop the index so the sample is not reused in this pass
    return weights
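The step size alpha = 4 / (1.0 + j + i) + 0.01 shrinks as training proceeds, which damps the oscillation of the weights, while the constant 0.01 keeps it from ever reaching zero, so later samples still have some influence. For instance, during the first pass (j = 0):

for i in range(5):
    print(4 / (1.0 + 0 + i) + 0.01)   # 4.01, 2.01, 1.343..., 1.01, 0.81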
The plotting function:
def plotBestFit1(wei):
    import matplotlib.pyplot as plt
    # wei should be a 1-D array (as returned by stocGradAscent0/1);
    # for the matrix returned by gradAscent, pass wei.getA().ravel() instead
    weights = wei
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # the decision boundary is where w0 + w1*x1 + w2*x2 = 0, so solve for x2
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
Use the improved stochastic gradient ascent to fit the best parameters first, then plot the result, as in the sketch below (it assumes testSet.txt sits in the working directory):
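dataArr, labelMat = loadDataSet()
weights = stocGradAscent1(array(dataArr), labelMat)  # convert to an array for elementwise math
plotBestFit1(weights)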
Example: predicting the mortality of horses with colic from their symptoms. The original dataset contains 368 samples with 28 features; the preprocessed training and test files used below keep 21 features plus a label per line.
The classifier function:
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0
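Since sigmoid crosses 0.5 exactly at input 0, prob > 0.5 is equivalent to sum(inX * weights) > 0: the decision boundary is the hyperplane where the weighted sum vanishes, which is the very line plotBestFit1 draws. A quick sanity check with made-up weights (the numbers are hypothetical):

w = array([1.0, -2.0, 0.5])                        # hypothetical weights
print(classifyVector(array([1.0, 0.2, 1.0]), w))   # 1.0 - 0.4 + 0.5 = 1.1 > 0  -> 1.0
print(classifyVector(array([1.0, 1.0, 1.0]), w))   # 1.0 - 2.0 + 0.5 = -0.5 < 0 -> 0.0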
Parse the dataset, train the best parameters with the improved stochastic gradient ascent, and then measure the error rate on the test set:
def colicTest():
    frTrain = open("D:/ML_in_Action/machinelearninginaction/Ch05/horseColicTraining.txt")
    frTest = open("D:/ML_in_Action/machinelearninginaction/Ch05/horseColicTest.txt")
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():    # parse the training set
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    # fit the best parameters with the improved stochastic gradient ascent
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():     # parse the test set
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        # feed the features and fitted weights through sigmoid to predict 1 or 0,
        # then compare the prediction with the true label
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("error rate on the test set: %f" % errorRate)
    return errorRate
Run colicTest() ten times and compute the average error rate:
def multiTest():    # run colicTest() ten times and average the error rate
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d runs, the average error rate is: %f" % (numTests, errorSum / float(numTests)))