logistic回归
- 处理两分类问题
- 涉及模块:numpy,random
1.理论
logistic回归是将线性回归模型的预测值转变为分类的一个模型。
回归模型:z = w^T x + b
用sigmoid函数将z值转为分类标记:y = sigmoid(z) = 1 / (1 + e^(-z))
令正例的概率为:p(y=1|x) = e^(w^T x + b) / (1 + e^(w^T x + b))
则:ln[ p(y=1|x) / p(y=0|x) ] = w^T x + b(对数几率为线性函数)
因此,需要估计参数w和b,通过极大似然法进行估计:L(w,b) = ∏_i p(y_i | x_i; w, b)
取对数:l(w,b) = Σ_i [ y_i(w^T x_i + b) - ln(1 + e^(w^T x_i + b)) ]
求最优解(梯度下降法):w ← w + α Σ_i (y_i - sigmoid(w^T x_i + b)) x_i
2.函数
- sigmoid函数
- 回归系数的确定(梯度上升法/随机梯度上升法)
- 画出logistic回归最佳拟合直线和数据集
- 分类
3.代码
'''logistic回归(两分类)'''
import numpy as np
import random
# Sigmoid (logistic) function: maps any real value into (0, 1).
def sigmoid(x):
    """Return 1 / (1 + e^-x), elementwise for array/matrix input.

    The argument is clipped to [-500, 500] before exponentiation so that
    np.exp never overflows (the original emitted a RuntimeWarning and
    produced inf for large-magnitude negative x). The returned
    probabilities are unchanged: sigmoid already saturates at 0.0 / 1.0
    far inside that range.
    """
    x = np.clip(x, -500, 500)
    return 1.0 / (1 + np.exp(-x))
# Determine the regression coefficients by batch gradient ascent
# (each step uses the gradient computed over the entire data set).
def gradAscent(dataSet, labels):
    """Fit logistic-regression weights by batch gradient ascent.

    Parameters
    ----------
    dataSet : m x n samples; column 0 is conventionally the constant 1.0
        intercept feature (assumed from how plotBestFit uses weights[0]).
    labels  : length-m sequence of 0/1 class labels.

    Returns
    -------
    (n, 1) ndarray of fitted weights.

    Note: rewritten with plain ndarrays and ``.dot`` because
    ``np.mat`` / ``np.matrix`` is deprecated in NumPy; the arithmetic
    (and the resulting coefficients) is identical.
    """
    dataArr = np.asarray(dataSet, dtype=float)                  # m x n
    labelArr = np.asarray(labels, dtype=float).reshape(-1, 1)   # m x 1 column
    m, n = dataArr.shape
    alpha = 0.001      # step size along the gradient
    maxCycles = 500    # fixed number of iterations
    weights = np.ones((n, 1))  # initialize every coefficient to 1
    for _ in range(maxCycles):
        h = sigmoid(dataArr.dot(weights))       # predicted probabilities, m x 1
        error = labelArr - h                    # (label - prediction)
        # Ascend the log-likelihood gradient: X^T (y - h).
        weights = weights + alpha * dataArr.T.dot(error)
    return weights
# Determine the regression coefficients by stochastic gradient ascent
# (online learning: one randomly chosen sample per weight update).
def stocGradAscent(dataset, labels, numIter=150):
    """Fit logistic-regression weights by stochastic gradient ascent.

    Parameters
    ----------
    dataset : m x n samples.
    labels  : length-m sequence of 0/1 class labels.
    numIter : number of full passes over the data (default 150).

    Returns
    -------
    length-n ndarray of fitted weights.

    Bug fix vs. the original listing: the random draw is a *position*
    into the shrinking ``dataIndex`` list, and the actual sample index
    is looked up from it before updating and deleting. The original
    used the drawn position directly as a row index into the data, so
    high-index rows were never sampled and low-index rows were revisited
    — the "without replacement" pass was not actually happening.
    """
    dataArr = np.array(dataset)
    m, n = np.shape(dataArr)
    weights = np.ones(n)
    for it in range(numIter):
        dataIndex = list(range(m))  # sample indices not yet used this pass
        for i in range(m):
            # Decaying-but-bounded step size: the 0.01 floor keeps late
            # updates influential (alpha never reaches 0); 1+it+i shrinks
            # alpha as training proceeds. Tunable in practice.
            alpha = 4 / (1.0 + it + i) + 0.01
            pos = int(random.uniform(0, len(dataIndex)))  # random position
            sample = dataIndex[pos]       # fix: map position -> sample index
            h = sigmoid(sum(dataArr[sample] * weights))
            error = labels[sample] - h
            weights = weights + alpha * error * dataArr[sample]
            del dataIndex[pos]            # sample without replacement
    return weights
# Plot the data set and the fitted logistic-regression decision boundary.
def plotBestFit(weights, dataMat, labels):
    """Scatter the two classes and draw the line where sigmoid(z) = 0.5.

    Since z = w0*x0 + w1*x + w2*y with x0 the constant-1 intercept
    feature, the boundary z = 0 gives y = (-w0 - w1*x) / w2; feature 1
    is plotted on the x-axis and feature 2 on the y-axis.
    """
    import matplotlib.pyplot as plt
    points = np.array(dataMat)
    rows = np.shape(points)[0]  # number of samples
    pos_x, pos_y = [], []
    neg_x, neg_y = [], []
    for idx in range(rows):
        # Split the points by class so each class gets its own marker.
        xs, ys = (pos_x, pos_y) if int(labels[idx]) == 1 else (neg_x, neg_y)
        xs.append(points[idx, 1])
        ys.append(points[idx, 2])
    plt.figure()
    plt.scatter(pos_x, pos_y, s=30, c='red', marker='s')
    plt.scatter(neg_x, neg_y, s=30, c='green')
    # Axis range and spacing chosen to suit this particular data set.
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    plt.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
# Classify one feature vector with the learned weights.
def classifyLog(vec, weights):
    """Return 1.0 if sigmoid(vec . weights) > 0.5, else 0.0."""
    prob = sigmoid(sum(vec * weights))
    return 1.0 if prob > 0.5 else 0.0
4.例子
# Example: predict horse colic mortality from 21 clinical features.
import logistic
import numpy as np

def _parseLine(line):
    """Split one tab-separated record into (21 float features, raw label field)."""
    fields = line.strip().split('\t')
    return [float(fields[i]) for i in range(21)], fields[21]

# Training data: feature rows plus float labels.
# 'with' guarantees the file is closed (the original leaked both handles).
trainset = []; trainlabels = []
with open('horseColicTraining.txt') as f_train:
    for line in f_train:
        features, label = _parseLine(line)
        trainset.append(features)
        trainlabels.append(float(label))

# Test data: feature rows, int labels, and the sample count used later
# as the denominator of the error rate.
test_num = 0.0; testset = []; testlabels = []
with open('horseColicTest.txt') as f_test:
    for line in f_test:
        test_num += 1
        features, label = _parseLine(line)
        testset.append(features)
        testlabels.append(int(label))
def errorRate():
    """Train on the global training set, then return the test-set error rate.

    Uses 500 passes of stochastic gradient ascent; reads the module-level
    trainset/trainlabels/testset/testlabels/test_num built above.
    """
    trainedW = logistic.stocGradAscent(trainset, trainlabels, 500)
    wrong = 0
    rows = np.shape(testset)[0]
    for idx in range(rows):
        predicted = int(logistic.classifyLog(np.array(testset[idx]), trainedW))
        if predicted != testlabels[idx]:
            wrong += 1
    return float(wrong) / test_num
# Average the error rate over several independent runs — stochastic
# gradient ascent draws random samples, so each run differs.
iter_num = 10; error_sum = 0.0
for _ in range(iter_num):
    error_sum += errorRate()
error_avg = error_sum / iter_num
print('after %d iteration, average error rate is %f'%(iter_num,error_avg))