Classic Machine Learning Algorithms (20): Logistic Regression
(1) Introduction to Logistic Regression:
Logistic regression, also called logistic regression analysis, is a generalized linear model widely used in data mining, automated disease diagnosis, ad targeting, spam filtering, and similar fields.
The algorithm estimates a discrete value (for example a binary value such as 0 or 1, yes or no, true or false) from a given set of independent variables. Put simply, it fits the data to a logistic function in order to estimate the probability that an event occurs, which is why it is called logistic regression. Because it estimates a probability, its output always lies between 0 and 1 (as one would expect).
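In the standard formulation, the model passes a linear combination of the inputs through the sigmoid (logistic) function, so the estimated probability is

P(y=1|x) = 1 / (1 + e^-(w0 + w1*x1 + ... + wn*xn))

which, as stated above, always lies strictly between 0 and 1.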
Conditions for applying a logistic regression model:
1. The dependent variable is a binary categorical variable or the occurrence rate of some event, expressed numerically. Note that repeated-count indicators are not suitable for logistic regression.
2. The residuals and the dependent variable follow a binomial distribution. The binomial distribution corresponds to categorical variables, not the normal distribution, so parameter estimation and testing use maximum likelihood rather than least squares.
3. The independent variables are linearly related to the logit of the probability (spelled out right after this list).
4. The observations are independent of each other.
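Condition 3 deserves spelling out: the model is linear in the log-odds (the logit), not in the probability itself:

ln(P/(1-P)) = w0 + w1*x1 + ... + wn*xn

Applying the sigmoid function to both sides recovers the probability form given earlier.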
Typical application scenarios of logistic regression:
Finding causes: identifying risk factors for a disease, and so on.
Prediction: once a logistic regression model has been built, it can predict how likely a disease or event is to occur under different values of the independent variables.
Discrimination: similar to prediction, the model judges how likely it is that a given person belongs to a certain disease group or category.
These are the three most common uses of logistic regression. In practice its uses are extremely broad; it has become one of the most widely used analysis methods in epidemiology and medicine.
(2) Implementation of the algorithm (adapted from Machine Learning in Action):
from numpy import *

def loadDataSet():
    dataMat = []; labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(inX):
    return 1.0/(1 + exp(-inX))

def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)               # convert to NumPy matrix
    labelMat = mat(classLabels).transpose()   # convert to NumPy matrix
    m, n = shape(dataMatrix)
    alpha = 0.001
    maxCycles = 500
    weights = ones((n, 1))
    for k in range(maxCycles):                # heavy on matrix operations
        h = sigmoid(dataMatrix*weights)       # matrix mult
        error = (labelMat - h)                # vector subtraction
        weights = weights + alpha * dataMatrix.transpose() * error  # matrix mult
    return weights
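A quick sanity check of gradAscent might look like the following sketch; it assumes testSet.txt (one sample per line: two feature values and a 0/1 label, as loadDataSet expects) is in the working directory:

dataArr, labelMat = loadDataSet()
weights = gradAscent(dataArr, labelMat)
print(weights)   # a 3x1 matrix: intercept term plus two feature weights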
def plotBestFit(weights):
    # expects a plain NumPy array; for gradAscent output pass weights.getA()
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1]*x)/weights[2]   # decision boundary: 0 = w0 + w1*x1 + w2*x2
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
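To visualize the boundary found by batch gradient ascent, convert the matrix returned by gradAscent into a plain array first, as Machine Learning in Action does:

weights = gradAscent(dataArr, labelMat)
plotBestFit(weights.getA())   # getA() turns the NumPy matrix into an ndarray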
def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)   # initialize to all ones
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i]*weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)   # initialize to all ones
    for j in range(numIter):
        dataIndex = list(range(m))   # list() so entries can be deleted below
        for i in range(m):
            alpha = 4/(1.0+j+i) + 0.0001   # alpha decreases with iteration but never reaches 0 because of the constant
            randIndex = int(random.uniform(0, len(dataIndex)))
            h = sigmoid(sum(dataMatrix[randIndex]*weights))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del(dataIndex[randIndex])
    return weights
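Continuing from the earlier snippet, the stochastic version returns a plain array, so it can be plotted directly; a sketch under the same testSet.txt assumption:

weights = stocGradAscent1(array(dataArr), labelMat, numIter=150)
plotBestFit(weights)   # no getA() needed here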
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX*weights))
    if prob > 0.5: return 1.0
    else: return 0.0
def colicTest():
    frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = (float(errorCount)/numTestVec)
    print("the error rate of this test is: %f" % errorRate)
    return errorRate
def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum/float(numTests)))
(3) Example application with sklearn:
Using the iris dataset as an example, we first reduce the data to two dimensions with PCA and plot the points; then we fit a logistic regression model, using the one-vs-rest (OvR) strategy for the multiclass problem, and inspect how the model separates the classes.
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np

irisData = load_iris()
X_train = irisData.data
y_train = irisData.target

# project the 4-D iris features onto the first two principal components
pca = PCA(n_components=2)
X = pca.fit_transform(X_train)

# plot the projected data, one color per class
f = plt.figure()
ax = f.add_subplot(1, 1, 1)
ax.plot(X[:, 0][y_train==0], X[:, 1][y_train==0], 'bo')
ax.scatter(X[:, 0][y_train==1], X[:, 1][y_train==1], c='r')
ax.scatter(X[:, 0][y_train==2], X[:, 1][y_train==2], c='y')
ax.set_title('Data')
plt.show()

# one-vs-rest logistic regression with equal class weights
clf = LogisticRegression(multi_class='ovr', solver='lbfgs', class_weight={0: 1, 1: 1, 2: 1})
clf.fit(X, y_train)
score = clf.score(X, y_train)   # training accuracy

# predict over a dense grid to draw the decision regions
x0min, x0max = X[:, 0].min(), X[:, 0].max()
x1min, x1max = X[:, 1].min(), X[:, 1].max()
h = 0.05
xx, yy = np.meshgrid(np.arange(x0min-1, x0max+1, h), np.arange(x1min-1, x1max+1, h))
x_ = xx.reshape([xx.shape[0]*xx.shape[1], 1])
y_ = yy.reshape([yy.shape[0]*yy.shape[1], 1])
test_x = np.c_[x_, y_]
test_predict = clf.predict(test_x)
z = test_predict.reshape(xx.shape)
plt.contourf(xx, yy, z, cmap=plt.cm.Paired)
plt.axis('tight')
colors = 'bgy'
for i, color in zip(clf.classes_, colors):
    idx = np.where(y_train==i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired)
plt.title("score:%s" % score)
plt.show()
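Because logistic regression estimates probabilities rather than just labels (as noted in Part (1)), the fitted classifier can also report per-class probabilities via scikit-learn's predict_proba; a minimal check:

probs = clf.predict_proba(X[:5])   # one row per sample, one column per class; each row sums to 1
print(probs)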