一、概述
逻辑回归分析是对定性变量的回归分析,即对变量进行分类预测,如预测某封邮件是否为垃圾邮件、某人的信用是否良好等。
二、算法推导
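对于线性回归模型,有如下表达方式:
$$z=\theta^{T}x=\theta_0x_0+\theta_1x_1+\cdots+\theta_nx_n\qquad(x_0=1)$$
而对于线性回归预测模型,其取值是连续的,不能够满足作为分类值的需求,如需将预测值全部归并到0和1范围内,需要进行如下逻辑变换,
即:
$$g(z)=\frac{1}{1+e^{-z}}$$
将函数进行归并化简得到如下:
$$h_\theta(x)=g(\theta^{T}x)=\frac{1}{1+e^{-\theta^{T}x}}$$
在进行分类时:
$$P(y=1\mid x;\theta)=h_\theta(x),\qquad P(y=0\mid x;\theta)=1-h_\theta(x)$$
假设Y是0-1型变量,其损失函数可通过对似然函数取对数得到。似然函数为:
$$L(\theta)=\prod_{i=1}^{m}h_\theta\bigl(x^{(i)}\bigr)^{y^{(i)}}\Bigl(1-h_\theta\bigl(x^{(i)}\bigr)\Bigr)^{1-y^{(i)}}$$
化简可以得到:
$$J(\theta)=-\frac{1}{m}\ln L(\theta)=-\frac{1}{m}\sum_{i=1}^{m}\Bigl[y^{(i)}\ln h_\theta\bigl(x^{(i)}\bigr)+\bigl(1-y^{(i)}\bigr)\ln\Bigl(1-h_\theta\bigl(x^{(i)}\bigr)\Bigr)\Bigr]$$
对损失函数进行梯度下降,得到参数的更新公式:
$$\theta_j:=\theta_j-\alpha\frac{\partial J(\theta)}{\partial\theta_j}=\theta_j+\frac{\alpha}{m}\sum_{i=1}^{m}\Bigl(y^{(i)}-h_\theta\bigl(x^{(i)}\bigr)\Bigr)x_j^{(i)}$$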
三、算法Python实现
python算法实现如下:
import numpy as np
from sklearn.model_selection import train_test_split
def sigmod(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of x.

    Accepts a scalar or an array; arrays are transformed element-wise.
    The exponential is only ever taken of a non-positive number, so large
    negative inputs do not trigger a floating-point overflow.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) never exceeds 1, so it cannot overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the equivalent e^x / (1 + e^x).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as they are.
    return result[()]
def lr(x_train, y_train):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x_train: 2-D array with one sample per row (m rows, n columns).
        y_train: array of m labels aligned with the rows of x_train.

    Returns:
        1-D array of n fitted weights.

    The weights start from random values in [0, 1). Iteration stops when
    every component of the summed gradient is below the threshold in
    absolute value, or after max_loop passes. The number of completed passes
    and the last gradient are printed.
    """
    max_loop = 500000   # upper bound on the number of passes over the data
    threshold = 0.01    # convergence bound on each summed-gradient component
    alpha = 0.01        # learning rate
    m, n = x_train.shape
    theta = np.random.rand(n)
    cnt = 0
    while cnt < max_loop:
        # Gradient of the log-likelihood summed over all samples:
        # X^T (y - sigmoid(X theta)), computed in one vectorized step.
        diff = x_train.T @ (y_train - sigmod(x_train @ theta))
        # Step along the averaged gradient.
        theta = theta + alpha * diff / m
        # The stopping test uses the summed (not averaged) gradient.
        if (np.abs(diff) < threshold).all():
            break
        cnt += 1
    print(cnt)
    print(diff)
    return theta
def lr_predict(theta, x_test):
    """Classify a single sample with fitted weights.

    Args:
        theta: weight vector, as returned by lr.
        x_test: feature vector of one sample.

    Returns:
        1 if sigmoid(theta . x_test) is strictly greater than 0.5, else 0.
    """
    return 1 if sigmod(theta.T @ x_test) > 0.5 else 0
if __name__ == "__main__":
    # Every column of data.txt except the last is a feature; the last is the label.
    dataset = np.loadtxt('data.txt')
    features = dataset[:, 0:-1]
    labels = dataset[:, -1]
    # Prepend a constant column of ones so the first weight acts as the intercept.
    ones = np.ones((len(labels), 1))
    features = np.concatenate((ones, features), axis=1)
    # Hold out 30% of the rows for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)
    theta = lr(x_train, y_train)
    y_pred = [lr_predict(theta, sample) for sample in x_test]
    # Element-wise flags: True where the prediction equals the true label.
    print(y_test == y_pred)
输出:
468405
[ 0.00999997 0.00046538 -0.00128062]
[ True True True True True True True True True True True True
True True True True True True True True False True True True
True True True True True True]
考虑到在使用梯度下降求解的时候采用的是批量梯度下降,运算量较大,为减少运算量,也可使用随机梯度下降法,代码实现如下:
import numpy as np
from sklearn.model_selection import train_test_split
def sigmod(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of x.

    Accepts a scalar or an array; arrays are transformed element-wise.
    The exponential is only ever taken of a non-positive number, so large
    negative inputs do not trigger a floating-point overflow.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) never exceeds 1, so it cannot overflow.
    decay = np.exp(-np.abs(x))
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the equivalent e^x / (1 + e^x).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as they are.
    return result[()]
def lr(x_train, y_train):
    """Fit logistic-regression weights with stochastic gradient descent.

    Args:
        x_train: 2-D array with one sample per row.
        y_train: sequence of labels aligned with the rows of x_train.

    Returns:
        1-D array of fitted weights, one per column of x_train.

    The weights start from random values in [0, 1) and are updated after
    every single sample, visiting the samples in their given order. The
    number of completed passes is printed before returning.
    """
    max_loop = 500000   # upper bound on the number of passes over the data
    threshold = 0.01    # convergence bound on each gradient component
    alpha = 0.001       # learning rate
    _, n = x_train.shape
    theta = np.random.rand(n)
    cnt = 0
    while cnt < max_loop:
        # Keeps the stopping test below well-defined when there are no samples.
        diff = np.zeros(n)
        for x_i, y_i in zip(x_train, y_train):
            # Per-sample gradient of the log-likelihood, applied immediately.
            diff = (y_i - sigmod(theta @ x_i)) * x_i
            theta = theta + alpha * diff
        # NOTE(review): only the gradient of the LAST sample of the pass is
        # tested here, which is a weak convergence criterion - confirm intent.
        if (np.abs(diff) < threshold).all():
            break
        cnt += 1
    print(cnt)
    return theta
def lr_predict(theta, x_test):
    """Classify a single sample with fitted weights.

    Args:
        theta: weight vector, as returned by lr.
        x_test: feature vector of one sample.

    Returns:
        1 if sigmoid(theta . x_test) is strictly greater than 0.5, else 0.
    """
    return 1 if sigmod(theta.T @ x_test) > 0.5 else 0
if __name__ == "__main__":
    # Every column of data.txt except the last is a feature; the last is the label.
    dataset = np.loadtxt('data.txt')
    features = dataset[:, 0:-1]
    labels = dataset[:, -1]
    # Prepend a constant column of ones so the first weight acts as the intercept.
    ones = np.ones((len(labels), 1))
    features = np.concatenate((ones, features), axis=1)
    # Hold out 30% of the rows for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)
    theta = lr(x_train, y_train)
    y_pred = [lr_predict(theta, sample) for sample in x_test]
    # Element-wise flags: True where the prediction equals the true label.
    print(y_test == y_pred)
四、算法Sklearn实现
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
# Load the samples: every column but the last is a feature, the last is the label.
dataset = np.loadtxt("data.txt")
features, labels = dataset[:, 0:-1], dataset[:, -1]
# Hold out 30% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)
# Fit scikit-learn's logistic regression with its default settings.
model = LogisticRegression()
model.fit(x_train, y_train)
# Element-wise flags: True where the prediction equals the true label.
predicted = model.predict(x_test)
print(y_test == predicted)
输出:
[ True True True False True True False True True True True True
True False False True True True True True True True True True
True True True True True True]
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import numpy as np
# Load the samples: every column but the last is a feature, the last is the label.
dataset = np.loadtxt("data.txt")
# Hold out 30% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(dataset[:, 0:-1], dataset[:, -1], test_size=0.3)
# SGDClassifier defaults to the hinge loss (a linear SVM). Request the logistic
# loss so that this really is logistic regression trained by SGD.
# scikit-learn releases older than 1.1 spell this option loss="log".
# Note: alpha here is the regularization strength, not the learning rate.
model = SGDClassifier(loss="log_loss", alpha=0.001)
model.fit(x_train, y_train)
# Element-wise flags: True where the prediction equals the true label.
print(y_test == model.predict(x_test))
输出:
[ True True True True True True True True False True False True
False True True True False False True True True True True True
True True True True True True]