[Machine Learning & Algorithms] Hand-written algorithms in Python: logistic regression with regularization
Background
The principles of logistic regression and the derivations of its loss function and gradient are not repeated here (the objective the code minimizes is restated below for reference).
Implemented features:
1. Regularization: none, L1, or L2
2. Solvers: gradient descent and coordinate descent
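For reference, this formula block is not in the original post, but it is exactly what the code below implements. With samples $(x_i, y_i)$, $y_i \in \{0,1\}$, and $z_i = \sigma(w^\top x_i)$, the solvers minimize the negative log-likelihood

$$L(w) = -\sum_{i=1}^{n}\left[y_i \log z_i + (1-y_i)\log(1-z_i)\right], \qquad \nabla L(w) = -\sum_{i=1}^{n}(y_i - z_i)\,x_i,$$

with an added $\frac{\lambda}{2}\lVert w\rVert_2^2$ (gradient term $\lambda w$) for L2, or $\lambda\lVert w\rVert_1$ (subgradient term $\lambda\,\mathrm{sign}(w)$) for L1.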
Code
import pandas as pd
import numpy as np
import copy


class LogisticRegression:

    def __init__(self, penalty=None, lbd=1.0, solver='GD'):
        self.sigmoid = lambda x: 1 / (1 + np.exp(-x))
        self.w = None
        self.penalty = penalty  # None, 'l1' or 'l2'
        self.lbd = lbd          # regularization strength (lambda)
        self.solver = solver    # 'GD': gradient descent, 'CD': coordinate descent

    def GradientDescent(self, X, y, iter_nums=1000, learning_rate=0.001, silent=True):
        '''
        n samples, m variables (including the constant term)
        '''
        # initialize the coefficients self.w (1 x m)
        self.w = np.random.uniform(-0.001, 0.001, X.shape[1])
        for n in range(iter_nums):
            # predicted probabilities z (1 x n)
            z = self.sigmoid(np.dot(self.w, X.values.T))
            # per-sample gradient contributions (m x n)
            gradient = (y.values - z) * X.values.T
            # gradient of the negative log-likelihood (1 x m)
            gradient = -np.sum(gradient, axis=1)
            # add the L1 / L2 penalty term
            if self.penalty == 'l2':
                gradient = gradient + self.lbd * self.w
            elif self.penalty == 'l1':
                # subgradient of the L1 norm
                gradient = gradient + self.lbd * np.sign(self.w)
            # step along the negative gradient
            self.w = self.w - gradient * learning_rate
            # stop once the gradient has (approximately) vanished
            if np.all(np.abs(gradient) < 1e-3):
                break
            # optionally print the iteration progress
            if not silent and n % 50 == 0:
                print('Round {}:\n'.format(n), self.w)

    def CoordinateDescent(self, X, y, iter_nums=200, learning_rate=0.001):
        '''
        n samples, m variables (including the constant term)
        (as in the original, regularization is only applied by the GD solver)
        '''
        # initialize the coefficients self.w (1 x m)
        self.w = np.random.uniform(-0.001, 0.001, X.shape[1])
        for n in range(iter_nums):
            old_w = copy.copy(self.w)
            for i in range(len(self.w)):
                # minimize the loss along coordinate i, holding the others fixed
                for s in range(200):
                    z = self.sigmoid(np.dot(self.w, X.values.T))
                    gradient = -np.sum((y.values - z) * X.values.T[i, :])
                    self.w[i] = self.w[i] - gradient * learning_rate
                    if abs(gradient) < 1e-3:
                        break
            # stop once a full sweep no longer moves the coefficients
            if np.all(np.abs(old_w - self.w) < 1e-3):
                break

    def fit(self, X: pd.DataFrame, y: pd.Series):
        # append a constant column for the intercept
        # (note: this mutates the caller's DataFrame, as in the original)
        X['const'] = 1
        self.X_columns = X.columns.tolist()
        if self.solver == 'GD':
            self.GradientDescent(X, y)
        elif self.solver == 'CD':
            self.CoordinateDescent(X, y)

    def predict(self, X):
        print('final coef:\n', pd.Series(self.w, index=self.X_columns, name='coef'))
        if 'const' not in X:
            X['const'] = 1
        P = pd.Series([0] * X.shape[0])
        for i, item in enumerate(self.X_columns):
            P = P + X[item] * self.w[i]
        P = P.apply(self.sigmoid)
        return P


if __name__ == '__main__':
    # X and Y are loaded from a CSV as shown in the Output section below
    lr = LogisticRegression(solver='GD')
    lr.fit(X, Y)
    p = lr.predict(X)
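Since the dataset path is elided in the post, here is a minimal smoke test on synthetic data (not from the original; X_demo, Y_demo and true_w are made-up names for illustration): generate labels from a known coefficient vector and check that the fitted weights land nearby.

import numpy as np
import pandas as pd

# synthetic-data smoke test (illustrative sketch, not part of the original post)
rng = np.random.default_rng(0)
n = 2000
X_demo = pd.DataFrame(rng.normal(size=(n, 2)), columns=['V1', 'V2'])
true_w = np.array([1.5, -2.0, 0.5])          # coefficients for V1, V2, const
logits = X_demo.values @ true_w[:2] + true_w[2]
Y_demo = pd.Series((rng.uniform(size=n) < 1 / (1 + np.exp(-logits))).astype(int))

lr = LogisticRegression(solver='GD')
lr.fit(X_demo, Y_demo)
p = lr.predict(X_demo)   # the printed coefficients should land near true_w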
Output
1. Results of the two solvers
- Coordinate descent
df = pd.read_csv(......)
X = df[[x for x in df if x!='y']]
Y = df['y']
lr = LogisticRegression(solver='CD')
lr.fit(X,Y)
p = lr.predict(X)
#OUTPUT:
final coef:
V1 -0.329929
V2 0.204230
V3 0.289618
V4 0.117988
V5 -0.070303
const -3.182506
Name: coef, dtype: float64
- Gradient descent
lr = LogisticRegression(solver='GD')
lr.fit(X,Y)
p = lr.predict(X)
#OUTPUT
final coef:
V1 -0.330163
V2 0.204261
V3 0.289825
V4 0.114231
V5 -0.074305
const -3.182503
Name: coef, dtype: float64
- Comparison with the statsmodels results (both hand-written solvers land close to the MLE estimates below)
import statsmodels.api as sm
clf = sm.Logit(Y,X).fit()
clf.summary()
#OUTPUT
Logit Regression Results
Dep. Variable: y No. Observations: 5000
Model: Logit Df Residuals: 4994
Method: MLE Df Model: 5
Date: Tue, 31 Mar 2020 Pseudo R-squ.: 0.02706
Time: 18:17:43 Log-Likelihood: -868.85
converged: True LL-Null: -893.01
LLR p-value: 3.040e-09
coef std err z P>|z| [0.025 0.975]
V1 -0.3302 0.150 -2.196 0.028 -0.625 -0.036
V2 0.2043 0.049 4.164 0.000 0.108 0.300
V3 0.2898 0.059 4.882 0.000 0.173 0.406
V4 0.1142 0.139 0.819 0.413 -0.159 0.388
V5 -0.0744 0.195 -0.382 0.703 -0.456 0.308
const -3.1825 0.075 -42.630 0.000 -3.329 -3.036
2. Results with the two regularizers
- L2 regularization
lr = LogisticRegression(penalty='l2')
lr.fit(X,Y)
p = lr.predict(X)
#OUTPUT
final coef:
V1 -0.315437
V2 0.201855
V3 0.285692
V4 0.112852
V5 -0.065936
const -3.163898
Name: coef, dtype: float64
#Compare with sklearn output:
from sklearn.linear_model import LogisticRegression as LR
clf = LR(penalty='l2',fit_intercept=False)
clf.fit(X,Y)
clf.coef_
#OUTPUT
array([[-0.31545809, 0.20185523, 0.28569328, 0.11282428, -0.06598395,
-3.16389876]])
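A note not in the original: the two results agree because the hand-written lbd and sklearn's C parameterize the same penalty inversely. sklearn minimizes ½‖w‖² + C·(log-loss), i.e. lbd = 1/C, so the defaults lbd=1.0 and C=1.0 describe the same objective. Matching a different strength means passing C explicitly, e.g.:

# hypothetical: reproducing a hand-written lbd of 2.0 requires C = 1/2.0
clf = LR(penalty='l2', C=1.0 / 2.0, fit_intercept=False)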
- L1 regularization
lr = LogisticRegression(penalty='l1')
lr.fit(X,Y)
p = lr.predict(X)
#OUTPUT
final coef:
V1 -0.290285
V2 0.201277
V3 0.285892
V4 0.115066
V5 -0.039643
const -3.172344
Name: coef, dtype: float64
#Compare with sklearn output:
clf = LR(penalty='l1',fit_intercept=False)
clf.fit(X,Y)
clf.coef_
#OUTPUT
array([[-0.29037545, 0.20127795, 0.2858939 , 0.11495548, -0.03985233,
-3.17235054]])
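One caveat, not in the original: since scikit-learn 0.22 the default solver is 'lbfgs', which does not support penalty='l1'. On current versions the comparison above needs an L1-capable solver requested explicitly:

# 'liblinear' (or 'saga') supports the L1 penalty; the default 'lbfgs' does not
clf = LR(penalty='l1', solver='liblinear', fit_intercept=False)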
Finally, feel free to read the Python implementations of other algorithms:
[Machine Learning & Algorithms] Hand-written algorithms in Python: CART trees
[Machine Learning & Algorithms] Hand-written algorithms in Python: logistic regression with regularization
[Machine Learning & Algorithms] Hand-written algorithms in Python: the XGBoost algorithm
[Machine Learning & Algorithms] Hand-written algorithms in Python: K-means and K-means++
[Machine Learning & Algorithms] Hand-written algorithms in Python: softmax regression