逻辑回归
逻辑回归是一种分类模型:
$z = W^{T}X = w_0 + w_1x_1 + w_2x_2 + \cdots + w_nx_n$

逻辑回归是通过sigmoid函数将输入值映射到
$[0,1]$ 的区间范围:
$p = s(z) = \dfrac{1}{1+e^{-z}}$
目标函数
逻辑回归的目标函数为:
$J(W) = -\sum_{i=1}^{n}\left[\, y^{(i)}\log\!\big(s(z^{(i)})\big) + \big(1-y^{(i)}\big)\log\!\big(1-s(z^{(i)})\big) \right]$

因此,目标函数最小时,$W$ 的值就是我们要求的最终权重值。
代码实现
1.数据预处理
# 1. Data preprocessing: load the iris dataset, clean it, and keep only two
# classes so that it can be used for binary logistic regression.
import numpy as np
import pandas as pd

data = pd.read_csv(r"F:\数据集\Iris数据集\iris.csv")
# The CSV's first column is an unnamed row index; give it a proper name.
data = data.rename(columns={"Unnamed: 0": "id"})
# The id column carries no information useful for classification.
data.drop("id", axis=1, inplace=True)
data.drop_duplicates(inplace=True)
# Encode the species labels as integers.
data["Species"] = data["Species"].map({"versicolor": 0, "setosa": 1, "virginica": 2})
# This logistic regression is binary: discard the third class entirely.
data = data[data["Species"] != 2]
2.编写逻辑回归的类
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Attributes
    ----------
    w_ : ndarray of shape (1 + n_features,)
        Model weights after fitting; ``w_[0]`` is the intercept (bias).
    loss_ : list of float
        Cross-entropy loss recorded at every iteration of ``fit``.
    """

    def __init__(self, alpha, times):
        """Initialize the model hyper-parameters.

        Parameters
        ----------
        alpha : float
            Learning rate for gradient descent.
        times : int
            Number of gradient-descent iterations.
        """
        self.alpha = alpha
        self.times = times

    def sigmoid(self, z):
        """Map the linear score ``z = w.T @ x`` to a probability in (0, 1).

        Parameters
        ----------
        z : float or ndarray
            Linear combination of weights and features.

        Returns
        -------
        float or ndarray
            Probability that the sample belongs to class 1.  A sample is
            later predicted as class 1 when p >= 0.5 (equivalently z >= 0).
        """
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, X, y):
        """Train the model on the given samples.

        Parameters
        ----------
        X : array-like of shape [n_samples, n_features]
            Training feature matrix.
        y : array-like of shape [n_samples]
            Binary (0/1) target label for each sample.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # w_[0] is the bias term; w_[1:] are the per-feature weights.
        self.w_ = np.zeros(1 + X.shape[1])
        self.loss_ = []
        for _ in range(self.times):
            z = np.dot(X, self.w_[1:]) + self.w_[0]
            p = self.sigmoid(z)
            # Cross-entropy loss: J(W) = -sum(y*log(p) + (1-y)*log(1-p)).
            # NOTE(review): log(p) is not clipped, so a perfectly-confident
            # wrong prediction would produce inf/nan — acceptable for a demo.
            cost = -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
            self.loss_.append(cost)
            # dJ/dw = X.T @ (p - y), so the descent step ADDS alpha*(y - p).
            self.w_[0] += self.alpha * np.sum(y - p)
            self.w_[1:] += self.alpha * np.dot(X.T, y - p)

    def predict_proba(self, X):
        """Predict class probabilities for the given samples.

        Parameters
        ----------
        X : array-like of shape [n_samples, n_features]
            Samples to predict.

        Returns
        -------
        ndarray of shape [n_samples, 2]
            Column 0 is P(class 0), column 1 is P(class 1).
        """
        X = np.asarray(X)
        z = np.dot(X, self.w_[1:]) + self.w_[0]
        p = self.sigmoid(z).reshape(-1, 1)
        return np.concatenate([1 - p, p], axis=1)

    def predict(self, X):
        """Predict the class label (0 or 1) for each sample.

        Parameters
        ----------
        X : array-like of shape [n_samples, n_features]
            Samples to predict.

        Returns
        -------
        ndarray of shape [n_samples]
            Predicted class label per sample.
        """
        return np.argmax(self.predict_proba(X), axis=1)
3.测试逻辑回归的类
# 3. Evaluate the classifier: shuffle each class reproducibly, take the
# first 40 rows of each class for training and keep the rest for testing.
t1 = data[data["Species"] == 0]
t2 = data[data["Species"] == 1]
# sample(len(t)) with a fixed random_state is a reproducible full shuffle.
t1 = t1.sample(len(t1), random_state=0)
t2 = t2.sample(len(t2), random_state=0)
# Features are every column except the last; the label is the last column.
train_X = pd.concat([t1.iloc[:40, :-1], t2.iloc[:40, :-1]], axis=0)
train_y = pd.concat([t1.iloc[:40, -1], t2.iloc[:40, -1]], axis=0)
test_X = pd.concat([t1.iloc[40:, :-1], t2.iloc[40:, :-1]], axis=0)
test_y = pd.concat([t1.iloc[40:, -1], t2.iloc[40:, -1]], axis=0)

lr = LogisticRegression(alpha=0.01, times=20)
lr.fit(train_X, train_y)
result = lr.predict(test_X)
# Fraction of correctly classified held-out samples.
accuracy = np.sum(result == test_y) / len(test_y)
输出:1.0(测试集准确率为 100%)
可视化
# Visualization: compare predictions against ground truth, then plot the
# training-loss curve recorded by fit().
import matplotlib as mpl
import matplotlib.pyplot as plt

# Use a Chinese-capable font and keep the minus sign rendering correctly.
mpl.rcParams["font.family"] = "SimHei"
mpl.rcParams["axes.unicode_minus"] = False

# Larger red dots = predictions; green dots = true labels. Overlapping
# points mean the sample was classified correctly.
plt.plot(result, "ro", ms=15, label="预测值")
plt.plot(test_y.values, "go", label="真实值")
plt.title("逻辑回归")
plt.xlabel("样本序号")
plt.ylabel("类别")
plt.legend()
plt.show()

# Loss per iteration — should decrease if the learning rate is sane.
plt.plot(range(1, lr.times + 1), lr.loss_, "go-")
输出:[&lt;matplotlib.lines.Line2D at 0x562b4bbe48&gt;]