from matplotlib. font_manager import FontProperties
import matplotlib. pyplot as plt
import numpy as np
import random
"""
Desc:
梯度上升算法测试函数,求函数f(x) = -x^2+4x的极大值
Parameters:
None
Returns:
None
"""
def Gradient_Ascent_test ( ) :
def f_prime ( x_old) :
return - 2 * x_old + 4
x_old = - 1
x_new = 0
alpha = 0.01
presision = 0.00000001
while abs ( x_new - x_old) > presision:
x_old = x_new
x_new = x_old + alpha * f_prime( x_old)
print ( x_new)
"""
Desc:
加载数据
Parameters:
None
Returns:
dataMat - 数据列表
labelMat - 标签列表
"""
def loadDataSet ( ) :
dataMat = [ ]
labelMat = [ ]
fr = open ( 'testSet.txt' )
for line in fr. readlines( ) :
lineArr = line. strip( ) . split( )
dataMat. append( [ 1.0 , float ( lineArr[ 0 ] ) , float ( lineArr[ 1 ] ) ] )
labelMat. append( int ( lineArr[ 2 ] ) )
fr. close( )
return dataMat, labelMat
"""
Desc:
绘制数据集
Parameters:
weights - 权重参数数组
Returns:
None
"""
def plotBestFit ( weights) :
dataMat, labelMat = loadDataSet( )
dataArr = np. array( dataMat)
n = np. shape( dataMat) [ 0 ]
xcord1 = [ ]
ycord1 = [ ]
xcord2 = [ ]
ycord2 = [ ]
for i in range ( n) :
if int ( labelMat[ i] ) == 1 :
xcord1. append( dataArr[ i, 1 ] )
ycord1. append( dataArr[ i, 2 ] )
else :
xcord2. append( dataArr[ i, 1 ] )
ycord2. append( dataArr[ i, 2 ] )
fig = plt. figure( )
ax = fig. add_subplot( 111 )
ax. scatter( xcord1, ycord1, s= 20 , c= 'red' , marker= 's' , alpha= .5 )
ax. scatter( xcord2, ycord2, s= 20 , c= 'green' , alpha= .5 )
x = np. arange( - 3.0 , 3.0 , 0.1 )
y = ( - weights[ 0 ] - weights[ 1 ] * x) / weights[ 2 ]
ax. plot( x, y)
plt. title( 'BestFit' )
plt. xlabel( 'x1' )
plt. ylabel( 'y2' )
plt. show( )
"""
Desc:
sigmoid函数
Parameters:
inX - 数据
Returns:
sigmoid函数
"""
def sigmoid ( inX) :
return 1.0 / ( 1 + np. exp( - inX) )
"""
Desc:
梯度上升法
Parameters:
dataMath - 数据集
classLabels - 数据标签
Returns:
weights.getA() - 求得的权重数组(最优参数)
weights_array - 每次更新的回归系数
"""
def gradAscent ( dataMath, classLabels) :
dataMatrix = np. mat( dataMath)
labelMat = np. mat( classLabels) . transpose( )
m, n = np. shape( dataMatrix)
alpha = 0.01
maxCycles = 500
weights = np. ones( ( n, 1 ) )
weights_array = np. array( [ ] )
for k in range ( maxCycles) :
h = sigmoid( dataMatrix * weights)
error = labelMat - h
weights = weights + alpha * dataMatrix. transpose( ) * error
weights_array = np. append( weights_array, weights)
weights_array = weights_array. reshape( maxCycles, n)
return weights. getA( ) , weights_array
"""
Desc:
改进的随机梯度上升法
Parameters:
dataMatrix - 数据数组
classLabels - 数据标签
numIter - 迭代次数
Returns:
weights - 求得的回归系数数组(最优参数)
weights_array - 每次更新的回归系数
"""
def stocGradAscent1 ( dataMatrix, classLabels, numIter= 150 ) :
m, n = np. shape( dataMatrix)
weights = np. ones( n)
weights_array = np. array( [ ] )
for j in range ( numIter) :
dataIndex = list ( range ( m) )
for i in range ( m) :
alpha = 4 / ( 1.0 + j+ i) + 0.01
randIndex = int ( random. uniform( 0 , len ( dataIndex) ) )
h = sigmoid( sum ( dataMatrix[ randIndex] * weights) )
error = classLabels[ randIndex] - h
weights = weights + alpha * error * dataMatrix[ randIndex]
weights_array = np. append( weights_array, weights, axis= 0 )
del ( dataIndex[ randIndex] )
weights_array = weights_array. reshape( numIter* m, n)
return weights, weights_array
"""
Desc:
绘制回归系数与迭代次数的关系
Parameters:
weights_array1 - 回归系数数组1
weights_array2 - 回归系数数组2
Returns:
None
"""
def plotWeights ( weights_array1, weights_array2) :
font = FontProperties( fname= r"C:\Windows\Fonts\simsun.ttc" , size= 14 )
fig, axs = plt. subplots( nrows= 3 , ncols= 2 , sharex= False , sharey= False , figsize= ( 20 , 10 ) )
x1 = np. arange( 0 , len ( weights_array1) , 1 )
axs[ 0 ] [ 0 ] . plot( x1, weights_array1[ : , 0 ] )
axs0_title_text = axs[ 0 ] [ 0 ] . set_title( u'改进的梯度上升算法,回归系数与迭代次数关系' , FontProperties= font)
axs0_ylabel_text = axs[ 0 ] [ 0 ] . set_ylabel( u'w0' , FontProperties= font)
plt. setp( axs0_title_text, size= 20 , weight= 'bold' , color= 'black' )
plt. setp( axs0_ylabel_text, size= 20 , weight= 'bold' , color= 'black' )
axs[ 1 ] [ 0 ] . plot( x1, weights_array1[ : , 1 ] )
axs1_ylabel_text = axs[ 1 ] [ 0 ] . set_ylabel( u'w1' , FontProperties= font)
plt. setp( axs1_ylabel_text, size= 20 , weight= 'bold' , color= 'black' )
axs[ 2 ] [ 0 ] . plot( x1, weights_array1[ : , 2 ] )
axs2_title_text = axs[ 2 ] [ 0 ] . set_title( u'迭代次数' , FontProperties= font)
axs2_ylabel_text = axs[ 2 ] [ 0 ] . set_ylabel( u'w2' , FontProperties= font)
plt. setp( axs2_title_text, size= 20 , weight= 'bold' , color= 'black' )
plt. setp( axs2_ylabel_text, size= 20 , weight= 'bold' , color= 'black' )
x2 = np. arange( 0 , len ( weights_array2) , 1 )
axs[ 0 ] [ 1 ] . plot( x2, weights_array2[ : , 0 ] )
axs0_title_text = axs[ 0 ] [ 1 ] . set_title( u'梯度上升算法,回归系数与迭代次数关系' , FontProperties= font)
axs0_ylabel_text = axs[ 0 ] [ 1 ] . set_ylabel( u'w0' , FontProperties= font)
plt. setp( axs0_title_text, size= 20 , weight= 'bold' , color= 'black' )
plt. setp( axs0_ylabel_text, size= 20 , weight= 'bold' , color= 'black' )
axs[ 1 ] [ 1 ] . plot( x2, weights_array2[ : , 1 ] )
axs1_ylabel_text = axs[ 1 ] [ 1 ] . set_ylabel( u'w1' , FontProperties= font)
plt. setp( axs1_ylabel_text, size= 20 , weight= 'bold' , color= 'black' )
axs[ 2 ] [ 1 ] . plot( x2, weights_array2[ : , 2 ] )
axs2_title_text = axs[ 2 ] [ 1 ] . set_title( u'迭代次数' , FontProperties= font)
axs2_ylabel_text = axs[ 2 ] [ 1 ] . set_ylabel( u'w2' , FontProperties= font)
plt. setp( axs2_title_text, size= 20 , weight= 'bold' , color= 'black' )
plt. setp( axs2_ylabel_text, size= 20 , weight= 'bold' , color= 'black' )
plt. show( )
if __name__ == '__main__':
    # Load the 2-D toy data set, train with both batch gradient ascent and
    # the improved stochastic variant, then plot how every coefficient
    # evolves with the number of updates.
    dataMat, labelMat = loadDataSet()
    weights2, weights_array2 = gradAscent(dataMat, labelMat)
    weights1, weights_array1 = stocGradAscent1(np.array(dataMat), labelMat)
    plotWeights(weights_array1, weights_array2)
import numpy as np
import random
"""
Desc:
sigmoid函数
Parameters:
inX - 数据
Returns:
sigmoid函数
"""
def sigmoid ( inX) :
return 1.0 / ( 1 + np. exp( - inX) )
"""
Desc:
梯度上升法
Parameters:
dataMath - 数据集
classLabels - 数据标签
Returns:
weights.getA() - 求得的权重数组(最优参数)
weights_array - 每次更新的回归系数
"""
def gradAscent ( dataMath, classLabels) :
dataMatrix = np. mat( dataMath)
labelMat = np. mat( classLabels) . transpose( )
m, n = np. shape( dataMatrix)
alpha = 0.01
maxCycles = 500
weights = np. ones( ( n, 1 ) )
for k in range ( maxCycles) :
h = sigmoid( dataMatrix * weights)
error = labelMat - h
weights = weights + alpha * dataMatrix. transpose( ) * error
return weights. getA( )
"""
Desc:
改进的随机梯度上升法
Parameters:
dataMatrix - 数据数组
classLabels - 数据标签
numIter - 迭代次数
Returns:
weights - 求得的回归系数数组(最优参数)
"""
def stocGradAscent1 ( dataMatrix, classLabels, numIter= 150 ) :
m, n = np. shape( dataMatrix)
weights = np. ones( n)
for j in range ( numIter) :
dataIndex = list ( range ( m) )
for i in range ( m) :
alpha = 4 / ( 1.0 + j+ i) + 0.01
randIndex = int ( random. uniform( 0 , len ( dataIndex) ) )
h = sigmoid( sum ( dataMatrix[ randIndex] * weights) )
error = classLabels[ randIndex] - h
weights = weights + alpha * error * dataMatrix[ randIndex]
del ( dataIndex[ randIndex] )
return weights
"""
Desc:
用python写的Logistic分类器做预测
Parameters:
None
Returns:
None
"""
def colicTest ( ) :
frTrain = open ( 'horseColicTraining.txt' )
frTest = open ( 'horseColicTest.txt' )
trainingSet = [ ]
trainingLabels = [ ]
for line in frTrain. readlines( ) :
currLine = line. strip( ) . split( '\t' )
lineArr = [ ]
for i in range ( len ( currLine) - 1 ) :
lineArr. append( float ( currLine[ i] ) )
trainingSet. append( lineArr)
trainingLabels. append( float ( currLine[ - 1 ] ) )
trainWeights = gradAscent( np. array( trainingSet) , trainingLabels)
errorCount = 0
numTestVect = 0.0
for line in frTest. readlines( ) :
numTestVect += 1.0
currLine = line. strip( ) . split( '\t' )
lineArr = [ ]
for i in range ( len ( currLine) - 1 ) :
lineArr. append( float ( currLine[ i] ) )
if int ( classifyVector( np. array( lineArr) , trainWeights[ : , 0 ] ) ) != int ( currLine[ - 1 ] ) :
errorCount += 1
errorRate = ( float ( errorCount) / numTestVect) * 100
print ( "测试集错误率为:%.2f%%" % errorRate)
"""
Desc:
分类函数
Parameters:
inX - 特征向量
weights - 回归系数
Returns:
分类结果
"""
def classifyVector ( inX, weights) :
prob = sigmoid( sum ( inX * weights) )
if prob > 0.5 :
return 1.0
else :
return 0.0
if __name__ == '__main__':
    # Run the hand-written logistic regression classifier on the
    # horse-colic data set and print its test error rate.
    colicTest()
'''
测试集错误率为:28.36%
'''
from sklearn. linear_model import LogisticRegression
"""
Desc:
使用Sklearn构建Logistic回归分类器
Parameters:
None
Returns:
None
"""
def colicSklearn ( ) :
frTrain = open ( 'horseColicTraining.txt' )
frTest = open ( 'horseColicTest.txt' )
trainingSet = [ ]
trainingLabels = [ ]
testSet = [ ]
testLabels = [ ]
for line in frTrain. readlines( ) :
currLine = line. strip( ) . split( '\t' )
lineArr = [ ]
for i in range ( len ( currLine) - 1 ) :
lineArr. append( float ( currLine[ i] ) )
trainingSet. append( lineArr)
trainingLabels. append( float ( currLine[ - 1 ] ) )
for line in frTest. readlines( ) :
currLine = line. strip( ) . split( '\t' )
lineArr = [ ]
for i in range ( len ( currLine) - 1 ) :
lineArr. append( float ( currLine[ i] ) )
testSet. append( lineArr)
testLabels. append( float ( currLine[ - 1 ] ) )
classifier = LogisticRegression( solver= 'liblinear' , max_iter= 10 ) . fit( trainingSet, trainingLabels)
test_accurcy = classifier. score( testSet, testLabels) * 100
print ( "正确率为:%f%%" % test_accurcy)
if __name__ == '__main__':
    # Run the scikit-learn logistic regression baseline on the
    # horse-colic data set and print its test accuracy.
    colicSklearn()
'''
正确率为:73.134328%
'''