### LogReg.py:
# -*- coding: utf-8 -*-
# author: Xin Chen
# *****************************************************
# References:
# http://blog.csdn.net/zouxy09/article/details/20319673
# https://zhuanlan.zhihu.com/p/21627018?refer=uqer2015
# *****************************************************
from numpy import *
import matplotlib.pyplot as plt
import time
# calculate the sigmoid function
def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))
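
# Note: exp(-inX) overflows float64 once inX drops below roughly -709, which
# makes numpy emit warnings early in training. A minimal sketch of a safer
# variant (sigmoid_stable is not part of the original script; the clipping
# bounds are one common choice, not the only one):
def sigmoid_stable(inX):
    # clip the argument so exp() stays within float64 range
    return 1.0 / (1 + exp(-clip(inX, -500, 500)))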
# train a logistic regression model using one of several optimization algorithms
# input: train_x is a mat datatype, each row stands for one sample
#        train_y is a mat datatype too, each row is the corresponding label
#        opts is a dict of optimization options, including the step size and the maximum number of iterations
def trainLogRegres(train_x, train_y, opts):
    # record the training time
    startTime = time.time()

    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    weights = ones((numFeatures, 1))

    # optimize with the chosen algorithm
    for k in range(maxIter):
        if opts['optimizeType'] == 'gradDescent':  # batch gradient descent
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif opts['optimizeType'] == 'stocGradDescent':  # stochastic gradient descent
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif opts['optimizeType'] == 'smoothStocGradDescent':  # smooth stochastic gradient descent
            # randomly select samples to reduce cyclic fluctuations
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01  # step size decays over iterations
                randIndex = int(random.uniform(0, len(dataIndex)))
                sample = dataIndex[randIndex]
                output = sigmoid(train_x[sample, :] * weights)
                error = train_y[sample, 0] - output
                weights = weights + alpha * train_x[sample, :].transpose() * error
                del dataIndex[randIndex]  # within one iteration, each sample is used at most once
        elif opts['optimizeType'] == 'newton':  # Newton's method
            output = sigmoid(train_x * weights)
            # Hessian of the log-likelihood: -X^T diag(p*(1-p)) X
            hessian = train_x.transpose() * diagflat(multiply(output, output - 1)) * train_x
            gradient = train_x.transpose() * (train_y - output)
            weights = weights - linalg.inv(hessian) * gradient
        else:
            raise ValueError('Unsupported optimizeType: %s' % opts['optimizeType'])

    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
    return weights
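
# Optional helper, a sketch not present in the original script: the
# log-likelihood that the gradient updates above ascend. Printing it every
# few iterations is a cheap way to check that a given optimizeType and alpha
# are actually converging. Assumes the predicted probabilities stay strictly
# inside (0, 1), otherwise log() returns -inf.
def logLikelihood(train_x, train_y, weights):
    p = sigmoid(train_x * weights)
    # sum over samples of y*log(p) + (1-y)*log(1-p); the 1x1 mat result
    # is collapsed to a plain float
    return float(train_y.T * log(p) + (1 - train_y).T * log(1 - p))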
# test a trained logistic regression model on a given test set
def testLogRegres(weights, test_x, test_y):
    numSamples, numFeatures = shape(test_x)
    matchCount = 0
    for i in range(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    accuracy = float(matchCount) / numSamples
    return accuracy
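
# An equivalent vectorized version (a sketch, assuming 0/1 labels):
# threshold all probabilities at once instead of looping over rows.
def testLogRegresVec(weights, test_x, test_y):
    predict = sigmoid(test_x * weights) > 0.5  # n x 1 boolean mat
    return float(mean(predict == (test_y != 0)))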
# plot a trained logistic regression model; only applicable when the samples are 2-D
def showLogRegres(weights, train_x, train_y, xlabel='X1', ylabel='X2'):
    # notice: train_x and train_y are mat datatypes
    numSamples, numFeatures = shape(train_x)
    if numFeatures != 3:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return

    # draw all samples
    X0 = []
    Y0 = []
    X1 = []
    Y1 = []
    for i in range(numSamples):
        if int(train_y[i, 0]) == 0:
            X0.append(train_x[i, 1])
            Y0.append(train_x[i, 2])
        elif int(train_y[i, 0]) == 1:
            X1.append(train_x[i, 1])
            Y1.append(train_x[i, 2])
    plt.scatter(X0, Y0, marker='o', c='red')
    plt.scatter(X1, Y1, marker='o', c='blue')

    # draw the decision boundary: w0 + w1*x1 + w2*x2 = 0, i.e. x2 = -(w0 + w1*x1) / w2
    min_x = min(train_x[:, 1])[0, 0]
    max_x = max(train_x[:, 1])[0, 0]
    weights = mat(weights).getA()  # convert mat to array
    y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
    y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()
### watermelon3alpha.py:
# -*- coding: utf-8 -*-
# author: Xin Chen
import numpy as np
import pandas as pd
import LogReg
# data from p.89 of Zhou Zhihua's book (watermelon dataset 3.0α)
data = {
    "index": range(1, 18),
    "density": [0.697, 0.774, 0.634, 0.608, 0.556, 0.403, 0.481, 0.437, 0.666, 0.243, 0.245, 0.343, 0.639, 0.657, 0.360, 0.593, 0.719],
    "sugar": [0.460, 0.376, 0.264, 0.318, 0.215, 0.237, 0.149, 0.211, 0.091, 0.267, 0.057, 0.099, 0.161, 0.198, 0.370, 0.042, 0.103],
    "label": [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
}
data = pd.DataFrame(data)
data["constant"] = 1.0
X = np.mat(data[["constant", "density", "sugar"]])
Y = np.mat(data["label"]).T
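# sanity check (a sketch): 17 samples, 3 columns (constant, density, sugar)
assert X.shape == (17, 3) and Y.shape == (17, 1)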
opts = {'alpha': 0.1, 'maxIter': 1000, 'optimizeType': 'stocGradDescent'}
# 'optimizeType': 'gradDescent', 'stocGradDescent', 'smoothStocGradDescent', 'newton'
optimalWeights = LogReg.trainLogRegres(X, Y, opts)
print('optimalWeights =\n%s' % optimalWeights)
accuracy = LogReg.testLogRegres(optimalWeights, X, Y)
print('The classification accuracy is: %.3f%%' % (accuracy * 100))
LogReg.showLogRegres(optimalWeights, X, Y, xlabel='density', ylabel='sugar')
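
# A quick sweep over all four optimizers (a sketch; the exact numbers depend
# on alpha, maxIter, and the random draws in smoothStocGradDescent):
for ot in ('gradDescent', 'stocGradDescent', 'smoothStocGradDescent', 'newton'):
    w = LogReg.trainLogRegres(X, Y, dict(opts, optimizeType=ot))
    print('%-22s accuracy = %.3f' % (ot, LogReg.testLogRegres(w, X, Y)))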