# 参考资料 (Reference):
# 机器学习实战 (Machine Learning in Action)
'''
@version: 0.0.1
@Author: tqsr
@dev: python3 vscode
@Date: 2019-10-29 23:39:44
@LastEditTime: 2019-10-30 22:00:01
@FilePath: \\机器学习实战\\05-Logistic回归\\logRegres.py
@Description: Logistic回归主要是根据现有数据对分类边界线建立回归公式,进行分类
'''
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
def loadDataSet(fileName=r'05-Logistic回归\testSet.txt'):
    """
    [summary]: Load the two-feature sample file used by the demos.

    Every non-blank line must contain three whitespace-separated fields:
    two float features followed by an integer class label.

    Arguments:
        fileName -- path of the text file to read; defaults to the
                    location that used to be hard-coded, so callers that
                    pass no argument behave as before

    Returns:
        dataMat -- list of [1.0, x1, x2] rows (a constant 1.0 is put in
                   front of the two features read from the file)
        labelMat -- list of int labels, one per row of dataMat
    """
    dataMat = []
    labelMat = []
    # The context manager closes the file even if a line fails to parse.
    with open(fileName) as fp:
        for line in fp:
            lineArr = line.split()
            if not lineArr:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat
def sigmoid(inX):
    """
    [summary]: Logistic function 1 / (1 + exp(-inX)).

    Arguments:
        inX -- scalar or NumPy array/matrix; np.exp is applied to -inX,
               so array inputs are handled element-wise

    Returns:
        value(s) of the logistic function for inX
    """
    denominator = 1 + np.exp(-inX)
    return 1.0 / denominator
def Gradient_Ascent_test():
    """
    [summary]: Gradient-ascent toy example.
    Climbs f(x) = -x^2 + 4x starting from x = 0 with a fixed step size
    until two successive iterates differ by no more than 1e-8.

    Returns:
        x_new -- the x at which the iteration stopped (approximately the
                 maximiser x = 2); the value is also printed
    """
    def f_prime(x_old):
        # Derivative of f(x) = -x^2 + 4x.
        return -2 * x_old + 4

    # x_old only has to differ from x_new so that the loop is entered.
    x_old = -1
    x_new = 0
    alpha = 0.01
    precision = 0.00000001
    while abs(x_new - x_old) > precision:
        x_old = x_new
        x_new = x_old + alpha * f_prime(x_old)
    print(x_new)
    # Previously the result was only printed although the docstring
    # promised a return value.
    return x_new
def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """
    [summary]: Batch gradient ascent for logistic regression.
    Every coefficient starts at 1; each cycle evaluates the sigmoid on the
    whole data set and moves the coefficients by
    alpha * X^T * (labels - predictions).

    Arguments:
        dataMatIn -- 2-D sequence/array of samples (one row per sample)
        classLabels -- sequence of labels, one per sample
        alpha -- step size (default 0.001, the former hard-coded value)
        maxCycles -- number of update cycles (default 500, the former
                     hard-coded value)

    Returns:
        weights -- (n, 1) ndarray holding the final coefficients
        weights_array -- (maxCycles, n) ndarray; row k holds the
                         coefficients after cycle k
    """
    # Plain ndarrays with the @ operator replace np.mat, which was removed
    # in NumPy 2.0.
    dataMatrix = np.asarray(dataMatIn, dtype=float)
    labelColumn = np.asarray(classLabels, dtype=float).reshape(-1, 1)
    n = dataMatrix.shape[1]
    weights = np.ones((n, 1))
    # Preallocate the history instead of growing it with np.append, which
    # copies the whole array on every cycle.
    weights_array = np.empty((maxCycles, n))
    for k in range(maxCycles):
        h = sigmoid(dataMatrix @ weights)
        error = labelColumn - h
        weights = weights + alpha * (dataMatrix.T @ error)
        weights_array[k] = weights.ravel()
    return weights, weights_array
def plotBestFit(weights):
    """
    [summary]: Scatter the sample set and draw the decision boundary.
    The samples are re-read with loadDataSet(); rows labelled 1 are drawn
    as red squares, all other rows as green dots. The line plotted is
    x2 = (-w0 - w1 * x1) / w2 for x1 in [-3.0, 3.0).

    Arguments:
        weights -- indexable holding the coefficients w0, w1, w2 at
                   positions 0, 1 and 2
    """
    dataMat, labelMat = loadDataSet()
    samples = np.array(dataMat)

    positiveX, positiveY = [], []
    otherX, otherY = [], []
    for row, label in zip(samples, labelMat):
        if int(label) == 1:
            positiveX.append(row[1])
            positiveY.append(row[2])
        else:
            otherX.append(row[1])
            otherY.append(row[2])

    ax = plt.figure().add_subplot(111)
    ax.scatter(positiveX, positiveY, s=30, c='red', marker='s')
    ax.scatter(otherX, otherY, s=30, c='green')

    # Boundary: points where w0 + w1 * x1 + w2 * x2 == 0.
    x = np.arange(-3.0, 3.0, 0.1)
    boundary = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, boundary)

    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def stocGradAscent0(dataMatrix, classLabels):
    """
    [summary]: Single-pass stochastic gradient ascent.
    All coefficients start at 1. The samples are visited once, in order,
    and after each sample the coefficients are moved by
    alpha * (label - prediction) * sample with alpha fixed at 0.01.

    Arguments:
        dataMatrix -- 2-D array of samples (one row per sample)
        classLabels -- sequence of labels, indexed like the rows

    Returns:
        weights -- coefficient vector after the single pass
    """
    sampleCount, featureCount = np.shape(dataMatrix)
    stepSize = 0.01
    weights = np.ones(featureCount)
    for row in range(sampleCount):
        sample = dataMatrix[row]
        prediction = sigmoid(sum(sample * weights))
        residual = classLabels[row] - prediction
        weights = weights + stepSize * residual * sample
    return weights
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """
    [summary]: Stochastic gradient ascent with a decaying step size.
    In every pass each sample is used exactly once, in random order, and
    the step size 4 / (1 + j + i) + 0.01 shrinks as the pass index j and
    the within-pass counter i grow.

    Arguments:
        dataMatrix -- 2-D array of samples (one row per sample)
        classLabels -- sequence of labels, indexed like the rows

    Keyword Arguments:
        numIter {int} -- number of passes over the data (default: {150})

    Returns:
        weights -- final coefficient vector
        weights_array -- (numIter * m, n) array; one row per update
    """
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    # Preallocate the history: np.append copies the whole array on every
    # call, which made long runs quadratic in the number of updates.
    weights_array = np.empty((numIter * m, n))
    update = 0
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.01
            randIndex = np.random.randint(len(dataIndex))
            # Map the random position to the sample index that is still
            # unused and remove it. The former code indexed the data with
            # the position itself, so samples could repeat within a pass
            # while others were skipped.
            sampleIndex = dataIndex.pop(randIndex)
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            weights_array[update] = weights
            update += 1
    return weights, weights_array
def plotWeights(weights_array1, weights_array2):
    """
    [summary]: Plot each coefficient against the update count.
    Draws a 3 x 2 grid: the left column shows columns 0..2 of
    weights_array1, the right column shows columns 0..2 of weights_array2.

    Arguments:
        weights_array1 -- 2-D array of coefficients, one row per update
                          (titled as the stochastic gradient ascent run)
        weights_array2 -- 2-D array of coefficients, one row per update
                          (titled as the batch gradient ascent run)
    """
    # NOTE(review): the font path is Windows-specific; confirm it exists on
    # the machine running the plot.
    font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
    fig, axs = plt.subplots(nrows=3,
                            ncols=2,
                            sharex=False,
                            sharey=False,
                            figsize=(20, 10))
    panels = (
        (weights_array1, u'随机梯度上升算法:回归系数与迭代次数关系'),
        (weights_array2, u'梯度上升算法:回归系数与迭代次数关系'),
    )
    for col, (history, title) in enumerate(panels):
        iterations = np.arange(0, len(history), 1)
        for row in range(3):
            ax = axs[row][col]
            ax.plot(iterations, history[:, row])
            # The Text keyword is lowercase `fontproperties`; the former
            # `FontProperties=` spelling is rejected by current matplotlib.
            texts = [ax.set_ylabel(u'W%d' % row, fontproperties=font)]
            if row == 0:
                texts.append(ax.set_title(title, fontproperties=font))
            if row == 2:
                texts.append(ax.set_xlabel(u'迭代次数', fontproperties=font))
            plt.setp(texts, size=20, weight='bold', color='black')
    plt.show()
def classifyVector(inX, weights):
    """
    [summary]: Classify one sample with the given coefficients.

    Arguments:
        inX -- feature vector (multiplied element-wise with weights)
        weights -- coefficient vector

    Returns:
        1.0 if sigmoid(sum(inX * weights)) is greater than 0.5, else 0.0
    """
    activation = sum(inX * weights)
    if sigmoid(activation) > 0.5:
        return 1.0
    return 0.0
def colicTest():
    """
    [summary]: Train on the horse-colic training file and report the
    error rate on the horse-colic test file.
    Each line is tab-separated: the first 21 fields are features and the
    field at index 21 is the label. Training uses stocGradAscent1 with
    1000 passes.

    Returns:
        errorRate -- percentage of misclassified test samples (also
                     printed)
    """
    featuresNumbers = 21
    trainingSet = []
    trainingLabels = []
    # Context managers make sure both files are closed.
    with open(r'05-Logistic回归\horseColicTraining.txt',
              'r',
              encoding='utf-8') as frTrain:
        for line in frTrain:
            if not line.strip():
                continue
            currLine = line.strip().split('\t')
            trainingSet.append(
                [float(value) for value in currLine[:featuresNumbers]])
            trainingLabels.append(float(currLine[featuresNumbers]))
    # stocGradAscent1 returns (weights, weights_array); only the final
    # weights are needed here. Passing the whole tuple on to
    # classifyVector was a bug.
    trainWeights, _ = stocGradAscent1(np.array(trainingSet), trainingLabels,
                                      1000)
    errorCount = 0
    numTestVec = 0.0
    with open(r'05-Logistic回归\horseColicTest.txt', 'r',
              encoding='utf-8') as frTest:
        for line in frTest:
            if not line.strip():
                continue
            numTestVec += 1.0
            currLine = line.strip().split('\t')
            lineArr = [float(value) for value in currLine[:featuresNumbers]]
            # int(float(...)) also accepts labels written like "1.000000".
            expected = int(float(currLine[featuresNumbers]))
            if int(classifyVector(np.array(lineArr),
                                  trainWeights)) != expected:
                errorCount += 1
    errorRate = (float(errorCount) / numTestVec) * 100
    print("这个测试集的错误率: %.2f%%" % errorRate)
    return errorRate
def multiTest():
    """
    [summary]: Run colicTest() ten times and print the mean of the
    error rates it returned.
    """
    numTests = 10
    errorSum = 0.0
    runsLeft = numTests
    while runsLeft > 0:
        errorSum = errorSum + colicTest()
        runsLeft -= 1
    averageError = errorSum / float(numTests)
    print("在 %d 迭代后,平均错误率: %f" % (numTests, averageError))
if __name__ == "__main__":
    # Script entry point: fit the same sample set with stochastic and with
    # batch gradient ascent, then plot both coefficient histories.
    dataMat, labelMat = loadDataSet()
    weights1, weights_array1 = stocGradAscent1(
        np.array(dataMat),
        labelMat,
    )
    weights2, weights_array2 = gradAscent(
        dataMat,
        labelMat,
    )
    plotWeights(weights_array1, weights_array2)
# 用sklearn预测病马死亡率 (predict horse-colic mortality with sklearn)
from sklearn.linear_model import LogisticRegression
def colicSklearn():
    """
    [summary]: Fit sklearn's LogisticRegression (liblinear solver,
    max_iter=10) on the horse-colic training file and print its accuracy
    on the horse-colic test file.
    Each line is tab-separated; the last field is the label and all
    preceding fields are features.
    """
    def readSamples(path):
        # Parse one data file into (feature rows, labels).
        features = []
        labels = []
        # The context manager closes the file; the former code never did.
        with open(path, 'r', encoding='utf-8') as fr:
            for line in fr:
                stripped = line.strip()
                if not stripped:
                    # Skip blank lines instead of failing on float('').
                    continue
                currLine = stripped.split('\t')
                features.append([float(value) for value in currLine[:-1]])
                labels.append(float(currLine[-1]))
        return features, labels

    trainingSet, trainingLabels = readSamples(
        r'05-Logistic回归\horseColicTraining.txt')
    testSet, testLabels = readSamples(r'05-Logistic回归\horseColicTest.txt')
    classifier = LogisticRegression(solver='liblinear',
                                    max_iter=10).fit(trainingSet,
                                                     trainingLabels)
    test_accuracy = classifier.score(testSet, testLabels) * 100
    print('正确率:%f%%' % test_accuracy)
if __name__ == "__main__":
    # Script entry point: run the sklearn-based horse-colic experiment.
    colicSklearn()