# 机器学习实战——Logistic回归

《机器学习实战》中的代码如下:

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

import numpy as np
import matplotlib.pyplot as plt

def sigmiod(x):
    """Return the logistic (sigmoid) function 1 / (1 + exp(-x)).

    Args:
        x: A scalar or NumPy array/matrix; ``np.exp`` is applied element-wise.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``.
    """
    return 1.0 / (1 + np.exp(-x))

# NOTE(review): the `def` header of this data-loading routine, the line that
# opens the input file, and the loop that binds `line` are all missing from
# the file as it stands; restore them from the original listing before use.
'''
Body of the data-loading function.
strip() removes surrounding whitespace such as \t, \n, \r and ' '.
'''
dataMat = []
labelMat = []
# Each whitespace-separated record becomes the row [1.0, field0, field1]
# (the leading 1.0 is a constant column) and field2 is stored as an int label.
lineArr = line.strip().split()
dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])])
labelMat.append(int(lineArr[2]))
return dataMat,labelMat

"""
这是梯度上升算法的函数
alpha梯度上升算法中的学习系数
maxCycles进行迭代的最大次数
"""

'''
把数据转换成Numpy矩阵
'''
dataMatrix = np.mat(dataMatIn)
labelMat = np.mat(classLabels).transpose()

m,n = np.shape(dataMatrix)          #m,n是测试数据的行数与列数
weights = np.ones((n,1))
for k in range(maxCycles):
h = sigmiod(dataMatrix*weights)
error = (labelMat-h)
weights = weights + alpha *dataMatrix.transpose()*error
return weights

# NOTE(review): the `def` header of this stochastic gradient-ascent routine is
# missing and the nested loop bodies have lost their indentation. dataMatIn,
# classLabels and maxCycles are presumably its parameters -- confirm.
m,n = np.shape(dataMatIn)
weights = np.ones(n)
for j in range(maxCycles):
dataIndex = np.arange(m)
for i in range(m):
# The step size shrinks as i and j grow; the +0.001 keeps it above zero.
alpha = 4/(1.0+i+j)+0.001
randomIndex = int(np.random.uniform(0,len(dataIndex)))
# assumes dataMatIn[randomIndex] is an ndarray row so `*` is element-wise -- TODO confirm
h = sigmiod(sum(dataMatIn[randomIndex]*weights))
error = classLabels[randomIndex] - h
weights = weights + alpha * error * dataMatIn[randomIndex]
# NOTE(review): np.delete returns a new array and the result is discarded
# here, so dataIndex never shrinks. randomIndex is also used to index the
# data directly rather than through dataIndex, so samples are drawn with
# replacement. Confirm whether sampling without replacement was intended.
np.delete(dataIndex,dataIndex[randomIndex])
return weights

def plotBestFit(weights):
    """Scatter-plot both classes and draw the fitted decision boundary.

    Args:
        weights: Three coefficients indexable as ``weights[0]``..``weights[2]``.
            The line drawn is ``weights[0] + weights[1]*x1 + weights[2]*x2 = 0``.

    NOTE(review): reads ``dataMat`` and ``labelMat`` from module scope; they
    are presumably filled by the data-loading code before this is called --
    confirm, since the loader's header is not visible in this file.
    """
    print("最佳系数为:")
    print(weights)

    dataArr = np.mat(dataMat)
    n = np.shape(dataArr)[0]

    # (x1, y1) hold the samples labelled 1; (x2, y2) hold all other samples.
    x1, y1 = [], []
    x2, y2 = [], []
    for i in range(n):
        if int(labelMat[i]) == 1:
            x1.append(dataArr[i, 1])
            y1.append(dataArr[i, 2])
        else:
            x2.append(dataArr[i, 1])
            y2.append(dataArr[i, 2])

    fig = plt.figure()
    # The axes object was used without ever being created; create it here.
    ax = fig.add_subplot(111)
    ax.scatter(x1, y1, s=30, c='red', marker='s')
    ax.scatter(x2, y2, s=30, c='green')

    # Solve w0 + w1*x + w2*y = 0 for y over the plotted x range.
    x = np.arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y, c='blue')
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.show()

def run_main():
    """Plot the fits produced by batch and stochastic gradient ascent.

    NOTE(review): ``weights1`` and ``weights2`` are not assigned anywhere in
    the visible source. The lines that load the data and train the two models
    appear to be missing and must be restored before this function can run.
    """
    # Classification obtained with (batch) gradient ascent. getA() is called
    # on weights1 first, presumably to turn an np.matrix into an ndarray.
    plotBestFit(weights1.getA())

    # Classification obtained with stochastic gradient ascent.
    plotBestFit(weights2)

# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    run_main()


# -*- coding: utf-8 -*-
"""
Created on Fri Mar  3 14:15:43 2017

"""

import numpy as np
import matplotlib.pyplot as plt

def sigmoid(x):
    """Return the logistic (sigmoid) function 1 / (1 + exp(-x)).

    Args:
        x: A scalar or NumPy array/matrix; ``np.exp`` is applied element-wise.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``.
    """
    return 1.0 / (1 + np.exp(-x))

# NOTE(review): the `def` header of this loader and the loop that binds `line`
# are missing from the file, and the if/else bodies have lost their
# indentation; restore them from the original listing before use.
'''
Load the watermelon data set
'''
file = 'D:\\Program Files (x86)\\机器学习\\周志华机器学习\\WatermelonDataSet.txt'
# NOTE(review): the handle is never closed in the visible lines; a
# `with open(...)` block would avoid the leak.
ftrain = open(file)
trainingset = []
labelset = []
# Each comma-separated record contributes the row [1.0, field7, field8];
# field9 equal to "是" maps to label 1.0, anything else to 0.0.
LineArr = line.strip().split(',')
trainingset.append([1.0,float(LineArr[7]),float(LineArr[8])])
if ("是" == LineArr[9]):
labelset.append(1.0)
else:
labelset.append(0.0)
return trainingset, labelset

# NOTE(review): the `def` header of this routine is missing and the nested
# loop bodies have lost their indentation. trainingset, labelset and maxcircle
# are presumably its parameters -- confirm against the original listing.
'''
Stochastic gradient routine (the update below adds alpha * x * error, i.e. it
ascends the likelihood).
alpha is the learning rate, maxcircle is the number of passes
trainingset is the training data, labelset holds the matching labels
'''
row,col = np.shape(trainingset)
weights = np.ones(col)
for j in range(maxcircle):
DataIndex = np.arange(row)
for i in range(row):
# The step size shrinks as i and j grow; the +0.01 keeps it above zero.
alpha = 4.0/(i+j+1.0) + 0.01
randomindex = int(np.random.uniform(0,len(DataIndex)))
# assumes trainingset[randomindex] is an ndarray row so `*` is element-wise -- TODO confirm
h = sigmoid(sum(trainingset[randomindex]*weights))
error = labelset[randomindex] - h
weights = weights + alpha * trainingset[randomindex]*error
# NOTE(review): np.delete returns a new array and the result is discarded
# here, so DataIndex never shrinks and samples are drawn with replacement.
# Confirm whether sampling without replacement was intended.
np.delete(DataIndex,DataIndex[randomindex])
return weights

def plotBestFit(weights):
    """Scatter-plot both classes and draw the fitted decision boundary.

    Args:
        weights: Three coefficients indexable as ``weights[0]``..``weights[2]``.
            The line drawn is ``weights[0] + weights[1]*x1 + weights[2]*x2 = 0``.

    NOTE(review): reads ``trainingset`` and ``labelset`` from module scope;
    they are presumably filled by the data-loading code before this is
    called -- confirm, since the loader's header is not visible in this file.
    """
    dataArr = np.mat(trainingset)
    n = np.shape(dataArr)[0]

    # (x1, y1) hold the samples labelled 1; (x2, y2) hold all other samples.
    x1, y1 = [], []
    x2, y2 = [], []
    for i in range(n):
        if int(labelset[i]) == 1:
            x1.append(dataArr[i, 1])
            y1.append(dataArr[i, 2])
        else:
            x2.append(dataArr[i, 1])
            y2.append(dataArr[i, 2])

    fig = plt.figure()
    # The axes object was used without ever being created; create it here.
    ax = fig.add_subplot(111)
    ax.scatter(x1, y1, s=30, c='red', marker='s')
    ax.scatter(x2, y2, s=30, c='green')

    # Solve w0 + w1*x + w2*y = 0 for y over the plotted x range.
    x = np.arange(0.0, 1.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y, c='blue')
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.show()

def ClassifyVector(X,weights):
    """Classify a single sample with the logistic model.

    Args:
        X: Feature vector of one sample. It is multiplied with ``weights``
            using ``*``; assumes an ndarray so this is element-wise.
        weights: Coefficient vector of the same length as ``X``.

    Returns:
        1 if ``sigmoid(sum(X * weights))`` is strictly greater than 0.5,
        otherwise 0.
    """
    result = sigmoid(sum(X * weights))
    return 1 if result > 0.5 else 0

def ErrorRate(trainingset,weights,labelset,maxcircle):
    """Return the fraction of training samples the model misclassifies.

    Args:
        trainingset: Sequence of feature rows; each row is converted with
            ``np.array`` before classification.
        weights: Coefficient vector passed to ``ClassifyVector``.
        labelset: Labels aligned with ``trainingset`` by index.
        maxcircle: Unused here; kept so existing callers keep working.

    Returns:
        Misclassified count divided by the number of samples, as a float.
        An empty ``trainingset`` yields 0.0 instead of dividing by zero.
    """
    n = np.shape(trainingset)[0]
    if n == 0:
        # Nothing to classify, so nothing can be wrong.
        return 0.0
    errorcount = sum(
        1
        for i in range(n)
        if ClassifyVector(np.array(trainingset[i]), weights) != labelset[i]
    )
    return errorcount * 1.0 / n

"""
这是梯度上升算法的函数
alpha梯度上升算法中的学习系数
maxCycles进行迭代的最大次数
"""

'''
把数据转换成Numpy矩阵
'''
dataMatrix = np.mat(dataMatIn)
labelMat = np.mat(classLabels).transpose()

m,n = np.shape(dataMatrix)          #m,n是测试数据的行数与列数
weights = np.ones((n,1))
for k in range(maxCycles):
h = sigmoid(dataMatrix*weights)
error = (labelMat-h)
weights = weights + alpha *dataMatrix.transpose()*error
return weights

def run_main():
"""
Main entry point: reports coefficients and error rate for a range of
iteration counts, then plots the fit, once per algorithm.
"""
# NOTE(review): indentation was lost, so it is unclear whether each
# plotBestFit call belongs inside its loop or after it. `weights`,
# `trainingset` and `labelset` are also never assigned in the visible lines;
# the data-loading and training calls appear to be missing -- restore them
# from the original listing.
# Iteration counts tried for each algorithm: 150..295 and 500..575, step 5.
maxcircle1 = np.arange(150,300,5)
maxcircle2 = np.arange(500,580,5)

print("以下是应用随机梯度上升算法的分类")
for i in maxcircle1:
print("最佳系数为:")
print(weights)
errorrate = ErrorRate(trainingset,weights,labelset,i)
print("迭代次数为%d时,错误率为:%f" %(i,errorrate))
plotBestFit(weights)

print("以下是应用梯度上升算法的分类")
for i in maxcircle2:
print("最佳系数为:")
print(weights)
errorrate = ErrorRate(trainingset,weights,labelset,i)
print("迭代次数为%d时,错误率为:%f" %(i,errorrate))
# getA() is called on weights before plotting, presumably because the batch
# routine returns an np.matrix -- confirm.
plotBestFit(weights.getA())

# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    run_main()

• 评论

• 上一篇
• 下一篇