今天完成了马疝病的预测,结果符合书上预期,原理还是不太懂。有机会问问大神们吧。
实在没有精力贴过程了,贴个代码吧,操作和书上一样。
注意矩阵相乘用numpy的操作,mat。
注意在线算法和离线算法的区别。
这个数据集有数据缺失用0补齐,注意为什么用0,书上有介绍。
在数据缺失30%的情况下,这个正确率是可以接受的,改变参数之后能降低错误率20%左右。
代码如下:
#!/usr/bin/python
#coding:utf-8
#logistic.py
from numpy import *
import numpy
import operator
#Load the 2-D test data set from testSet.txt.
def loadDataSet():
    """Read testSet.txt and return (datamat, lavels).

    Each line of the file is "x1<ws>x2<ws>label".  A constant 1.0 is
    prepended to every feature row so that weights[0] acts as the bias term.

    Returns:
        datamat: list of [1.0, x1, x2] float feature rows.
        lavels:  list of int class labels.
    """
    datamat = []
    lavels = []
    # 'with' guarantees the handle is closed (the original leaked it).
    with open('testSet.txt') as f:
        for line in f.readlines():
            fields = line.strip().split()
            datamat.append([1.0, float(fields[0]), float(fields[1])])
            lavels.append(int(fields[2]))
    return datamat, lavels
#Sigmoid squashing function used as a smooth step.
def sigmoid(inX):
    """Map inX (scalar or numpy array) into the open interval (0, 1)."""
    denom = 1.0 + exp(-inX)
    return 1.0 / denom
#Batch gradient ascent for logistic regression (offline algorithm).
def gradUp(dataset, lavels):
    """Fit logistic-regression weights by full-batch gradient ascent.

    Args:
        dataset: m x n list/array of feature rows (bias column included).
        lavels:  length-m sequence of 0/1 class labels.

    Returns:
        numpy array of shape (n, 1) holding the learned weights.
    """
    X = numpy.mat(dataset)
    y = numpy.mat(lavels).transpose()
    m, n = X.shape
    w = ones((n, 1))
    alpha = 0.001   # fixed step size
    cycles = 500    # fixed number of full passes
    for _ in range(cycles):
        # Step along the gradient of the log-likelihood: X^T (y - h).
        h = sigmoid(X * w)
        w = w + alpha * X.transpose() * (y - h)
    return numpy.array(w)
#Stochastic gradient ascent for logistic regression (improved online version).
def newGradUp(data, lavels, cycles = 150):
    """Fit weights by stochastic gradient ascent with a decaying step size.

    Each pass visits every sample exactly once in random order; alpha decays
    as 4/(1+i+j)+0.01 so early updates are large while later ones settle
    (the 0.01 constant keeps alpha from ever reaching zero).

    Args:
        data:    m x n array-like of feature rows.
        lavels:  length-m sequence of 0/1 labels.
        cycles:  number of full passes over the data (default 150).

    Returns:
        numpy array of n weights.
    """
    dataset = numpy.array(data)
    m, n = numpy.shape(dataset)
    weights = ones(n)
    for i in range(cycles):
        # range() is immutable on Python 3 and this loop deletes visited
        # indices, so materialize a real list first (the original passed
        # range(m) directly, which breaks under Python 3).
        dataindex = list(range(m))
        for j in range(m):
            x = int(random.uniform(0, len(dataindex)))
            alpha = 4.0 / (1.0 + i + j) + 0.01  # decaying learning rate
            h = sigmoid(sum(dataset[dataindex[x]] * weights))
            error = lavels[dataindex[x]] - h
            weights = weights + alpha * error * dataset[dataindex[x]]
            del dataindex[x]  # sample without replacement within a pass
    return weights
#Plot the data points of testSet.txt and the fitted decision boundary.
def plotBestFit(weights):
    """Scatter the two classes and draw the line w0 + w1*x + w2*y = 0,
    i.e. the points where sigmoid(..) = 0.5 (the decision boundary)."""
    import matplotlib.pyplot as plt
    datamat, lavels = loadDataSet()
    points = numpy.array(datamat)
    pos_x, pos_y = [], []
    neg_x, neg_y = [], []
    # Split the samples by label; columns 1 and 2 are the plot coordinates.
    for row, label in zip(points, lavels):
        if label == 1:
            pos_x.append(row[1])
            pos_y.append(row[2])
        else:
            neg_x.append(row[1])
            neg_y.append(row[2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(pos_x, pos_y, s = 30, c = 'red', marker = 's')
    ax.scatter(neg_x, neg_y, s = 30, c = 'green')
    xs = arange(-3.0, 3.0, 0.1)
    # Solve w0 + w1*x + w2*y = 0 for y.
    ys = (-weights[0] - weights[1] * xs) / weights[2]
    ax.plot(xs, ys)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
#Classify one sample with the learned weights.
def logisticType(inX, weights):
    """Return 1.0 when sigmoid(inX . weights) exceeds 0.5, else 0.0."""
    probability = sigmoid(sum(inX * weights))
    return 1.0 if probability > 0.5 else 0.0
#测试函数
def testLogistic():
ftrain = open('horseColicTraining.txt')
ftest = open('horseColicTest.txt')
trainset = []
trainlavels = []
for i in ftrain.readlines():
line = i.strip().split('\t')
linearr = []
for i in range(21):
linearr.append(float(line[i]))
trainset.append(linearr)
trainlavels.append(float(line[21]))
trainwei = newGradUp(numpy.array(trainset), trainlavels, 500)
errorcount = 0
numtest = 0
for linet in ftest.readlines():
numtest += 1
linetest = linet.strip().split('\t')
larr = []
for i in range(21):
larr.append(float(linetest[i]))
if int(logisticType(larr, trainwei)) != int(linetest[21]):
errorcount += 1
errorrate = (float(errorcount) / numtest)
print "error rate is %f"%errorrate
return errorrate
#计算平均错误率
def aveErrorRate():
numtest = 10
errorsum = 0
for i in range(numtest):
errorsum += testLogistic()
print "the average error rate is %f"%(float(errorsum) / numtest)