1、欧氏距离公式,并排序
# k-NN distance step: Euclidean distance from query vector X to every row
# of dataSet, then indices sorted by ascending distance.
# NOTE(review): assumes numpy-style `tile` plus `X`, `dataSet`, and
# `dataSize` (presumably dataSet's row count) are defined elsewhere -- confirm.
diffMat = tile(X, (dataSize, 1)) -dataSet
# Square each per-feature difference, sum across columns, then sqrt.
sqDiffMat = diffMat**2
sqDistances = sqDiffMat.sum(axis = 1)
distances = sqDistances**0.5
# Indices of the training rows, nearest first.
SortedDistIndicies = distances.argsort()
2、矩阵元素归一化:
# Min-max normalize each column of dataSet into [0, 1]:
#   normValue = (value - columnMin) / (columnMax - columnMin)
# NOTE(review): assumes a numpy array `dataSet` is already defined in scope.
minValue = dataSet.min(0)        # per-column minimum (axis 0)
maxValue = dataSet.max(0)        # per-column maximum (axis 0)
# BUGFIX: renamed from `range`, which shadowed the builtin.
ranges = maxValue - minValue
m = dataSet.shape[0]             # number of rows (samples)
# (Removed a dead `normDataSet = zeros(...)` that was immediately overwritten.)
normDataSet = dataSet - tile(minValue, (m, 1))
normDataSet = normDataSet / tile(ranges, (m, 1))
3、计算信息熵
信息熵的定义与计算如下:
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of a data set.

    Each element of dataSet is a feature vector whose LAST item is the
    class label.  Entropy H = -sum(p * log2(p)) over the label
    probabilities p; higher entropy means a more mixed data set.

    Returns 0.0 for an empty data set (no labels, empty sum).
    """
    numEntries = len(dataSet)
    labelCounts = {}
    # Tally how many records carry each label.
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # BUGFIX: original incremented `labelCount` (undefined); it must be
        # labelCounts.  dict.get collapses the membership-test-then-init idiom.
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        # BUGFIX: original mixed up `prod`/`prob` (prob was never defined).
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
4、梯度上升优化算法
def gradAscent(dataMatIn, classLabels):
    """Batch gradient ascent for logistic regression.

    Args:
        dataMatIn: 2-D list/array of m samples with n features each.
        classLabels: sequence of m class labels (0 or 1).

    Returns:
        numpy matrix of shape (n, 1) holding the learned weights.

    NOTE(review): relies on a `sigmoid` function defined elsewhere in
    the file -- confirm it is 1 / (1 + exp(-x)).
    """
    dataMatrix = mat(dataMatIn)
    labelMat = mat(classLabels).transpose()  # m x 1 column vector of labels
    m, n = shape(dataMatrix)
    alpha = 0.001        # learning rate
    maxCycles = 500      # fixed number of full-batch iterations
    # BUGFIX: original initialized `weight` but every update reads `weights`.
    weights = ones((n, 1))
    for k in range(maxCycles):
        # BUGFIX: original misspelled dataMatrix as `dataMatric`.
        h = sigmoid(dataMatrix * weights)    # m x 1 predictions
        error = (labelMat - h)               # m x 1 residuals
        # Step the weights along the gradient of the log-likelihood.
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights
5、随机梯度上升算法
不设置迭代次数,每次用一个样本点进行更新
def stocGradAscent0(dataMatrix, classLabels):
    """Stochastic gradient ascent: one pass, one sample per update.

    Unlike gradAscent, this makes a single sweep over the data and
    updates the weights after every individual sample.

    Args:
        dataMatrix: numpy array of shape (m, n).
        classLabels: sequence of m class labels (0 or 1).

    Returns:
        numpy array of n weights.

    NOTE(review): relies on a `sigmoid` function defined elsewhere in
    the file -- confirm it is 1 / (1 + exp(-x)).
    """
    # BUGFIX: original misspelled `shape` as `shap`.
    m, n = shape(dataMatrix)
    alpha = 0.01        # fixed learning rate
    weights = ones(n)
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))  # scalar prediction
        error = classLabels[i] - h                 # scalar residual
        weights = weights + alpha * error * dataMatrix[i]
    return weights
6、改进的梯度上升算法
改进:1、alpha每次迭代都会调整,缓解数据波动或者高频波动
2、在--2处,通过随机取样来更新回归参数,然后删除该值,减少周期波动性。
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    Improvements over stocGradAscent0:
      1. alpha decays each update (with a +0.01 floor so it never hits 0),
         damping oscillation as training proceeds.
      2. Within each pass, samples are drawn in random order WITHOUT
         replacement (the --2 markers in the notes), reducing periodic
         fluctuation in the learned weights.

    Args:
        dataMatrix: numpy array of shape (m, n).
        classLabels: sequence of m class labels (0 or 1).
        numIter: number of full passes over the data (default 150).

    Returns:
        numpy array of n weights.

    NOTE(review): relies on a `sigmoid` function defined elsewhere in
    the file -- confirm it is 1 / (1 + exp(-x)).
    """
    m, n = shape(dataMatrix)
    # BUGFIX: original initialized `weight` but every update reads `weights`.
    weights = ones(n)
    # BUGFIX: original looped `for j in range(m)`, silently ignoring numIter.
    for j in range(numIter):
        # list() so entries can be deleted (range objects are immutable).
        dataIndex = list(range(m))
        for i in range(m):
            # Decaying step size with a 0.01 floor.
            alpha = 4 / (1.0 + i + j) + 0.01
            # --2: pick a random index among the samples not yet used
            # this pass, and remove it after the update.
            randIndex = int(random.uniform(0, len(dataIndex)))
            # BUGFIX: original used dataMatrix[randIndex] directly, so the
            # deletion below never affected WHICH samples were drawn --
            # indirect through dataIndex to truly sample without replacement.
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del dataIndex[randIndex]  # --2
    return weights