Experiment with k-Nearest Neighbor
import numpy as np
import matplotlib.pyplot as plt
def getDataSet(filename):
    # Read a whitespace-separated data file into a float64 array;
    # each row holds the features followed by the label.
    dataSet = []
    with open(filename) as fr:
        for line in fr:
            listFromLine = line.strip().split()
            dataSet.append(listFromLine)
    dataSet = np.asarray(dataSet, dtype='float64')
    return dataSet
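If the files really are plain whitespace-separated numeric rows (as assumed above), np.loadtxt gives the same array in one line:
# equivalent shortcut, assuming purely numeric columns
dataSet = np.loadtxt('hw4_train.dat.txt')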
# test getDataSet()
trainDataSet = getDataSet('hw4_train.dat.txt')
print("trainDataSet's shape is ", trainDataSet.shape)
print("trainDataSet[:5] is\n", trainDataSet[:5])
testDataSet = getDataSet('hw4_test.dat.txt')
print("testDataSet's shape is ", testDataSet.shape)
print("testDataSet[:5] is\n", testDataSet[:5])
def KNN(trainDataSet, data, k):
    # data holds the features followed by the label; only the features are used.
    dataSetSize = trainDataSet.shape[0]
    diff = np.tile(data[:-1], (dataSetSize, 1)) - trainDataSet[:, :-1]
    sqDiff = diff ** 2
    sqDistances = sqDiff.sum(axis=1)
    distance = sqDistances ** 0.5
    sortedDistIndicies = distance.argsort()
    # Sum the +/-1 labels of the k nearest neighbors; the sign is the majority vote.
    classCount = 0.0
    for i in range(k):
        classCount += trainDataSet[sortedDistIndicies[i]][-1]
    if classCount > 0:
        return 1
    else:
        return -1
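np.tile is not strictly needed here; NumPy broadcasting yields the same squared distances. A one-line sketch with the names used above:
sqDistances = ((data[:-1] - trainDataSet[:, :-1]) ** 2).sum(axis=1)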
def cntErrRate(trainDataSet, dataSet, k):
    # Fraction of points in dataSet that KNN misclassifies.
    N = dataSet.shape[0]
    errCnt = 0.0
    for i in range(N):
        preLabel = KNN(trainDataSet, dataSet[i], k)
        if preLabel != dataSet[i][-1]:
            errCnt += 1
    return errCnt / float(N)
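Only the k smallest distances matter, so the full argsort inside KNN could be replaced by np.argpartition, which runs in linear time; a sketch, assuming k < N:
nearest = np.argpartition(distance, k)[:k]  # k closest indices, in no particular order
classCount = trainDataSet[nearest, -1].sum()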
11 & 12, 计算Ein和Eout
einList = []
eoutList = []
K = [1, 3, 5, 7, 9]
for k in K:
    einList.append(cntErrRate(trainDataSet, trainDataSet, k))
    eoutList.append(cntErrRate(trainDataSet, testDataSet, k))
# plot Ein(gk-nbor) and Eout(gk-nbor)
plt.plot(K, einList, label='Ein(gk-nbor)')
plt.plot(K, eoutList, label='Eout(gk-nbor)')
plt.xlabel('k')
plt.ylabel('error rate')
plt.legend()
13&14
# 13
def gUniform(trainDataSet, data, gamma):
    # Uniform RBF classifier: g(x) = sign(sum_n y_n * exp(-gamma * ||x - x_n||^2)).
    dataSetSize = trainDataSet.shape[0]
    diff = np.tile(data[:-1], (dataSetSize, 1)) - trainDataSet[:, :-1]
    sqDiff = diff ** 2
    sqDistances = sqDiff.sum(axis=1)
    exp = np.exp(-gamma * sqDistances)
    yTimesExp = trainDataSet[:, -1] * exp
    total = yTimesExp.sum()  # avoid shadowing the built-in sum
    if total > 0:
        return 1
    else:
        return -1
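Scoring one test point at a time is slow; if SciPy is available, every test point can be scored at once with cdist. A sketch for a given gamma, assuming the same column layout (features, then label):
from scipy.spatial.distance import cdist
# (N_test, N_train) matrix of squared Euclidean distances
sq = cdist(testDataSet[:, :-1], trainDataSet[:, :-1], 'sqeuclidean')
votes = np.exp(-gamma * sq) @ trainDataSet[:, -1]
preLabels = np.where(votes > 0, 1, -1)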
def gUniErrCnt(trainDataSet, dataSet, gamma):
    # Error rate of the uniform RBF classifier on dataSet.
    N = dataSet.shape[0]
    errCnt = 0
    for i in range(N):
        preLabel = gUniform(trainDataSet, dataSet[i], gamma)
        if preLabel != dataSet[i][-1]:
            errCnt += 1
    errRate = errCnt / float(N)
    return errRate
gammaList = [0.001, 0.1, 1, 10, 100]
log10GammaList = np.log10(gammaList)
EinGUniform = []
EoutGUniform = []
for gamma in gammaList:
    EinGUniform.append(gUniErrCnt(trainDataSet, trainDataSet, gamma))
    EoutGUniform.append(gUniErrCnt(trainDataSet, testDataSet, gamma))
# plot Ein(g-uniform) and Eout(g-uniform)
plt.plot(log10GammaList, EinGUniform, label="Ein(g-uniform)")
plt.plot(log10GammaList, EoutGUniform, label="Eout(g-uniform)")
plt.xlabel("log10 gamma")
plt.ylabel("error rate")
plt.legend()
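Note: if these cells are run as one plain script rather than in a notebook, consecutive plt.plot calls draw onto the same axes. A minimal fix, repeated around each plot block:
plt.figure()  # open a fresh figure before the plt.plot calls
plt.show()    # render it after plt.legend()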
Experiment with k-Means
trainDataSet = getDataSet('hw4_nolabel_train.dat.txt')
def updBelong(dataSet, centers, k):
    # Assign every point to its closest center.
    belongs = []
    N = dataSet.shape[0]
    for i in range(N):
        closestIndex, minSqDistance = cntMinDistance(dataSet[i], centers, k)
        belongs.append(closestIndex)
    return np.array(belongs)
def cntMinDistance(data, centers, k):
    # Squared distance from one point to every center; return the index of
    # the closest center and its squared distance (k is unused but kept
    # for a uniform signature).
    diff = np.tile(data, (centers.shape[0], 1)) - centers
    sqDiff = diff ** 2
    sqDistance = sqDiff.sum(axis=1)
    miniIndex = sqDistance.argmin()
    return miniIndex, sqDistance[miniIndex]
def updCenter(dataSet, belongs, centers, k):
    # Recompute each center as the mean of the points assigned to it.
    # A cluster that lost all its points keeps a zero center (original behavior).
    N = dataSet.shape[0]
    newCenterDict = {}  # cluster index -> number of assigned points
    newCenters = np.zeros(centers.shape)
    for i in range(N):
        if belongs[i] not in newCenterDict:
            newCenterDict[belongs[i]] = 1
        else:
            newCenterDict[belongs[i]] += 1
    for key in newCenterDict:
        for i in range(N):
            if belongs[i] == key:
                newCenters[key] += dataSet[i]
        newCenters[key] /= float(newCenterDict[key])
    return newCenters
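The double loop over keys and points can be collapsed with boolean masking. A sketch of a hypothetical updCenterVec that, unlike the version above, keeps an empty cluster at its old position instead of zeroing it:
def updCenterVec(dataSet, belongs, centers, k):
    newCenters = centers.copy()
    for j in range(k):
        members = dataSet[belongs == j]  # points assigned to cluster j
        if len(members) > 0:
            newCenters[j] = members.mean(axis=0)
    return newCenters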
def cmpCenter(arr1, arr2):
    # True if the centers changed since the last iteration.
    return not np.array_equal(arr1, arr2)
def kMeans(dataSet, k):
    N = dataSet.shape[0]
    # Pick k distinct points as the initial centers (randint may repeat
    # indices; sampling without replacement avoids duplicate centers).
    index = np.random.choice(N, k, replace=False)
    centers = dataSet[index]                              # coordinates of the cluster centers
    belongs = updBelong(dataSet, centers, k)              # cluster index of every point
    newCenters = updCenter(dataSet, belongs, centers, k)  # updated centers
    # num counts iterations until convergence, capped to guard against an endless loop
    num = 0
    while cmpCenter(newCenters, centers):
        num += 1
        belongs = updBelong(dataSet, newCenters, k)
        centers = newCenters
        newCenters = updCenter(dataSet, belongs, centers, k)
        if num == 1000:
            break
    return centers, belongs, num
# test kMeans()
finalCenters, finalBelongs, num = kMeans(trainDataSet, 3)
print("finalCenters\n", finalCenters)
print("finalBelongs\n", finalBelongs)
print("num: ", num)
def errorCnt(trainDataSet, centers, belongs):
    # Average squared distance of every point to its assigned center (the k-means Ein).
    err = 0
    uniqueBelongs = list(set(belongs))
    N = trainDataSet.shape[0]
    for key in uniqueBelongs:
        for i in range(N):
            if belongs[i] == key:
                diff = trainDataSet[i] - centers[key]
                sqDiff = diff ** 2
                sqDistance = sqDiff.sum()
                err += sqDistance
    return err / N
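Fancy indexing collapses the two loops into one expression; a one-line sketch of the same quantity:
# centers[belongs] pairs each point with its assigned center
err = ((trainDataSet - centers[belongs]) ** 2).sum(axis=1).mean()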
def cntEinVar(einList, average):
    # Population variance of the Ein values around the given average.
    sqDiffSum = 0
    N = len(einList)
    for i in range(N):
        diff = einList[i] - average
        sqDiff = diff ** 2
        sqDiffSum += sqDiff
    einVar = sqDiffSum / float(N)
    return einVar
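Given that average is the mean of einList, np.var computes the same population variance directly:
einVar = np.var(einList)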
einAverList = []
einVarList = []
kList = [2, 4, 6, 8, 10]
T = 500
for k in kList:
    einList = []  # reset per k so the variance covers only this k's T runs
    einSum = 0
    for t in range(T):
        finalCenters, finalBelongs, num = kMeans(trainDataSet, k)
        ein = errorCnt(trainDataSet, finalCenters, finalBelongs)
        einList.append(ein)
        einSum += ein
    einAver = einSum / float(T)
    einVar = cntEinVar(einList, einAver)
    einAverList.append(einAver)
    einVarList.append(einVar)
# plot average of Ein over 500 experiments for k = 2, 4, 6, 8, 10
plt.plot(kList, einAverList)
plt.xlabel("k")
plt.ylabel("err")
plt.title("average of Ein over 500 experiments for k = 2, 4, 6, 8, 10")
As the plot above shows, the average Ein decreases as the number of clusters increases.
# plot variance of Ein over 500 experiments for k = 2, 4, 6, 8, 10
plt.plot(kList, einVarList)
plt.xlabel("k")
plt.ylabel("err")
plt.title("variance of Ein over 500 experiments for k = 2, 4, 6, 8, 10")
As the plot above shows, the variance of Ein increases as the number of clusters increases.