KNN
from sklearn.model_selection import train_test_split
from sklearn import datasets
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance."""

    def __init__(self, k):
        # k is the number of neighbours consulted per prediction; must be >= 1.
        assert k >= 1
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Memorise the training set (lazy learner); returns self for chaining."""
        assert X_train.shape[0] == y_train.shape[0]
        assert self.k <= X_train.shape[0]
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for each row of X_predict; returns a 1-D ndarray."""
        assert self._X_train is not None and self._y_train is not None
        assert X_predict.shape[1] == self._X_train.shape[1]
        return np.array([self._predict(sample) for sample in X_predict])

    def _predict(self, x):
        """Predict the label of one sample x by majority vote of its k neighbours."""
        assert x.shape[0] == self._X_train.shape[1]
        # Euclidean distance from x to every stored training row, computed
        # with broadcasting (same arithmetic as a per-row sqrt-of-squares loop).
        gaps = self._X_train - x
        dists = np.sqrt((gaps ** 2).sum(axis=1))
        # Indices of the k closest rows, nearest first.
        closest = np.argsort(dists)[:self.k]
        # Majority vote; Counter.most_common breaks ties by first appearance,
        # i.e. the nearest of the tied labels wins.
        ballot = Counter(self._y_train[i] for i in closest)
        return ballot.most_common(1)[0][0]

    def __repr__(self):
        return "KNN(k=%d)" % self.k
# Load scikit-learn's bundled 8x8 digits dataset (1,797 samples).
# NOTE(review): the original comment called this "MNIST", but load_digits()
# is the small 8x8-pixel dataset, not the 28x28 MNIST set used further below.
digits = datasets.load_digits()
X=digits.data
y=digits.target
# Random 75/25 train/test split; no random_state, so accuracy varies per run.
X_train, X_test, y_train, y_test = train_test_split(X,y)
knn_clf=KNNClassifier(k=3)
knn_clf.fit(X_train,y_train)
y_predict=knn_clf.predict(X_test)
# Accuracy = fraction of test samples predicted correctly.
print(sum(y_predict==y_test)/len(y_test))
PCA+KNN
from numpy import *
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import time
def loadDataSet(filedir):
    """Read a label-first MNIST CSV, PCA-reduce the pixels, and split it.

    filedir -- path to a CSV whose column 0 is the digit label and whose
               remaining columns are pixel values (see convert() below).

    Returns (train_data, test_data, train_label, test_label). The label
    arrays stay 2-D (shape (m, 1)) because callers index label[i][0].
    """
    DateSet = pd.read_csv(filedir)
    len_data = len(DateSet)  # shrink this to probe sensitivity to data volume
    # BUGFIX: .as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .to_numpy() is the supported equivalent.
    # Row 0 of the frame is skipped — presumably treated as a header row of
    # the converted CSV; TODO confirm against the convert() output format.
    Data = DateSet.iloc[1:int(len_data), 1:].to_numpy()
    # Project the raw pixel columns down to 25 whitened principal components.
    pca = PCA(n_components=25, whiten=True)
    Data = pca.fit_transform(Data)
    label = DateSet.iloc[1:int(len_data), :1].to_numpy()
    # Fixed random_state keeps the 70/30 split reproducible across runs.
    train_data, test_data, train_label, test_label = train_test_split(
        Data, label, test_size=0.3, random_state=0, shuffle=True)
    return train_data, test_data, train_label, test_label
def kNN_classify(classify_data, train_data, train_label, k):
    """Classify one sample by majority vote among its k nearest training rows.

    classify_data -- 1-D feature vector of the query sample
    train_data    -- 2-D array, one training sample per row
    train_label   -- 2-D array of shape (m, 1); label read via train_label[i][0]
    k             -- number of neighbours to consult
    """
    # Euclidean distance from the query to every training row. Broadcasting
    # replaces the original tile-based expansion; the arithmetic is identical.
    deltas = train_data - classify_data
    dists = sqrt((deltas ** 2).sum(axis=1))
    order = argsort(dists)  # ascending: nearest rows first
    # Tally the labels of the k nearest neighbours, nearest first.
    tally = {}
    for idx in order[:k]:
        lab = train_label[idx][0]
        tally[lab] = tally.get(lab, 0) + 1
    # Majority label. Strict '>' means ties keep the label that reached the
    # maximum count first (i.e. the one seen nearer the query).
    best_label = None
    best_count = 0
    for lab, cnt in tally.items():
        if cnt > best_count:
            best_label, best_count = lab, cnt
    return best_label
def train_model(train_data, test_data, train_label, test_label, k=10):
    """Classify every test sample with kNN_classify and return the accuracy.

    train_label/test_label are 2-D (m, 1) arrays, as produced by loadDataSet.
    """
    errors = 0
    # Walk the test samples and their expected labels in lockstep.
    for sample, expected in zip(test_data, test_label):
        if kNN_classify(sample, train_data, train_label, k) != expected:
            errors += 1
    total = shape(test_data)[0]
    # accuracy = fraction of test samples classified correctly
    return (total - errors) / total
# Hard-coded Windows path to the converted MNIST training CSV (produced by
# the convert() helper below) — adjust for your machine.
path="D:\\STUDYFILE\\RUN\\mnist_train.csv"
train_data, test_data, train_label, test_label = loadDataSet(path)
# Evaluate 10-nearest-neighbour classification on the held-out 30% split.
accuracy = train_model(train_data, test_data, train_label, test_label, k=10)
print(accuracy)
这里用到了 .csv 格式文件,下面是把 MNIST 原始 IDX 格式文件(images/labels)转换为 .csv 的代码。
def convert(imgf, labelf, outf, n):
    """Convert IDX-format MNIST image/label files into one CSV.

    Each output row is: label, pixel_0, ..., pixel_783 (28*28 pixels per image).

    imgf   -- path to the IDX image file (16-byte header, then raw pixel bytes)
    labelf -- path to the IDX label file (8-byte header, then raw label bytes)
    outf   -- path of the CSV file to write
    n      -- number of images to convert
    """
    # Context managers guarantee all three handles are closed even if a
    # read or write raises (the original leaked them on error), and rows are
    # written as they are built instead of accumulating all n in memory.
    with open(imgf, "rb") as f, open(labelf, "rb") as l, open(outf, "w") as o:
        f.read(16)  # skip IDX image header (magic, count, rows, cols)
        l.read(8)   # skip IDX label header (magic, count)
        for _ in range(n):
            # Label byte first, then the 784 pixel bytes, matching the
            # original row layout (iterating bytes yields ints directly).
            row = [ord(l.read(1))]
            row.extend(f.read(28 * 28))
            o.write(",".join(str(pix) for pix in row) + "\n")
# Convert the raw IDX files into CSVs: 60,000 training and 10,000 test images.
# Paths are relative to the working directory and require the MNIST/ folder
# containing the four original idx*-ubyte files.
convert("MNIST/train-images.idx3-ubyte", "MNIST/train-labels.idx1-ubyte",
        "mnist_train.csv", 60000)
convert("MNIST/t10k-images.idx3-ubyte", "MNIST/t10k-labels.idx1-ubyte",
        "mnist_test.csv", 10000)
print("Convert Finished!")