一、将所有代码都放在play_ML文件夹中
1、首先在play_ML文件夹中创建一个__init__.py空文件
2、在play_ML文件夹中创建一个kNN.py文件,文件的内容是对kNN算法的实现代码
kNN算法的实现步骤:
1) 将训练集的数据集X_train进行遍历
2) 遍历后的结果与要预测的样本进行求距离运算
3) 将所求得的距离放入一个空列表中
4) 使用np.argsort()函数对距离列表进行由小到大的排序,得到排序后的索引
5) 然后取排序后的前k个索引,从y_train中取出这些索引对应的标签值(label)
6) 用collections里的Counter类对这些标签值进行统计
7) 数量最多的那个类别即为预测样本的类别
kNN.py文件里的代码
"""coding:utf-8"""
import numpy as np
from collections import Counter
from math import sqrt
class KNNClassifier(object):
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    The classifier keeps references to the training data in ``fit`` and, for
    every sample passed to ``predict``, returns the most common label among
    the ``k`` training samples closest to it.
    """

    def __init__(self, k):
        """Initialise the classifier.

        Args:
            k: number of nearest neighbours that take part in the vote.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set ``X_train`` / ``y_train``.

        Args:
            X_train: 2-D array with one training sample per row.
            y_train: 1-D array holding the label of each row of ``X_train``.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: if ``X_train`` and ``y_train`` do not contain the
                same number of samples.
        """
        # Without this check a mismatch only shows up later, as an IndexError
        # or as silently wrong votes inside ``_predict``.
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: 2-D array with one sample per row.

        Returns:
            A NumPy array with one predicted label per row of ``X_predict``.

        Raises:
            AssertionError: if ``fit`` has not been called, or if the number
                of features differs from that of the training data.
        """
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for the single sample ``x``."""
        # Euclidean distance from x to every training sample, computed in one
        # vectorised expression instead of a Python-level loop over the rows.
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
        nearest = np.argsort(distances)
        # If k exceeds the number of training samples the slice simply
        # yields all of them.
        top_k_labels = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(top_k_labels)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return "KNN(k=%d)" % self.k
3、在play_ML文件夹中创建一个model_selection.py文件,文件的内容是对将原始数据集拆分成训练数据集和测试数据集的实现代码
将原始数据 X 和 y拆分成训练数据集和测试数据集的实现步骤:
1) 用np.random.permutation(len(X))对0到len(X)-1的样本索引进行随机排列,得到shuffled_indexes
2) 求出测试集的样本个数
3) 从shuffled_indexes中取前面与测试集样本个数相同的索引作为测试数据的索引,其余的索引作为训练数据的索引
4) 根据索引从原始数据集中得到相应的样本
model_selection.py文件里的代码
"""coding:utf-8"""
import numpy as np
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split ``X`` and ``y`` into a training part and a test part.

    Args:
        X: array of samples, indexable by an integer index array.
        y: array of labels with the same length as ``X``.
        test_ratio: fraction of the samples (between 0.0 and 1.0) that goes
            into the test set; the test size is ``int(len(X) * test_ratio)``.
        seed: optional seed passed to ``np.random.seed`` so the split is
            reproducible. ``None`` leaves the random state untouched.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        AssertionError: if ``X`` and ``y`` differ in length or ``test_ratio``
            lies outside ``[0.0, 1.0]``.
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None: a plain truthiness test would ignore seed=0.
    if seed is not None:
        np.random.seed(seed)

    # A random ordering of the indices 0 .. len(X)-1, e.g.
    # np.random.permutation(10) -> array([1, 7, 4, 3, 0, 9, 2, 5, 8, 6])
    shuffled_indexes = np.random.permutation(len(X))

    test_size = int(len(X) * test_ratio)
    test_index = shuffled_indexes[:test_size]
    train_index = shuffled_indexes[test_size:]

    X_train = X[train_index]
    y_train = y[train_index]

    X_test = X[test_index]
    y_test = y[test_index]

    return X_train, X_test, y_train, y_test
4、在play_ML文件夹中创建一个metrics.py文件,文件的内容是计算预测结果准确率的实现代码
预测结果的准确率
准确率 = 预测值与真实值相等的样本个数 / 真实值的样本总数
metrics.py文件里的代码
"""coding:utf-8"""
import numpy as np
def accuracy_score(y_test, y_predict):
    """Return the accuracy of ``y_predict`` with respect to ``y_test``.

    Accuracy is the number of positions where the predicted label equals
    the true label, divided by the number of true labels.

    Args:
        y_test: array of true labels.
        y_predict: array of predicted labels, same length as ``y_test``.

    Returns:
        The fraction of matching labels.

    Raises:
        AssertionError: if the two arrays differ in length.
    """
    # Compare the two lengths; the first predicted label (y_predict[0]) has
    # nothing to do with the size check.
    assert y_test.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to the size of y_predict"
    return np.sum(y_predict == y_test) / len(y_test)
二、用sklearn中的datasets模块里的数据进行测试
"""coding:utf-8"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import matplotlib
# Demo: run the hand-written kNN classifier on scikit-learn's digits dataset.
digits = datasets.load_digits()
# Feature data of the dataset.
X = digits.data
print(X.shape)
# Target labels of the dataset.
y = digits.target
print(y[:100])
print(X[:3])
# Pick the sample at index 666 and reshape its feature vector to 8x8 so it
# can be displayed as an image.
some_digits = X[666]
some_digits_image = some_digits.reshape(8,8)
plt.imshow(some_digits_image,cmap = matplotlib.cm.binary)
plt.show()
# Split into training and test sets using the function's default arguments
# (no seed is passed here).
from play_ML.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y)
# Fit the kNN classifier with k=3 and predict labels for the test set.
from play_ML.kNN import KNNClassifier
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train,y_train)
y_predict = my_knn_clf.predict(X_test)
# Compare the predictions with the true test labels and print the accuracy.
from play_ML.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)