scikit-learn中的kNN
对于kNN来说,训练集就是模型
机器学习的流程:
训练集 -> 拟合(fit)-> 模型 -> 预测(predict)
# 引入库,名字挺长的,不好记
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
# Raw samples as a plain Python list; each entry holds the two features of one sample.
row_data_X = [
    [3.3935, 2.3312],
    [3.1101, 1.7815],
    [1.3438, 3.3684],
    [3.5823, 4.6792],
    [2.2804, 2.8670],
    [7.4234, 4.6965],
    [5.7451, 3.5340],
    [9.1722, 2.5111],
    [7.7928, 3.4241],
    [7.9398, 0.7916],
]
# Labels: 1 = benign tumor, 0 = malignant tumor.
row_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

# Convert the raw lists into NumPy arrays.
X_train, y_train = np.array(row_data_X), np.array(row_data_y)

# The query point to classify.
x = np.array([8.0936, 3.3657])

# fit() returns the estimator itself, so construction and training are chained.
kNN_classifier = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# predict() requires a 2-D array, so the single point gets a leading axis.
kNN_classifier.predict(x[np.newaxis, :])
array([1])
重新封装我们自己的kNN
# 封装上一节的程序
import numpy as np
from collections import Counter
from math import sqrt
class My_kNN_Classifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    For kNN the training set itself is the model: ``fit`` only stores the
    data, and all the work happens in ``predict``.
    """

    def __init__(self, k):
        """Initialize the kNN classifier.

        Args:
            k: Number of nearest neighbours that take part in the vote (>= 1).

        Raises:
            AssertionError: If ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        # These must be the same private names that fit() and predict() use;
        # otherwise predict() on an unfitted classifier fails with
        # AttributeError instead of its "must be fit" assertion.
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Train the classifier by storing X_train and y_train.

        Args:
            X_train: 2-D array with one row per sample.
            y_train: 1-D array with one label per row of ``X_train``.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: If the sample counts differ or there are fewer
                than ``k`` training samples.
        """
        assert X_train.shape[0] == y_train.shape[0], "The size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], "the size of X_train must be at least k."
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of X_predict.

        Args:
            X_predict: 2-D array with the same number of columns as the
                training data.

        Returns:
            A NumPy array holding one predicted label per input row.

        Raises:
            AssertionError: If the classifier has not been fitted, or the
                feature count does not match the training data.
        """
        assert self._X_train is not None and self._y_train is not None, \
            "must be fit before prediction!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature of X_predict musst be equal to X_train"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of a single sample x."""
        # Euclidean distance from x to every training row in one vectorized step.
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
        nearests = np.argsort(distances)
        # Labels of the k closest training samples.
        top_K = self._y_train[nearests[:self.k]]
        votes = Counter(top_K)
        # The most frequent label among the k neighbours wins.
        return votes.most_common(1)[0][0]
import numpy as np
from collections import Counter
from math import sqrt
# predict() iterates over rows, so wrap the single query point in a one-row 2-D array.
X_predict = x[np.newaxis, :]
# fit() returns the classifier itself, so construction and fitting can be chained.
kNN = My_kNN_Classifier(k=6).fit(X_train, y_train)
kNN.predict(X_predict)
array([1])
train_test_split(训练集测试集分离)
乱序化过程中,X和y是分开存储的,但二者按行一一对应,所以不能分别对它们独立地随机化,否则会丢失对应关系。
方式一:可以先将X和y合并成一个矩阵,再对矩阵进行随机化处理,处理完再拆分开来。
方式二:对所有元素的M个索引进行乱序处理。
这里采用方式二。
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the iris dataset that ships with sklearn.
iris = datasets.load_iris()
# X: feature matrix (one row per sample, one column per feature);
# y: the label of each sample.
X, y = iris.data, iris.target
# Show the dimensions of the dataset.
print("X: \n", X.shape, '\n y: \n', y.shape)
X:
(150, 4)
y:
(150,)
# Shuffle the row indices (not the data), so X and y stay aligned row by row.
shuffle_index = np.random.permutation(X.shape[0])

# Fraction of the samples held out for testing.
test_radio = 0.2
test_size = int(X.shape[0] * test_radio)

# The first test_size shuffled indices form the test set, the rest the training set.
test_index, train_index = np.split(shuffle_index, [test_size])

# Build the train and test datasets from the index arrays.
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]