1. The kNN model in scikit-learn
scikit-learn provides the KNeighborsClassifier class to implement the k-nearest-neighbor classification model. Its prototype is:
sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=1, **kwargs)
Parameters
- n_neighbors: an integer specifying k.
- weights: a string or callable specifying how the neighbors' votes are weighted, i.e. whether each neighbor's vote counts equally or not (see the sketch after this list):
  - 'uniform': all neighbors of a query point get equal voting weight;
  - 'distance': a neighbor's voting weight is inversely proportional to its distance, so closer neighbors carry more weight;
  - [callable]: a callable that is passed an array of distances and returns an array of weights of the same shape.
- algorithm: a string specifying the nearest-neighbor search algorithm; one of:
  - 'ball_tree': use the BallTree algorithm, i.e. a ball tree;
  - 'kd_tree': use the KDTree algorithm;
  - 'brute': use brute-force search;
  - 'auto': automatically choose the most suitable algorithm.
- leaf_size: an integer specifying the leaf size of the BallTree or KDTree; it affects the speed of tree construction and of queries.
- metric: a string specifying the distance metric; the default is the 'minkowski' distance.
- p: an integer specifying the exponent of the 'minkowski' distance.
- n_jobs: the degree of parallelism. The default is 1; setting it to -1 dispatches the work to all of the machine's CPUs.
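As a quick illustration of the weights parameter, here is a minimal sketch that passes a callable; the Gaussian kernel and its bandwidth h are my own choice for illustration, not anything prescribed by the API:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def gaussian_weights(distances):
    # receives an array of distances and must return an array of weights
    # of the same shape; closer neighbors get larger weights
    h = 1.0  # bandwidth, chosen arbitrarily for illustration
    return np.exp(-distances ** 2 / (2 * h ** 2))

clf = KNeighborsClassifier(n_neighbors=5, weights=gaussian_weights)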
Methods
- fit(X, y): train the model.
- predict(X): predict with the model; returns the predicted labels of the given samples.
- score(X, y): return the prediction accuracy on (X, y).
- predict_proba(X): return the probability of each label for every sample.
- kneighbors([X, n_neighbors, return_distance]): return the k nearest neighbors of the sample points; if return_distance=True, also return the distances to those neighbors.
- kneighbors_graph([X, n_neighbors, mode]): return the connectivity graph of the sample points. (A short usage sketch of the two query methods follows.)
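A minimal sketch of the query methods, using the iris data purely as a stand-in:

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
clf = KNeighborsClassifier(n_neighbors=3).fit(iris.data, iris.target)
# distances to, and indices of, the 3 nearest training points of one sample
dist, ind = clf.kneighbors(iris.data[:1], n_neighbors=3, return_distance=True)
# sparse graph whose entry (i, j) is 1 if j is among i's nearest neighbors
graph = clf.kneighbors_graph(iris.data[:2], n_neighbors=3, mode='connectivity')
print(dist, ind)
print(graph.toarray())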
2. Practicing kNN in Python
Code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split

# load the classification data
def load_data():
    digits = datasets.load_digits()
    return train_test_split(digits.data, digits.target, test_size=0.25,
                            random_state=0, stratify=digits.target)

# generate the regression data
def create_regression_data(n):
    X = 5 * np.random.rand(n, 1)
    y = np.sin(X).ravel()
    # add some noise to every fifth sample
    y[::5] += 1 * (0.5 - np.random.rand(int(n / 5)))
    return train_test_split(X, y, test_size=0.25, random_state=0)

# test the classifier
def test_KNeighbors(*data):
    X_train, X_test, y_train, y_test = data
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)
    print('Training Score:', clf.score(X_train, y_train))
    print('Testing Score:', clf.score(X_test, y_test))

X_train, X_test, y_train, y_test = load_data()
test_KNeighbors(X_train, X_test, y_train, y_test)
'''
Output:
Training Score: 0.991091314031
Testing Score: 0.98
'''
# test the effect of k and of the voting weights on the results
def test_K_Weights(*data):
    X_train, X_test, y_train, y_test = data
    Ks = np.linspace(1, y_train.size, num=100, endpoint=False, dtype='int')
    weights = ['uniform', 'distance']
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    for weight in weights:
        training_scores = []
        testing_scores = []
        for K in Ks:
            clf = neighbors.KNeighborsClassifier(weights=weight, n_neighbors=K)
            clf.fit(X_train, y_train)
            training_scores.append(clf.score(X_train, y_train))
            testing_scores.append(clf.score(X_test, y_test))
        ax.plot(Ks, testing_scores, label='testing score:weight=%s' % weight)
        ax.plot(Ks, training_scores, label='training score:weight=%s' % weight)
    ax.legend(loc='best')
    ax.set_xlabel("K")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.05)
    ax.set_title("KNeighborsClassifier")
    plt.show()

test_K_Weights(X_train, X_test, y_train, y_test)
# You can also look at how different values of p affect the results; the
# differences appear minor, with nearly identical scores (see the sketch below).
# The plot above has to be produced by actually running the code.
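# A minimal sketch (my addition, reusing the digits split from load_data above)
# to compare Minkowski exponents: p=1 is the Manhattan distance, p=2 the
# Euclidean distance.
def test_p(*data):
    X_train, X_test, y_train, y_test = data
    for p in [1, 2, 3]:
        clf = neighbors.KNeighborsClassifier(p=p)
        clf.fit(X_train, y_train)
        print('p=%d Testing Score: %s' % (p, clf.score(X_test, y_test)))

test_p(X_train, X_test, y_train, y_test)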
# test the kNN regressor
def test_KNeighborsRegressor(*data):
    X_train, X_test, y_train, y_test = data
    regr = neighbors.KNeighborsRegressor()
    regr.fit(X_train, y_train)
    print('Training score:', regr.score(X_train, y_train))
    print('Testing score:', regr.score(X_test, y_test))
    # print('kneighbors_graph:', regr.kneighbors_graph(X_test, n_neighbors=5, mode='distance'))

# n=1000 is my choice here; the sample size behind the quoted output is not shown
X_train, X_test, y_train, y_test = create_regression_data(1000)
test_KNeighborsRegressor(X_train, X_test, y_train, y_test)
'''
Output:
Training score: 0.979070856523
Testing score: 0.951660029435
'''
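To see the fit itself rather than just the scores, here is a minimal sketch (my addition) that plots the regressor's predictions over the noisy sine data:

def plot_KNeighborsRegressor(*data):
    X_train, X_test, y_train, y_test = data
    regr = neighbors.KNeighborsRegressor()
    regr.fit(X_train, y_train)
    # predict on a dense grid to draw a smooth curve over the training points
    grid = np.linspace(0, 5, 500).reshape(-1, 1)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(X_train, y_train, label='training data')
    ax.plot(grid, regr.predict(grid), 'r-', label='prediction')
    ax.legend(loc='best')
    ax.set_xlabel("X")
    ax.set_ylabel("y")
    ax.set_title("KNeighborsRegressor")
    plt.show()

plot_KNeighborsRegressor(X_train, X_test, y_train, y_test)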
3. Reading the kNN source code in scikit-learn
Since scikit-learn itself is a large and complex machine-learning library, quite a few relationships need to be untangled before looking at the kNN code itself. I found three clearly written articles on this:
https://blog.csdn.net/u014688145/article/details/61916582
https://blog.csdn.net/u014688145/article/details/62424762
https://blog.csdn.net/u014688145/article/details/64442996
Based on these three articles, I still want to summarize the structural relationships inside the neighbors package:
- Classification call chain (diagram omitted)
- Classification prediction and scoring (diagram omitted)
- Regression call chain (diagram omitted)
- Regression prediction and scoring (diagram omitted)
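In words, my rough reading of those four chains from the source below (an informal outline, not an official diagram):
- Classification call chain: KNeighborsClassifier.fit -> SupervisedIntegerMixin.fit -> NeighborsBase._fit, which stores the data or builds a BallTree/KDTree.
- Classification prediction and scoring: predict/predict_proba -> KNeighborsMixin.kneighbors, followed by a (weighted) majority vote; score is inherited from ClassifierMixin (accuracy).
- Regression call chain: KNeighborsRegressor.fit -> SupervisedFloatMixin.fit -> NeighborsBase._fit.
- Regression prediction and scoring: predict -> KNeighborsMixin.kneighbors, followed by a (weighted) mean of the neighbors' targets; score is inherited from RegressorMixin (R^2).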
KNeighborsClassifier lives in classification.py under the neighbors package. The (excerpted) source is as follows:
"""最近邻分类"""
import numpy as np
from scipy import stats
from ..utils.extmath import weighted_mode
from .base import \
_check_weights, _get_weights, \
NeighborsBase, KNeighborsMixin,\
RadiusNeighborsMixin, SupervisedIntegerMixin
from ..base import ClassifierMixin
from ..utils import check_array
class KNeighborsClassifier(NeighborsBase, KNeighborsMixin,
SupervisedIntegerMixin, ClassifierMixin):
"""kNN分类使用的是投票法实现的.后面实际上给了一大堆的参数解释,翻译过来就在第一部分,我已经写出来了~~
当然也要注意,不止这一个算法实现,还有下面的:
--------
RadiusNeighborsClassifier
KNeighborsRegressor
RadiusNeighborsRegressor
NearestNeighbors
"""
    # constructor
    def __init__(self, n_neighbors=5,
                 weights='uniform', algorithm='auto', leaf_size=30,
                 p=2, metric='minkowski', metric_params=None, n_jobs=1,
                 **kwargs):
        super(KNeighborsClassifier, self).__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size, metric=metric, p=p,
            metric_params=metric_params,
            n_jobs=n_jobs, **kwargs)
        self.weights = _check_weights(weights)
    def predict(self, X):
        X = check_array(X, accept_sparse='csr')
        # find the k nearest neighbors of every query point
        neigh_dist, neigh_ind = self.kneighbors(X)
        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]
        n_outputs = len(classes_)
        n_samples = X.shape[0]
        weights = _get_weights(neigh_dist, self.weights)
        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
        for k, classes_k in enumerate(classes_):
            if weights is None:
                # uniform weights: plain majority vote over the neighbor labels
                mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
            else:
                # distance or custom weights: weighted majority vote
                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
            mode = np.asarray(mode.ravel(), dtype=np.intp)
            y_pred[:, k] = classes_k.take(mode)
        if not self.outputs_2d_:
            y_pred = y_pred.ravel()
        return y_pred
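    # To make the voting concrete (my own illustrative numbers, not part of
    # the source): for one row of neighbor labels [2, 2, 1, 1, 1] with
    # weights [3, 3, 1, 1, 1],
    #   stats.mode picks 1 (three unweighted votes beat two), while
    #   weighted_mode picks 2 (total weight 6 beats 3),
    # which is exactly the difference between weights='uniform' and
    # weights='distance'.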
    def predict_proba(self, X):
        X = check_array(X, accept_sparse='csr')
        neigh_dist, neigh_ind = self.kneighbors(X)
        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]
        n_samples = X.shape[0]
        weights = _get_weights(neigh_dist, self.weights)
        if weights is None:
            weights = np.ones_like(neigh_ind)
        all_rows = np.arange(X.shape[0])
        probabilities = []
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            proba_k = np.zeros((n_samples, classes_k.size))
            # a simple ':' index doesn't work right
            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
                proba_k[all_rows, idx] += weights[:, i]
            # normalize 'votes' into real [0, 1] probabilities
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer
            probabilities.append(proba_k)
        if not self.outputs_2d_:
            probabilities = probabilities[0]
        return probabilities
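To make the vote normalization concrete, here is a small standalone sketch (my own made-up numbers) that mirrors the accumulation loop above for a single query point with two classes:

import numpy as np

# three neighbors with labels 0, 1, 1 and distance-based weights
neigh_labels = np.array([0, 1, 1])
weights = np.array([0.5, 0.25, 0.25])

votes = np.zeros(2)
for label, w in zip(neigh_labels, weights):
    votes[label] += w         # accumulate weighted votes per class
proba = votes / votes.sum()   # normalize votes into probabilities
print(proba)                  # [0.5 0.5]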
KNeighborsRegressor lives in regression.py under the neighbors package. The (excerpted) source is as follows:
"""最近邻回归"""
import warnings
import numpy as np
from scipy.sparse import issparse
from .base import _get_weights, _check_weights, NeighborsBase, KNeighborsMixin
from .base import RadiusNeighborsMixin, SupervisedFloatMixin
from ..base import RegressorMixin
from ..utils import check_array
class KNeighborsRegressor(NeighborsBase, KNeighborsMixin,
SupervisedFloatMixin,
RegressorMixin):
"""基于k近邻的回归"""
def __init__(self, n_neighbors=5, weights='uniform',
algorithm='auto', leaf_size=30,
p=2, metric='minkowski', metric_params=None, n_jobs=1,
**kwargs):
super(KNeighborsRegressor, self).__init__(
n_neighbors=n_neighbors,
algorithm=algorithm,
leaf_size=leaf_size, metric=metric, p=p,
metric_params=metric_params, n_jobs=n_jobs, **kwargs)
self.weights = _check_weights(weights)
    def predict(self, X):
        if issparse(X) and self.metric == 'precomputed':
            raise ValueError(
                "Sparse matrices not supported for prediction with "
                "precomputed kernels. Densify your matrix."
            )
        X = check_array(X, accept_sparse='csr')
        neigh_dist, neigh_ind = self.kneighbors(X)
        weights = _get_weights(neigh_dist, self.weights)
        _y = self._y
        if _y.ndim == 1:
            _y = _y.reshape((-1, 1))
        if weights is None:
            # uniform weights: plain average of the neighbors' targets
            y_pred = np.mean(_y[neigh_ind], axis=1)
        else:
            # weighted average of the neighbors' targets
            y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)
            denom = np.sum(weights, axis=1)
            for j in range(_y.shape[1]):
                num = np.sum(_y[neigh_ind, j] * weights, axis=1)
                y_pred[:, j] = num / denom
        if self._y.ndim == 1:
            y_pred = y_pred.ravel()
        return y_pred
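The two prediction branches in a standalone sketch (made-up numbers): with uniform weights the prediction is the plain mean of the neighbors' targets; with weights it is a weighted mean:

import numpy as np

neigh_y = np.array([1.0, 2.0, 4.0])   # targets of the k nearest neighbors
weights = np.array([4.0, 1.0, 1.0])   # e.g. inverse distances

y_uniform = neigh_y.mean()                                # 'weights is None' branch
y_weighted = np.sum(neigh_y * weights) / np.sum(weights)  # weighted branch
print(y_uniform, y_weighted)          # 2.333... and 1.666...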
The source above is not complete, only excerpts. And sure enough, when reading source code you should not dive too deep into the details at first, or you just get tangled up; build the big framework first, and the pieces become much easier to sort out.