1 scikit-learn中的机器学习算法封装
k近邻算法的实现:
import numpy as np
from math import sqrt
from collections import Counter
def KNN_classify(k, x_train, y_train, x):
    """Classify one sample with the k-nearest-neighbours rule.

    Args:
        k: Number of neighbours that vote; must satisfy
            1 <= k <= number of training samples.
        x_train: 2-D array of training samples, one row per sample.
        y_train: 1-D array of labels, one per row of ``x_train``.
        x: 1-D feature vector of the sample to classify.

    Returns:
        The label that occurs most often among the ``k`` training samples
        closest to ``x`` (Euclidean distance).

    Raises:
        AssertionError: If ``k`` is out of range or the shapes do not match.
    """
    assert 1 <= k <= x_train.shape[0], 'k must be valid'
    assert x_train.shape[0] == y_train.shape[0], 'the size of x_train must be equal to the size of y_train'
    assert x_train.shape[1] == x.shape[0], 'the feature number of x must be equal to x_train'

    # Broadcasting subtracts x from every training row at once, so the
    # distances are computed in a single vectorised pass.
    distances = np.sqrt(np.sum((x_train - x) ** 2, axis=1))
    nearest = np.argsort(distances)
    votes = Counter(y_train[nearest[:k]])
    return votes.most_common(1)[0][0]
接着在jupyter notebook中调用KNN_classify:
import numpy as np
import matplotlib.pyplot as plt
raw_data_x = [[3.393533211, 2.331273381],
[3.110073483, 1.781539638],
[1.343808831, 3.368360954],
[3.582294042, 4.679179110],
[2.280362439, 2.866990263],
[7.423436942, 4.696522875],
[5.745051997, 3.533989803],
[9.172168622, 2.511101045],
[7.792783481, 3.424088941],
[7.939820817, 0.791637231]]
raw_data_y = [0,0,0,0,0,1,1,1,1,1]
x_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
x = np.array([[8.09, 3.36]])
%run KNN_function/KNN.py
predict_y = KNN_classify(6, x_train, y_train, x)
predict_y
输出:
>>>1
什么是机器学习:
k近邻算法是非常特殊的,可以被认为是没有模型的算法。为了和其他算法统一,可以认为训练数据集就是模型本身。
使用scikit learn中的knn:
from sklearn.neighbors import KNeighborsClassifier
KNN_classifier = KNeighborsClassifier(n_neighbors = 6)
KNN_classifier.fit(x_train, y_train)
# predict() takes a 2-D array with one row per sample, so wrap the single
# sample x into a matrix with one row.
x_predict = x.reshape(1, -1)
y_predict = KNN_classifier.predict(x_predict)
y_predict[0]
输出:
>>>1
接下来将我们编写的knn算法整理成scikit learn的形式:
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
    """k-nearest-neighbours classifier with a scikit-learn style interface."""

    def __init__(self, k):
        """Create a classifier in which the ``k`` nearest samples vote.

        Raises:
            AssertionError: If ``k`` is smaller than 1.
        """
        assert k >= 1, 'k must be valid'
        self.k = k
        # Filled in by fit(); None marks the unfitted state.
        self._x_train = None
        self._y_train = None

    def fit(self, x_train, y_train):
        """Store the training set; kNN has no other training step.

        Args:
            x_train: 2-D array of training samples, one row per sample.
            y_train: 1-D array of labels, one per row of ``x_train``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            AssertionError: If the sizes differ or there are fewer than
                ``k`` training samples.
        """
        assert x_train.shape[0] == y_train.shape[0], \
            'the size of x_train must be equal to the size of y_train'
        assert self.k <= x_train.shape[0], 'the size of x_train must be at least k'
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, x_predict):
        """Predict a label for every row of ``x_predict``.

        Args:
            x_predict: 2-D array of samples to classify, one row per sample.

        Returns:
            A NumPy array holding one predicted label per input row.

        Raises:
            AssertionError: If called before fit() or the feature counts
                do not match.
        """
        assert self._x_train is not None and self._y_train is not None, 'must fit before predict'
        assert x_predict.shape[1] == self._x_train.shape[1], \
            'the feature of x_predict must be equal to x_train'
        y_predict = [self._predict(x) for x in x_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for a single 1-D feature vector ``x``."""
        assert x.shape[0] == self._x_train.shape[1], 'the feature of x must be equal to x_train'
        # Broadcasting computes the Euclidean distance to every training
        # row in one vectorised pass.
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))
        nearest = np.argsort(distances)
        votes = Counter(self._y_train[nearest[:self.k]])
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return 'KNN(k = {})'.format(self.k)
在jupyter notebook中输入:
%run KNN_function/KNN_01.py
knn_clf = KNNClassifier(k = 6)
knn_clf.fit(x_train, y_train)
>>>KNN(k = 6)
y_predict = knn_clf.predict(x_predict)
y_predict
输出:
>>>array([1])
2 训练数据集,测试数据集
这节课学习将数据集分解成测试数据集和训练数据集。
在jupyter notebook中输入:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
iris = datasets.load_iris() # 使用鸢尾花数据集
x = iris.data
y = iris.target
x.shape
>>>(150, 4)
y.shape
>>>(150,)
接下来进行train_test_split:
shuffle_indexes = np.random.permutation(len(x)) # 对x的索引进行随机排列
shuffle_indexes
>>>array([ 58, 97, 27, 61, 104, 23, 88, 39, 44, 148, 54, 64, 18,
69, 62, 75, 145, 99, 126, 113, 28, 101, 128, 11, 132, 36,
47, 9, 143, 135, 106, 122, 76, 1, 133, 38, 73, 90, 121,
134, 70, 43, 71, 136, 5, 17, 10, 100, 120, 0, 20, 60,
129, 147, 119, 87, 53, 40, 68, 7, 123, 30, 29, 108, 59,
111, 86, 25, 21, 6, 91, 8, 45, 85, 80, 103, 116, 131,
142, 107, 22, 118, 141, 56, 4, 109, 139, 16, 52, 138, 78,
67, 2, 51, 63, 146, 14, 66, 112, 144, 117, 35, 12, 50,
84, 55, 77, 149, 92, 3, 105, 19, 74, 127, 41, 125, 79,
110, 72, 46, 42, 96, 114, 82, 83, 89, 81, 15, 31, 32,
140, 130, 98, 37, 49, 95, 115, 102, 48, 65, 34, 57, 24,
93, 33, 124, 137, 26, 94, 13])
test_ratio = 0.2 # 测试数据集所占比例
test_size = int(len(x) * test_ratio)
test_size
>>>30
test_index = shuffle_indexes[:test_size]
train_index = shuffle_indexes[test_size:]
x_train = x[train_index]
y_train = y[train_index]
x_test = x[test_index]
y_test = y[test_index]
接下来在pycharm中整理train_test_split算法:
import numpy as np
def train_test_split(x, y, test_ratio = 0.2, seed = None):
    """Randomly split ``x`` and ``y`` into a training part and a test part.

    Args:
        x: Array of samples, indexed along its first axis.
        y: Array of labels with the same length as ``x``.
        test_ratio: Fraction of the samples put into the test set, in [0, 1].
        seed: Optional seed. When given (including 0) the global NumPy
            random state is seeded with it before shuffling, which makes
            the split reproducible.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        AssertionError: If the sizes differ or ``test_ratio`` is out of range.
    """
    assert x.shape[0] == y.shape[0], 'the size of x must be equal to the size of y'
    assert 0 <= test_ratio <= 1, 'the test_ratio must be valid'

    # Compare against None explicitly: a plain truthiness test would
    # silently ignore the perfectly valid seed 0.
    if seed is not None:
        np.random.seed(seed)

    # Shuffle the row indices, then cut the shuffled list in two.
    shuffle_indexes = np.random.permutation(len(x))
    test_size = int(len(x) * test_ratio)
    test_index = shuffle_indexes[:test_size]
    train_index = shuffle_indexes[test_size:]

    x_train = x[train_index]
    y_train = y[train_index]
    x_test = x[test_index]
    y_test = y[test_index]
    return x_train, x_test, y_train, y_test
在jupyter notebook中使用我们的算法:
from KNN_function.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)
# 调用自己写的knn分类器
from KNN_function.KNN_01 import KNNClassifier
my_knn_clf = KNNClassifier(k = 3)
my_knn_clf.fit(x_train, y_train)
>>>KNN(k = 3)
x_test_predict = my_knn_clf.predict(x_test)
x_test_predict # 预测结果
>>>array([1, 1, 2, 0, 1, 2, 2, 2, 1, 1, 1, 2, 0, 1, 2, 2, 0, 2, 1, 2, 1, 2,
0, 2, 1, 1, 1, 2, 2, 1])
y_test # 真实结果
>>>array([2, 1, 2, 0, 1, 2, 2, 2, 1, 1, 1, 2, 0, 1, 2, 2, 0, 2, 1, 2, 1, 2,
0, 2, 1, 1, 1, 2, 2, 1])
计算预测准确率:
sum(x_test_predict == y_test) / len(y_test)
>>>0.9666666666666667
使用sklearn 中的train_test_split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)
3 分类准确度
这节课使用手写数字识别数据集
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets
digits = datasets.load_digits()
x = digits.data
x.shape
>>>(1797, 64)
y = digits.target
y.shape
>>>(1797,)
digits.target_names # 标签名称
>>>array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# 从x中取出一个数据
some_digits = x[666]
some_digits
>>>array([ 0., 0., 5., 15., 14., 3., 0., 0., 0., 0., 13., 15., 9.,
15., 2., 0., 0., 4., 16., 12., 0., 10., 6., 0., 0., 8.,
16., 9., 0., 8., 10., 0., 0., 7., 15., 5., 0., 12., 11.,
0., 0., 7., 13., 0., 5., 16., 6., 0., 0., 0., 16., 12.,
15., 13., 1., 0., 0., 0., 6., 16., 12., 2., 0., 0.])
y[666] # 这个数据对应的标签
>>>0
# 将这个数据可视化
some_digits_image = some_digits.reshape(8, 8)
plt.imshow(some_digits_image, cmap = matplotlib.cm.binary)
plt.show()
输出:
调用自己编写的train_test_split方法:
from KNN_function.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)
调用自己编写的knn分类器:
from KNN_function.KNN_01 import KNNClassifier
my_knn_clf = KNNClassifier(k = 3)
my_knn_clf.fit(x_train, y_train)
>>>KNN(k = 3)
x_test_predict = my_knn_clf.predict(x_test)
x_test_predict # 预测结果
>>>array([7, 9, 8, 7, 8, 2, 1, 6, 2, 2, 5, 6, 2, 0, 4, 7, 4, 7, 8, 3, 7, 7,
9, 2, 5, 4, 7, 5, 2, 9, 1, 0, 6, 6, 0, 8, 9, 6, 3, 8, 1, 8, 5, 3,
6, 5, 4, 3, 4, 2, 8, 3, 4, 8, 2, 2, 7, 2, 8, 5, 8, 3, 7, 8, 5, 9,
1, 2, 1, 0, 4, 3, 6, 5, 0, 4, 0, 5, 0, 3, 7, 6, 6, 9, 7, 1, 9, 3,
6, 2, 5, 3, 1, 9, 8, 2, 1, 1, 9, 7, 2, 5, 5, 1, 3, 0, 6, 3, 0, 2,
2, 4, 2, 6, 9, 4, 4, 0, 3, 5, 9, 3, 9, 8, 8, 2, 8, 7, 6, 1, 6, 1,
7, 3, 6, 5, 5, 7, 3, 6, 4, 3, 6, 0, 4, 1, 2, 6, 2, 8, 9, 0, 2, 6,
6, 1, 7, 7, 8, 0, 6, 8, 4, 7, 4, 0, 8, 0, 8, 4, 0, 6, 4, 3, 1, 8,
8, 9, 4, 4, 3, 8, 4, 5, 1, 2, 1, 1, 8, 2, 6, 1, 7, 6, 3, 7, 1, 2,
6, 2, 1, 3, 0, 2, 2, 4, 1, 3, 7, 6, 9, 6, 3, 9, 7, 3, 8, 3, 4, 6,
7, 3, 2, 6, 1, 9, 4, 5, 8, 1, 9, 4, 9, 0, 5, 4, 3, 6, 5, 1, 5, 3,
8, 3, 3, 7, 7, 1, 4, 5, 1, 8, 9, 9, 8, 7, 7, 4, 5, 5, 1, 3, 2, 5,
1, 8, 6, 3, 4, 2, 5, 5, 2, 1, 1, 6, 3, 3, 4, 8, 6, 4, 9, 2, 3, 1,
9, 5, 1, 1, 3, 3, 5, 1, 9, 7, 4, 2, 5, 5, 2, 4, 6, 5, 2, 6, 9, 9,
5, 8, 7, 8, 9, 0, 5, 7, 1, 5, 7, 7, 8, 5, 3, 6, 8, 9, 2, 5, 1, 2,
1, 7, 5, 5, 3, 2, 4, 0, 5, 3, 6, 2, 9, 0, 8, 7, 3, 8, 0, 0, 3, 9,
8, 8, 9, 4, 8, 7, 4])
y_test # 真实值
>>>array([7, 9, 8, 7, 8, 2, 1, 6, 2, 2, 5, 6, 2, 0, 4, 7, 4, 7, 8, 3, 7, 7,
9, 2, 5, 4, 7, 5, 2, 9, 1, 0, 6, 6, 0, 8, 9, 6, 3, 8, 1, 8, 5, 3,
6, 5, 4, 3, 4, 2, 8, 3, 4, 8, 2, 2, 7, 2, 8, 5, 8, 8, 7, 8, 5, 9,
1, 2, 1, 0, 4, 3, 6, 5, 0, 4, 0, 5, 0, 3, 7, 6, 6, 9, 7, 1, 9, 3,
6, 2, 5, 3, 1, 9, 8, 2, 1, 1, 9, 7, 2, 5, 5, 1, 3, 0, 6, 3, 0, 2,
2, 4, 2, 6, 9, 4, 4, 0, 3, 5, 9, 3, 9, 8, 8, 2, 8, 7, 6, 1, 6, 1,
7, 3, 6, 5, 5, 7, 3, 6, 4, 3, 6, 0, 4, 1, 2, 6, 2, 8, 9, 0, 2, 6,
6, 1, 7, 7, 8, 0, 6, 8, 4, 7, 4, 0, 8, 0, 8, 4, 0, 6, 4, 3, 1, 8,
8, 9, 4, 4, 3, 8, 4, 5, 1, 2, 1, 1, 8, 2, 6, 1, 7, 6, 3, 7, 1, 2,
6, 2, 1, 3, 0, 2, 2, 4, 1, 3, 7, 6, 9, 6, 3, 9, 7, 3, 8, 3, 4, 6,
7, 3, 2, 6, 1, 9, 4, 5, 8, 1, 9, 4, 9, 0, 5, 4, 3, 6, 5, 9, 5, 3,
8, 3, 3, 7, 7, 1, 4, 5, 1, 8, 9, 9, 8, 7, 7, 4, 5, 5, 1, 3, 2, 5,
1, 8, 6, 3, 4, 2, 5, 5, 2, 1, 1, 6, 3, 3, 4, 8, 6, 4, 9, 2, 3, 1,
9, 5, 1, 1, 3, 3, 5, 1, 9, 7, 4, 2, 5, 5, 2, 4, 6, 5, 2, 6, 9, 9,
5, 8, 7, 8, 9, 0, 5, 7, 1, 5, 7, 7, 8, 5, 3, 6, 8, 9, 2, 5, 1, 2,
1, 7, 5, 5, 3, 2, 4, 0, 5, 3, 6, 2, 5, 0, 8, 7, 3, 8, 0, 0, 3, 9,
8, 8, 9, 4, 8, 7, 4])
# 计算预测准确率
sum(x_test_predict == y_test) / len(y_test)
>>>0.9916434540389972
将计算准确度在pycharm中整理成算法,保存为metrics.py:
import numpy as np
def accuracy_score(y_true, y_predict):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Array of true labels.
        y_predict: Array of predicted labels, same length as ``y_true``.

    Returns:
        Number of matching positions divided by ``len(y_true)``.

    Raises:
        AssertionError: If the two arrays differ in length.
    """
    assert y_true.shape[0] == y_predict.shape[0], 'the shape of y_true must be equal to y_predict'
    # np.sum counts the matches in native code; the builtin sum would walk
    # the array element by element in Python.
    return np.sum(y_true == y_predict) / len(y_true)
有时候我们并不关心预测值为多少,只想知道模型预测准确度怎么样。这时可以在我们编写的KNNClassifier中添加一个score方法:
import numpy as np
from math import sqrt
from collections import Counter
from .metrics import accuracy_score
class KNNClassifier:
    """k-nearest-neighbours classifier with a scikit-learn style interface."""

    def __init__(self, k):
        """Create a classifier in which the ``k`` nearest samples vote.

        Raises:
            AssertionError: If ``k`` is smaller than 1.
        """
        assert k >= 1, 'k must be valid'
        self.k = k
        # Filled in by fit(); None marks the unfitted state.
        self._x_train = None
        self._y_train = None

    def fit(self, x_train, y_train):
        """Store the training set; kNN has no other training step.

        Args:
            x_train: 2-D array of training samples, one row per sample.
            y_train: 1-D array of labels, one per row of ``x_train``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            AssertionError: If the sizes differ or there are fewer than
                ``k`` training samples.
        """
        assert x_train.shape[0] == y_train.shape[0], \
            'the size of x_train must be equal to the size of y_train'
        assert self.k <= x_train.shape[0], 'the size of x_train must be at least k'
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, x_predict):
        """Predict a label for every row of ``x_predict``.

        Args:
            x_predict: 2-D array of samples to classify, one row per sample.

        Returns:
            A NumPy array holding one predicted label per input row.

        Raises:
            AssertionError: If called before fit() or the feature counts
                do not match.
        """
        assert self._x_train is not None and self._y_train is not None, 'must fit before predict'
        assert x_predict.shape[1] == self._x_train.shape[1], \
            'the feature of x_predict must be equal to x_train'
        y_predict = [self._predict(x) for x in x_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the predicted label for a single 1-D feature vector ``x``."""
        assert x.shape[0] == self._x_train.shape[1], 'the feature of x must be equal to x_train'
        # Broadcasting computes the Euclidean distance to every training
        # row in one vectorised pass.
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))
        nearest = np.argsort(distances)
        votes = Counter(self._y_train[nearest[:self.k]])
        return votes.most_common(1)[0][0]

    def score(self, x_test, y_test):
        """Predict ``x_test`` and return the accuracy against ``y_test``.

        Convenience wrapper for callers that only need the accuracy and
        not the individual predictions.
        """
        y_predict = self.predict(x_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return 'KNN(k = {})'.format(self.k)
在jupyter notebook中调用自己编写的计算预测准确率算法:
from KNN_function.metrics import accuracy_score
accuracy_score(y_test, x_test_predict)
>>>0.9916434540389972
# 调用封装好的score方法
my_knn_clf.score(x_test, y_test)
>>>0.9916434540389972
scikit-learn中的accuracy_score:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 666)
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors = 3)
knn_clf.fit(x_train, y_train)
>>>KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=3, p=2,
weights='uniform')
x_test_predict = knn_clf.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, x_test_predict)
>>>0.9844444444444445
# sklearn中的score方法
knn_clf.score(x_test, y_test)
>>>0.9844444444444445
4 超参数
超参数和模型参数
·超参数:在算法运行前需要决定的参数
·模型参数:算法过程中学习的参数
kNN算法没有模型参数
kNN算法中的k是典型的超参数
import numpy as np
from sklearn import datasets
digits = datasets.load_digits()
x = digits.data
y = digits.target
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 666)
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors = 3)
knn_clf.fit(x_train, y_train)
knn_clf.score(x_test, y_test)
>>>0.9844444444444445
取不同的k值进行计算,找出准确率最大的那个k值:
# Try k = 1..10 and remember the k with the highest test accuracy.
best_k = -1
best_score = 0
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors = k)
    knn_clf.fit(x_train, y_train)
    score = knn_clf.score(x_test, y_test)
    # Strict ">" keeps the smallest k when several k tie for the best score.
    if score > best_score:
        best_score = score
        best_k = k
print('best_score:', best_score)
print('best_k:', best_k)
输出:
best_score: 0.9866666666666667
best_k: 5
在使用k近邻算法时,有时会出现以下情况:
距离绿色点最近的三个点中,红色点一个,蓝色点两个。如果只按票数判断,该点应属于蓝色;但该点距离红色点非常近。如果考虑距离(例如以距离的倒数作为投票权重),该点应属于红色。
又比如以下情况:
距离绿色点最近的三个点分别属于三个不同的类别(平票),此时不考虑距离就无法做出判断。
# Search over the voting scheme and k together.
best_method = ''
best_k = -1
best_score = 0
# 'uniform' ignores distance when voting; 'distance' weights votes by distance.
for method in ['uniform', 'distance']:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors = k, weights = method)
        knn_clf.fit(x_train, y_train)
        score = knn_clf.score(x_test, y_test)
        if score > best_score:
            best_method = method
            best_score = score
            best_k = k
print('best_method:', best_method)
print('best_score:', best_score)
print('best_k:', best_k)
输出:
best_method: uniform
best_score: 0.9866666666666667
best_k: 5
更多关于距离的定义:
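欧拉距离(即欧氏距离):$\sqrt{\sum_{i=1}^{n}\left(X_i^{(a)} - X_i^{(b)}\right)^2}$
曼哈顿距离:$\sum_{i=1}^{n}\left|X_i^{(a)} - X_i^{(b)}\right|$
明科夫斯基距离:$\left(\sum_{i=1}^{n}\left|X_i^{(a)} - X_i^{(b)}\right|^p\right)^{1/p}$
当p=1时,相当于曼哈顿距离;
当p=2时,相当于欧拉距离。
寻找明科夫斯基距离中最好的p值: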
%%time
best_p = -1
best_k = -1
best_score = 0
for k in range(1, 11):
for p in range(1, 6):
knn_clf = KNeighborsClassifier(n_neighbors = k, weights = 'distance', p = p)
knn_clf.fit(x_train, y_train)
score = knn_clf.score(x_test, y_test)
if score > best_score:
best_p = p
best_score = score
best_k = k
print('best_p:', best_p)
print('best_score:', best_score)
print('best_k:', best_k)
输出:
best_p: 2
best_score: 0.9866666666666667
best_k: 5
Wall time: 39.3 s
5 网格搜索与k近邻算法中更多超参数
导入相关包及数据准备:
import numpy as np
from sklearn import datasets
digits = datasets.load_digits()
x = digits.data
y = digits.target
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 666)
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors = 4)
knn_clf.fit(x_train, y_train)
knn_clf.score(x_test, y_test)
>>>0.9844444444444445
下面开始网格搜索的过程:先定义一个param_grid,它是一个列表,内容是两个字典,每个字典对应一组要搜索的参数
param_grid = [
{
'weights':['uniform'],
'n_neighbors':[k for k in range(1, 11)]
},
{
'weights':['distance'],
'n_neighbors':[k for k in range(1, 11)],
'p':[p for p in range(1, 6)]
}
]
knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf, param_grid)
%%time
grid_search.fit(x_train, y_train)
输出:
Wall time: 1min 28s
Out[5]:
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None,
n_neighbors=5, p=2,
weights='uniform'),
iid='warn', n_jobs=None,
param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'weights': ['uniform']},
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
查看grid_search的相关内容:
grid_search.best_estimator_ # 返回搜索到的最佳参数对应的knn分类器
>>>KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=3,
weights='distance')
grid_search.best_score_ # 返回最佳参数对应的准确率
>>>0.9866369710467706
grid_search.best_params_ # 返回自己定义的param_grid中的最佳参数
>>>{'n_neighbors': 5, 'p': 3, 'weights': 'distance'}
如何使用搜索到的最佳参数对应的分类器呢?
knn_clf = grid_search.best_estimator_
knn_clf.score(x_test, y_test)
>>>0.9822222222222222
GridSearchCV中还可以传入其他参数
%%time
# n_jobs表示用几个核参与网格搜索,-1表示将全部的核都用于搜索
# verbose表示边搜索边输出相关信息,值越大输出信息越详细
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs = -1, verbose = 3)
grid_search.fit(x_train, y_train)
输出:
Fitting 3 folds for each of 60 candidates, totalling 180 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 24 tasks | elapsed: 41.0s
[Parallel(n_jobs=-1)]: Done 120 tasks | elapsed: 1.0min
Wall time: 1min 17s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 1.3min finished
Out[11]:
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski',
metric_params=None, n_jobs=None,
n_neighbors=5, p=3,
weights='distance'),
iid='warn', n_jobs=-1,
param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'weights': ['uniform']},
{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=3)
更多的距离定义:
●向量空间余弦相似度Cosine Similarity
●调整余弦相似度Adjusted Cosine Similarity
●皮尔森相关系数Pearson Correlation Coefficient
●Jaccard相似系数Jaccard Coefficient
6 数据归一化
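数据归一化:将所有的数据映射到同一尺度。
最值归一化:把所有数据映射到0-1之间,$x_{scale} = \dfrac{x - x_{min}}{x_{max} - x_{min}}$
均值方差归一化:把所有数据归一到均值为0、方差为1的分布中,$x_{scale} = \dfrac{x - x_{mean}}{S}$,其中$S$为标准差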
接下来在jupyter notebook中实现以上两种归一化方法:
import numpy as np
import matplotlib.pyplot as plt
最值归一化:
# 对向量的最值归一化
x = np.random.randint(0, 100, size = 100)
x
>>>array([ 6, 29, 60, 26, 77, 75, 36, 90, 95, 65, 31, 67, 72, 15, 34, 56, 57,
87, 14, 38, 77, 67, 7, 11, 35, 14, 28, 56, 18, 56, 75, 75, 0, 44,
70, 44, 91, 92, 67, 14, 16, 96, 18, 67, 71, 86, 48, 89, 92, 34, 36,
71, 41, 29, 59, 85, 74, 12, 87, 32, 11, 83, 94, 60, 67, 31, 15, 15,
57, 49, 62, 59, 82, 23, 82, 6, 55, 96, 56, 46, 16, 87, 35, 47, 43,
33, 6, 27, 95, 42, 33, 66, 22, 31, 0, 76, 90, 13, 15, 14])
(x - np.min(x)) / (np.max(x) - np.min(x))
>>>array([0.0625 , 0.30208333, 0.625 , 0.27083333, 0.80208333,
0.78125 , 0.375 , 0.9375 , 0.98958333, 0.67708333,
0.32291667, 0.69791667, 0.75 , 0.15625 , 0.35416667,
0.58333333, 0.59375 , 0.90625 , 0.14583333, 0.39583333,
0.80208333, 0.69791667, 0.07291667, 0.11458333, 0.36458333,
0.14583333, 0.29166667, 0.58333333, 0.1875 , 0.58333333,
0.78125 , 0.78125 , 0. , 0.45833333, 0.72916667,
0.45833333, 0.94791667, 0.95833333, 0.69791667, 0.14583333,
0.16666667, 1. , 0.1875 , 0.69791667, 0.73958333,
0.89583333, 0.5 , 0.92708333, 0.95833333, 0.35416667,
0.375 , 0.73958333, 0.42708333, 0.30208333, 0.61458333,
0.88541667, 0.77083333, 0.125 , 0.90625 , 0.33333333,
0.11458333, 0.86458333, 0.97916667, 0.625 , 0.69791667,
0.32291667, 0.15625 , 0.15625 , 0.59375 , 0.51041667,
0.64583333, 0.61458333, 0.85416667, 0.23958333, 0.85416667,
0.0625 , 0.57291667, 1. , 0.58333333, 0.47916667,
0.16666667, 0.90625 , 0.36458333, 0.48958333, 0.44791667,
0.34375 , 0.0625 , 0.28125 , 0.98958333, 0.4375 ,
0.34375 , 0.6875 , 0.22916667, 0.32291667, 0. ,
0.79166667, 0.9375 , 0.13541667, 0.15625 , 0.14583333])
# 对矩阵的最值归一化
x = np.random.randint(0, 100, size = (10, 2))
x
>>>array([[ 6, 10],
[78, 17],
[49, 92],
[14, 90],
[30, 23],
[ 7, 51],
[ 2, 37],
[90, 61],
[49, 78],
[77, 15]])
# 因为要归一化到0到1之间,所以x需要能够存储浮点型数据
x = np.array(x, dtype = float)
x
>>>array([[ 6., 10.],
[78., 17.],
[49., 92.],
[14., 90.],
[30., 23.],
[ 7., 51.],
[ 2., 37.],
[90., 61.],
[49., 78.],
[77., 15.]])
x[:, 0] = (x[:, 0] - np.min(x[:, 0])) / (np.max(x[:, 0]) - np.min(x[:, 0]))
x[:, 1] = (x[:, 1] - np.min(x[:, 1])) / (np.max(x[:, 1]) - np.min(x[:, 1]))
x[:, 0]
>>>array([0.04545455, 0.86363636, 0.53409091, 0.13636364, 0.31818182,
0.05681818, 0. , 1. , 0.53409091, 0.85227273])
x[:, 1]
>>>array([0. , 0.08536585, 1. , 0.97560976, 0.15853659,
0.5 , 0.32926829, 0.62195122, 0.82926829, 0.06097561])
# 可视化
plt.scatter(x[:, 0], x[:, 1])
plt.show()
输出:
均值方差归一化:
x2 = np.random.randint(0, 100, size = (10, 2))
x2 = np.array(x2, dtype = float)
x2[:, 0] = (x2[:, 0] - np.mean(x2[:, 0])) / np.std(x2[:, 0])
x2[:, 1] = (x2[:, 1] - np.mean(x2[:, 1])) / np.std(x2[:, 1])
x2[:, 0]
>>>array([-1.55310529, 0.28178704, 0.0524255 , 0.70774418, -0.93055254,
0.80604199, -0.66842506, -1.29097782, 1.23199913, 1.36306287])
x2[:, 1]
>>>array([-1.65306027, 0.02969569, 0.82158085, -0.31675406, 0.72259521,
-0.41573971, -1.85103156, 0.9205665 , 0.9205665 , 0.82158085])
# 可视化
plt.scatter(x2[:, 0], x2[:, 1])
plt.show()
输出:
查看均值与方差:
# 均值为0,方差为1
print(np.mean(x2[:, 0]))
print(np.mean(x2[:, 1]))
print(np.std(x2[:, 0]))
print(np.std(x2[:, 1]))
输出:
-2.220446049250313e-16
1.554312234475219e-16
1.0
1.0000000000000002
7 scikit-learn中的Scaler
我们会将数据集分成训练数据集和测试数据集。将训练数据集的均值记为mean_train,标准差记为std_train,使用这两者将训练数据集归一化。那能不能使用测试数据集自己的均值和标准差对测试数据集归一化呢?答案是不能,应该使用mean_train和std_train对测试数据集归一化:(x_test - mean_train) / std_train
测试数据是模拟真实环境:
●真实环境很有可能无法得到所有测试数据的均值和方差
●对数据的归一化也是算法的一部分
因此测试数据集的归一化公式为:(x_test - mean_train) / std_train
在jupyter notebook中输入:
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()
x = iris.data
y = iris.target
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 666)
调用scikit_learn中的standardscaler:
from sklearn.preprocessing import StandardScaler
standardscaler = StandardScaler()
standardscaler.fit(x_train)# 输入x_train,计算归一化要用到的均值和方差
>>>StandardScaler(copy=True, with_mean=True, with_std=True)
standardscaler.mean_ # 查看计算出的均值
>>>array([5.825 , 3.09285714, 3.68571429, 1.16428571])
standardscaler.scale_ # 查看标准差(scale_中存放的是标准差,而不是方差)
>>>array([0.80239597, 0.4493476 , 1.75828941, 0.75543946])
x_train_standard = standardscaler.transform(x_train) # 将x_train归一化
x_test_standard = standardscaler.transform(x_test) # 将x_test归一化
# 调用sklearn中的knn分类器
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors = 3)
knn_clf.fit(x_train_standard, y_train)
>>>KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=3, p=2,
weights='uniform')
knn_clf.score(x_test_standard, y_test)
>>>0.9736842105263158
接下来在pycharm中整理均值方差归一化算法:
import numpy as np
class StandardScaler:
    """Standardise each feature to zero mean and unit standard deviation."""

    def __init__(self):
        # Per-feature statistics learned by fit(); None until then.
        self.mean_ = None
        self.scale_ = None

    def fit(self, x_train):
        """Compute the per-feature mean and standard deviation of ``x_train``.

        Args:
            x_train: 2-D array, one row per sample and one column per feature.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            AssertionError: If ``x_train`` is not 2-D.
        """
        assert x_train.ndim == 2, 'the size of x_train must be 2'
        self.mean_ = np.mean(x_train, axis=0)
        scale = np.std(x_train, axis=0)
        # A constant feature has a standard deviation of 0; dividing by it
        # would produce nan/inf, so scale such features by 1 instead.
        scale[scale == 0.0] = 1.0
        self.scale_ = scale
        return self

    def transform(self, x_train):
        """Standardise ``x_train`` with the statistics learned by fit().

        Args:
            x_train: 2-D array with the same number of columns as the data
                passed to fit(). Despite the name it may be any data set
                (for example the test set).

        Returns:
            A new float array of the same shape; the input is not modified.

        Raises:
            AssertionError: If the input is not 2-D, fit() has not been
                called, or the number of columns differs.
        """
        assert x_train.ndim == 2, 'the size of x_train must be 2'
        assert self.mean_ is not None and self.scale_ is not None, 'must fit before transform'
        assert x_train.shape[1] == len(self.mean_), 'the number of x_train must be equal to mean_and scale_'
        # Broadcasting applies each column's mean and scale to all rows.
        return np.asarray((x_train - self.mean_) / self.scale_, dtype=float)
在jupyter notebook中调用我们自己编写的均值方差归一化算法:
from KNN_function.preprocessing import StandardScaler
standardscaler = StandardScaler()
standardscaler.fit(x_train)
>>><KNN_function.preprocessing.StandardScaler at 0x6640e31b00>
x_train_standard = standardscaler.transform(x_train)
standardscaler.mean_ # 查看计算出的均值
>>>array([5.825 , 3.09285714, 3.68571429, 1.16428571])
standardscaler.scale_ # 查看计算出的标准差(scale_中存放的是标准差,而不是方差)
>>>array([0.80239597, 0.4493476 , 1.75828941, 0.75543946])
# 调用sklearn中的knn分类器
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors = 3)
knn_clf.fit(x_train_standard, y_train)
>>>KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=3, p=2,
weights='uniform')
knn_clf.score(x_test_standard, y_test)
>>> 0.9736842105263158
8 总结
最大的缺点:效率低下
如果训练集有m个样本,n个特征,则每预测一个新的数据,都需要O(m*n)的时间复杂度
优化:使用树结构: KD-Tree, Ball-Tree
缺点2:高度数据相关
缺点3:预测结果不具有可解释性
缺点4:维数灾难
什么是维数灾难:随着维度的增加,“看似相近”的两个点之间的距离越来越大
解决方法:降维