1.knn算法的超参数问题
"""
超参数 :运行机器学习算法之前需要指定的参数
模型参数:算法过程中学习的参数
kNN算法没有模型参数
kNN算法中的k是典型的超参数
寻找最好的k
"""
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
digits = datasets.load_digits()
# 数据矩阵
X = digits.data
# 特征
Y = digits.target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
best_score = 0.0
best_k = -1
best_method = ""
# for k in range(1, 11):
# kNeighborsClassifier = KNeighborsClassifier(n_neighbors=k)
# kNeighborsClassifier.fit(x_train, y_train)
# score = kNeighborsClassifier.score(x_test, y_test)
# if score > best_score:
# best_k = k
# best_score = score
for method in ["uniform", "distance"]:
for k in range(1, 11):
kNeighborsClassifier = KNeighborsClassifier(n_neighbors=k, weights=method)
kNeighborsClassifier.fit(x_train, y_train)
score = kNeighborsClassifier.score(x_test, y_test)
if score > best_score:
best_k = k
best_score = score
best_method = method
print(best_k)
print(best_score)
print(best_method)
2.使用GridSearchCV
"""
Grid Search
"""
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
param_grid = [
{
'weights': ['uniform'],
'n_neighbors': [i for i in range(1, 11)],
},
{
'weights': ['distance'],
'n_neighbors': [i for i in range(1, 11)],
'p': [i for i in range(1, 6)]
}
]
digits = datasets.load_digits()
# 数据矩阵
X = digits.data
# 特征
Y = digits.target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
kNeighborsClassifier = KNeighborsClassifier()
grid_search = GridSearchCV(kNeighborsClassifier, param_grid, verbose=2)
grid_search.fit(x_train, y_train)
result = grid_search.best_estimator_
best_score = grid_search.best_score_
best_params = grid_search.best_params_m
print(result)
print(best_score)
print(best_params)
3.为什么要数据归一化
| | 肿瘤大小(厘米) | 发现时间(天) |
|---|---|---|
| 样本1 | 1 | 200 |
| 样本2 | 5 | 100 |
样本间的距离被发现时间所主导
数据归一化
解决方案:将所有的数据映射到同一尺度
最值归一化 (normalization): 把所有的数据映射到0-1之间
x(scale) = (x - x(min))/(x(max) -x(min))
适用于分布有明显边界的情况;受 outlier(异常值)影响较大
均值方差归一法:把所有数据归一到均值为0方差为1的分布中
适用于分布没有明显的边界;可能存在极端数据
x(scale) =(x-x(mean))/S
注:x(mean) 为均值,S 为标准差(代码中用 np.std 计算)
案例
import numpy as np

# Min-max normalization of a 1-D vector: maps all values into [0, 1].
x = np.random.randint(0, 100, size=100)
x_data = (x - np.min(x)) / (np.max(x) - np.min(x))

# Min-max normalization column by column.
X = np.random.randint(0, 100, (50, 2))
X = np.array(X, dtype=float)
# Bug fix: the original subtracted np.min(X) — the minimum over the
# WHOLE matrix — so each column was shifted by the wrong offset and was
# not actually mapped into [0, 1].  Each column must use its own min.
X[:, 0] = (X[:, 0] - np.min(X[:, 0])) / (np.max(X[:, 0]) - np.min(X[:, 0]))
X[:, 1] = (X[:, 1] - np.min(X[:, 1])) / (np.max(X[:, 1]) - np.min(X[:, 1]))

# Mean/std standardization: each column gets mean 0 and std 1.
X2 = np.random.randint(0, 100, (50, 2))
X2 = np.array(X2, dtype=float)
X2[:, 0] = (X2[:, 0] - np.mean(X2[:, 0])) / np.std(X2[:, 0])
X2[:, 1] = (X2[:, 1] - np.mean(X2[:, 1])) / np.std(X2[:, 1])
print(np.mean(X2[:, 0]))
print(np.std(X2[:, 0]))
对测试数据集归一化
将测试数据集 使用 训练数据得到的mean_train 以及std_train相应的进行归一化
(x_test-mean_train) / std_train
得到测试数据集归一化的结果
测试数据是模拟真实环境
真实环境很有可能无法得到所有的测试数据的均值和方差
对数据的归一化也是算法的一部分
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Standardize the iris features, then classify with kNN.
# The scaler is fit on the TRAINING split only; the test split is
# transformed with the training mean/std, because test data simulates
# unseen real-world data whose statistics we cannot know in advance.
iris = datasets.load_iris()
X = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

scaler = StandardScaler()
# After fit, the scaler holds the per-feature mean and scale.
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test_standard = scaler.transform(x_test)

knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train, y_train)
score = knn_clf.score(x_test_standard, y_test)

print(scaler.mean_)
print(scaler.scale_)
print(score)  # 1.0
归一化处理后 成功率为1.0