Mathematical Modeling: kNN

I Case 1: Tumor Prediction

Problem: We are given the feature vectors of 10 tumor samples, [3.3935, 2.3312], [3.1101, 1.7815], [1.3438, 3.3684], [3.5823, 4.6792], [2.2804, 2.8670], [7.4234, 4.6965], [5.7451, 3.5340], [9.1722, 2.5111], [7.7928, 3.4241], [7.9398, 0.7916], together with their labels [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] (0 = benign tumor, 1 = malignant tumor). Given a new sample [8.0936, 3.3657], use the kNN algorithm to infer its label.

1.1 Load the dataset, convert it to NumPy arrays, and plot the training set.

import numpy as np
import matplotlib.pyplot as plt
raw_data_X = [[3.3935, 2.3312],
              [3.1101, 1.7815],
              [1.3438, 3.3684],
              [3.5823, 4.6792],
              [2.2804, 2.8670],
              [7.4234, 4.6965],
              [5.7451, 3.5340],
              [9.1722, 2.5111],
              [7.7928, 3.4241],
              [7.9398, 0.7916]]
# 0: benign tumor, 1: malignant tumor
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
plt.scatter(X_train[y_train==0,0], X_train[y_train==0,1], color="g", s=180)
plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color="r", s=180)
plt.show()

(Figure: scatter plot of the training set; benign samples in green, malignant samples in red.)

1.2 The sample to be predicted

# the query sample to predict
x = np.array([8.0936, 3.3657])
x[0],x[1]
(8.0936, 3.3657)
plt.scatter(X_train[y_train==0,0], X_train[y_train==0,1], color="g",s=180)
plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color="r",s=180)
plt.scatter(x[0],x[1],color='y',s=180)
plt.show()

(Figure: the training set with the query sample added in yellow.)

2 The kNN procedure
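kNN assigns to a query point the majority label among its k nearest training samples. Writing $N_k(x)$ for the indices of the k training points closest to $x$, the rule is:

$$\hat{y} = \arg\max_{c} \sum_{i \in N_k(x)} \mathbb{1}\left(y^{(i)} = c\right)$$

The steps below compute the distances, sort them, take the top k, and tally the votes.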

2.1 Compute the distance from each training sample to the query point
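The distance used throughout is the ordinary Euclidean distance between feature vectors:

$$d\left(x, x^{(i)}\right) = \sqrt{\sum_{j=1}^{n} \left(x^{(i)}_j - x_j\right)^2}$$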

distances = []
# compute the Euclidean distance from each training sample to the query point
for x_train in X_train:
    d = (np.sum((x_train - x) ** 2)) ** 0.5
    distances.append(d)
distances  # distances from the query point to each training sample
[4.81260119478022,
 5.229241043401997,
 6.749800540015979,
 4.698628516918528,
 5.834551904816684,
 1.4900324425998253,
 2.354522698977439,
 1.3761246745843922,
 0.30641670972713025,
 2.578690607653427]
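As an aside, the explicit loop can be replaced by a single broadcasted NumPy expression that produces the same ten distances (a sketch; X_train and x are the arrays defined above):

# (10, 2) - (2,) broadcasts to (10, 2); summing squares over axis=1 and
# taking the square root gives all ten distances at once
distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))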

2.2 Sort the distances and get the sorted indices
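Note that np.argsort does not sort the array itself; it returns the indices that would sort it, nearest first. A minimal illustration with made-up values:

a = np.array([3.1, 0.2, 1.7])
np.argsort(a)  # array([1, 2, 0]): a[1] is smallest, then a[2], then a[0]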

# indices that would sort the distances, nearest first
nearest = np.argsort(distances)
nearest
array([8, 7, 5, 6, 9, 3, 0, 1, 4, 2], dtype=int64)
nearest[0:6]
array([8, 7, 5, 6, 9, 3], dtype=int64)
y_train
array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
[y_train[i] for i in [8, 7, 5, 6, 9, 3]]
[1, 1, 1, 1, 1, 0]

2.3 Fix k and inspect the labels of the k nearest samples.

k = 6
topK_y = [y_train[i] for i in nearest[:k]]
topK_y
# topK_y is [1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 0]

3 Tallying the votes

3.1 Build a counter that tallies the top-k labels


from collections import Counter
result_dict = Counter(topK_y)   # Counter({1: 5, 0: 1})
result_dict
Counter({1: 5, 0: 1})

3.2 Take a majority vote over the k nearest labels (e.g. if 5 of 6 votes were 0 and one were 1, the prediction would be 0; here it is the reverse, so the prediction is 1)

# result_dict.most_common()    # [(1, 5), (0, 1)]
# result_dict.most_common(1)    # [(1, 5)]
# result_dict.most_common()[0]  # (1, 5)
predict_y = result_dict.most_common()[0][0]
predict_y
1

4 Wrapping the procedure in a KNNClassifier class

import numpy as np
from math import sqrt
from collections import Counter
 
class KNNClassifier:

    def __init__(self, k):
        """Initialize the kNN classifier."""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Fit the classifier on the training set X_train, y_train."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every sample in X_predict; returns a vector of predictions."""
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"

        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of a single sample x."""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"

        distances = [sqrt(np.sum((x_train - x) ** 2))
                     for x_train in self._X_train]
        nearest = np.argsort(distances)

        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)

        return votes.most_common(1)[0][0]

    def _accuracy_score(self, y_true, y_predict):
        """Compute the accuracy of y_predict against y_true."""
        assert len(y_true) == len(y_predict), \
            "the size of y_true must be equal to the size of y_predict"

        return np.sum(y_true == y_predict) / len(y_true)

    def score(self, X_test, y_test):
        """Score the current model on the test set X_test, y_test."""
        y_predict = self.predict(X_test)
        return self._accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "KNN(k=%d)" % self.k
knn_clf = KNNClassifier(6)
knn_clf.fit(X_train,y_train)
# predict expects a 2-D array of shape (n_samples, n_features)
x = x.reshape(1, -1)
knn_clf.predict(x)[0]
1
x.shape
(1, 2)
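For comparison, here is a sketch of the same prediction with scikit-learn's built-in estimator (it is also used in Case 2 below); with the same k, uniform weights, and Euclidean distance it should agree with the hand-written class:

from sklearn.neighbors import KNeighborsClassifier
sk_clf = KNeighborsClassifier(n_neighbors=6)
sk_clf.fit(X_train, y_train)
sk_clf.predict(x)[0]  # x was reshaped to (1, 2) above; expected output: 1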

II Case 2: Playing with the Iris Dataset

2.1 Running kNN on the Iris dataset

The Iris dataset is a classic, frequently used as an example in both statistics and machine learning.

- It contains 150 records in 3 classes, 50 records per class.
- Each record has 4 features: sepal length, sepal width, petal length, and petal width.

From these 4 features we can predict which of the three species (iris-setosa, iris-versicolour, iris-virginica) a given flower belongs to.

(Figures: photos of the three species setosa, versicolour, and virginica, and a diagram of sepals vs. petals.)

from sklearn import datasets
iris = datasets.load_iris()
dir(iris)
['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']
iris.feature_names
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
iris.target
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
iris.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
print(iris.DESCR)
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fisher's paper. Note that it's the same as in R, but not as in the UCI
Machine Learning Repository, which has two wrong data points.

This is perhaps the best known database to be found in the
pattern recognition literature.  Fisher's paper is a classic in the field and
is referenced frequently to this day.  (See Duda & Hart, for example.)  The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant.  One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.

.. topic:: References

   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments".  IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...
import matplotlib.pyplot as plt
# the six possible pairs of the four feature columns in iris.data
idx = np.array([[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]])
fig = plt.figure(figsize=(15,20))
x = iris.data
y = iris.target
# labels follow the column order of iris.data (the original post had sepal/petal swapped)
labels = np.array(['sepal length','sepal width','petal length','petal width'])
for i in range(1,7):
    ax = fig.add_subplot(3,2,i)
    pair = idx[i-1]
    # scatter each class in its own color for the current feature pair
    p1 = ax.scatter(x[y==0,pair[0]], x[y==0,pair[1]], color='red')
    p2 = ax.scatter(x[y==1,pair[0]], x[y==1,pair[1]], color='blue')
    p3 = ax.scatter(x[y==2,pair[0]], x[y==2,pair[1]], color='green')
    plt.legend([p1,p2,p3], ['setosa','versicolor','virginica'], loc='upper left', fontsize=15)
    ax.set_title(labels[pair[0]] + ' vs ' + labels[pair[1]], fontsize=15)
    ax.set_xlabel(labels[pair[0]], fontsize=15)
    ax.set_ylabel(labels[pair[1]], fontsize=15)
plt.show()

(Figure: six scatter plots, one per feature pair, with setosa in red, versicolor in blue, virginica in green.)

iris.data[0]
array([5.1, 3.5, 1.4, 0.2])
knn_clf = KNNClassifier(6)
knn_clf.fit(iris.data,iris.target)
score=knn_clf.score(iris.data,iris.target)
print('score=',score)
# predict expects a 2-D array of shape (n_samples, n_features)
x=np.array([[5.0,3.2,1.6,0.1]])
knn_clf.predict(x)[0]
score= 0.98
0

Note that this score is computed on the training data itself; a fair evaluation needs a held-out test set, which is exactly what the next section sets up.

2.2 Splitting into training and test sets (train_test_split)

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
X.shape,y.shape
((150, 4), (150,))
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

Shuffle the samples while preserving the X-y correspondence: shuffle one array of indices and use it to index both X and y.

# permutation returns a random permutation of 0..len(X)-1 (here 0..149)
shuffled_indexes = np.random.permutation(len(X))
shuffled_indexes
array([ 95, 106, 115, 148,  96,  21,  26,  22,  28,  71,  25,  79,  17,
        14, 119, 124,  62, 144,  85,  83, 133, 108,   6,  56,  59,  73,
        47,  89,  97,  36, 126,  37,  52,  91,  19, 112, 116,  66, 142,
       146, 135, 141,  23, 143,  44, 132, 137,  42,  69, 125, 149,  63,
        31,  46,  74,  10,  72, 128,  98,  75,  40, 100,  45, 145,  11,
        77,  20,  15,  88, 131,  12,  57,   0,  99,  67,   2, 113, 107,
       118,  38,  93,  50,  54,  82,  81, 109, 123, 114,  65,  87, 147,
         9, 140, 110, 121,   8,  78,  61,  27,  24,  68,  80,  70, 101,
         4,   1,  64,  90, 129,  18,   3,  94,  33,  48,  76,  53,  32,
       130, 105,  60,  13,  49, 139, 117,  16, 120, 134, 127,  34,  29,
       136, 104,  84,   7, 102, 138,  35, 122, 111,  30,  86, 103,  58,
        92,  43,  51,  39,  41,  55,   5])
# fraction of the data reserved for the test set
test_ratio = 0.2
test_size = int(len(X)*test_ratio)
test_size
30
test_indexes = shuffled_indexes[:test_size]
train_indexes = shuffled_indexes[test_size:]
X_train,y_train = X[train_indexes],y[train_indexes]
X_test,y_test = X[test_indexes],y[test_indexes]
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)
(120, 4) (120,) (30, 4) (30,)
def train_test_split(X, y, test_ratio=0.2, seed=None):
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        'test_ratio must be valid'
    if seed is not None:  # 'if seed:' would silently ignore seed=0
        np.random.seed(seed)

    # shuffle the indices once, then slice into test / train parts
    shuffled_indexes = np.random.permutation(len(X))

    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train, y_train = X[train_indexes], y[train_indexes]
    X_test, y_test = X[test_indexes], y[test_indexes]
    return X_train, y_train, X_test, y_test
knn_clf = KNNClassifier(3)
X_train,y_train,X_test,y_test = train_test_split(iris.data,iris.target,0.2)
knn_clf.fit(X_train,y_train)
score=knn_clf.score(X_test,y_test)
print('score=',score)
# or, equivalently, compute the accuracy by hand:
y_predict = knn_clf.predict(X_test)
score = sum(y_predict==y_test)/len(y_test)
print('score=',score)
# predict expects a 2-D array of shape (n_samples, n_features)
x=np.array([[5.0,3.2,1.6,0.1]])
knn_clf.predict(x)[0]
score= 0.9666666666666667
score= 0.9666666666666667
0
# sklearn ships its own train_test_split:
from sklearn.model_selection import train_test_split
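Note that sklearn's train_test_split returns the four arrays in a different order than the hand-rolled version above, and takes the split fraction and seed as keyword arguments. A minimal sketch (the random_state value is arbitrary):

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)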

2.3 Grid search over the hyperparameters

① Finding the best k

from sklearn.neighbors import KNeighborsClassifier
best_score = 0.0
best_k = -1
for k in range(1,11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train,y_train)
    score = knn_clf.score(X_test,y_test)
    if score > best_score:
        best_score = score
        best_k = k
print('best_k =',best_k)
best_k = 1

(If the best k had landed on the upper edge of the search range, say k = 10, the range should be widened and the search rerun.)

② Searching over the weights parameter (uniform vs. distance-weighted voting).
See https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier:

The default value, weights = 'uniform', assigns uniform weights to each neighbor. weights = 'distance' assigns weights proportional to the inverse of the distance from the query point. Alternatively, a user-defined function of the distance can be supplied to compute the weights.
best_weight = ''
best_score = 0.0
best_k = -1
for weight in ['uniform','distance']:
    for k in range(1,11):
        knn_clf = KNeighborsClassifier(n_neighbors=k,weights=weight)
        knn_clf.fit(X_train,y_train)
        score = knn_clf.score(X_test,y_test)
        if score > best_score:
            best_score = score
            best_k = k
            best_weight = weight

print('best_k =',best_k)
print('best_score =',best_score)
print('best_weight =',best_weight)
best_k = 1
best_score = 0.9666666666666667
best_weight = uniform

③ Searching over the exponent p of the Minkowski distance
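For reference, the Minkowski distance with exponent p generalizes the Manhattan distance (p = 1) and the Euclidean distance (p = 2):

$$d_p(a, b) = \left(\sum_{j=1}^{n} \left|a_j - b_j\right|^p\right)^{1/p}$$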

%%time
best_p = -1
best_score = 0.0
best_k = -1
for k in range(1,11):
    for p in range(1,6):
        knn_clf = KNeighborsClassifier(n_neighbors=k,weights='distance',p=p)
        knn_clf.fit(X_train,y_train)
        score = knn_clf.score(X_test,y_test)
        if score > best_score:
            best_k = k
            best_score = score
            best_p = p
        
print('best_k =',best_k)
print('best_p =',best_p)
print('best_score =',best_score)

best_k = 1
best_p = 1
best_score = 0.9666666666666667
Wall time: 224 ms

2.4 Grid search in sklearn (GridSearchCV)

param_grid = [
    {
        'weights':['distance'],
        'n_neighbors':[i for i in range(1,11)],
        'metric':['euclidean','chebyshev','minkowski'],
        'p':[i for i in range(1,6)]
    },
    {
        'weights':['uniform'],
        'n_neighbors':[i for i in range(1,11)]
    }
]
knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf,param_grid,n_jobs=-1,verbose=2)

%%time
grid_search.fit(X_train,y_train)
Fitting 5 folds for each of 160 candidates, totalling 800 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:    2.3s finished

Wall time: 2.46 s
GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'metric': ['euclidean', 'chebyshev', 'minkowski'],
                          'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)
grid_search.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')
grid_search.best_score_
0.9666666666666668
grid_search.best_params_
{'metric': 'euclidean', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}

(Since the selected metric is 'euclidean', the reported p has no effect; p only matters when metric='minkowski'.)
knn_clf = grid_search.best_estimator_
knn_clf.score(X_test,y_test)
0.9666666666666667

III Data Normalization

3.1 Min-max normalization
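Min-max normalization maps every feature linearly into [0, 1]:

$$x_{\text{scale}} = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$$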

x = np.random.randint(0,100,size=100)
x
array([77, 64, 74,  4, 46, 16, 49, 69, 93,  9, 85, 90, 18, 42, 67, 58, 89,
       92, 66, 91, 50, 53, 15, 78, 12, 28, 68, 72, 96, 79, 78,  9, 81, 42,
       60, 65, 43, 19, 45, 32,  7, 94, 82,  6, 57, 74, 71, 41, 47,  3,  7,
       36, 66, 73, 63, 28, 78, 40, 90,  9, 92,  3, 42, 67, 18, 53, 84, 34,
       76, 67, 74, 34, 32,  4, 73, 24, 81, 99,  7, 72,  9, 51, 59, 15, 24,
       99,  0, 23, 52, 63,  3, 27, 95, 60, 16, 56, 67, 63, 62, 40])
(x-np.min(x))/(np.max(x)-np.min(x))
array([0.77777778, 0.64646465, 0.74747475, 0.04040404, 0.46464646,
       0.16161616, 0.49494949, 0.6969697 , 0.93939394, 0.09090909,
       0.85858586, 0.90909091, 0.18181818, 0.42424242, 0.67676768,
       0.58585859, 0.8989899 , 0.92929293, 0.66666667, 0.91919192,
       0.50505051, 0.53535354, 0.15151515, 0.78787879, 0.12121212,
       0.28282828, 0.68686869, 0.72727273, 0.96969697, 0.7979798 ,
       0.78787879, 0.09090909, 0.81818182, 0.42424242, 0.60606061,
       0.65656566, 0.43434343, 0.19191919, 0.45454545, 0.32323232,
       0.07070707, 0.94949495, 0.82828283, 0.06060606, 0.57575758,
       0.74747475, 0.71717172, 0.41414141, 0.47474747, 0.03030303,
       0.07070707, 0.36363636, 0.66666667, 0.73737374, 0.63636364,
       0.28282828, 0.78787879, 0.4040404 , 0.90909091, 0.09090909,
       0.92929293, 0.03030303, 0.42424242, 0.67676768, 0.18181818,
       0.53535354, 0.84848485, 0.34343434, 0.76767677, 0.67676768,
       0.74747475, 0.34343434, 0.32323232, 0.04040404, 0.73737374,
       0.24242424, 0.81818182, 1.        , 0.07070707, 0.72727273,
       0.09090909, 0.51515152, 0.5959596 , 0.15151515, 0.24242424,
       1.        , 0.        , 0.23232323, 0.52525253, 0.63636364,
       0.03030303, 0.27272727, 0.95959596, 0.60606061, 0.16161616,
       0.56565657, 0.67676768, 0.63636364, 0.62626263, 0.4040404 ])
X = np.random.randint(0,100,(50,3))
X[:10,:]
array([[12, 58, 95],
       [44, 60, 37],
       [37, 50, 11],
       [99, 62, 32],
       [20, 42, 84],
       [ 3, 29, 73],
       [37,  2, 61],
       [46, 29, 18],
       [77, 52, 71],
       [ 0, 20, 62]])
X = np.array(X, dtype=float)  # np.float is removed in recent NumPy; use the builtin float
X[:10,:]
array([[12., 58., 95.],
       [44., 60., 37.],
       [37., 50., 11.],
       [99., 62., 32.],
       [20., 42., 84.],
       [ 3., 29., 73.],
       [37.,  2., 61.],
       [46., 29., 18.],
       [77., 52., 71.],
       [ 0., 20., 62.]])

By convention the rows of X index the samples and the columns index the features, so we normalize each column of X separately.

for i in range(X.shape[1]):
    X[:,i] = (X[:,i]-np.min(X[:,i]))/(np.max(X[:,i])-np.min(X[:,i]))
X[:10,:]
array([[0.12121212, 0.61538462, 0.96842105],
       [0.44444444, 0.63736264, 0.35789474],
       [0.37373737, 0.52747253, 0.08421053],
       [1.        , 0.65934066, 0.30526316],
       [0.2020202 , 0.43956044, 0.85263158],
       [0.03030303, 0.2967033 , 0.73684211],
       [0.37373737, 0.        , 0.61052632],
       [0.46464646, 0.2967033 , 0.15789474],
       [0.77777778, 0.54945055, 0.71578947],
       [0.        , 0.1978022 , 0.62105263]])
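The column-wise loop can also be written as one broadcasted expression (a sketch that returns a new array instead of modifying X in place):

X_scaled = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))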

In the scatter plot below, all normalized values lie between 0 and 1.

plt.scatter(X[:,0],X[:,1])
<matplotlib.collections.PathCollection at 0x1ec4bf197b8>

(Figure: scatter plot of the first two normalized columns; all values fall in [0, 1].)

3.2 Standardization (zero mean, unit variance)
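Standardization recenters each feature to mean 0 and rescales it to standard deviation 1:

$$x_{\text{scale}} = \frac{x - \mu}{\sigma}$$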

from sklearn import datasets
iris = datasets.load_iris()
iris.data.shape
(150, 4)
X=iris.data
for i in range(X.shape[1]):
    X[:,i] = (X[:,i]-np.mean(X[:,i]))/(np.std(X[:,i]))
X[:10,:]
array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ],
       [-0.53717756,  1.93979142, -1.16971425, -1.05217993],
       [-1.50652052,  0.78880759, -1.34022653, -1.18381211],
       [-1.02184904,  0.78880759, -1.2833891 , -1.3154443 ],
       [-1.74885626, -0.36217625, -1.34022653, -1.3154443 ],
       [-1.14301691,  0.09821729, -1.2833891 , -1.44707648]])

After standardization, the values are no longer confined to the interval [0, 1].

plt.scatter(X[:,0],X[:,1])
<matplotlib.collections.PathCollection at 0x1ec4bf28f98>

(Figure: scatter plot of the first two standardized columns, centered around 0.)

np.mean(X[:,0]),np.std(X[:,0])
(-4.736951571734001e-16, 1.0)

The first column now has mean 0 (up to floating-point rounding) and standard deviation 1, as expected.

3.3 Normalization with sklearn's StandardScaler

Note that the scaler is fit on the training set only; its training-set statistics (mean_ and scale_) are then used to transform both the training and the test data.

from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
standardScaler.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
standardScaler.mean_
array([5.84416667, 3.07166667, 3.76916667, 1.19916667])
standardScaler.scale_
array([0.82833124, 0.42390709, 1.77940982, 0.77540267])
X_train_standard = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train_standard,y_train)
knn_clf.score(X_test_standard,y_test)
0.9333333333333333
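The scale-then-classify pattern can also be bundled into a Pipeline, which guarantees the scaler is fit only on the training data (a sketch reusing the estimators imported above):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

pipe = Pipeline([
    ('scaler', StandardScaler()),                  # fit on the training split only
    ('knn', KNeighborsClassifier(n_neighbors=3)),
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)  # X_test is transformed with the training statistics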

Appendix: converting image pixel values between the ranges 0-255 and 0-1

def trans(img):
    # map each channel of img from [min, max] to [0, 1]
    re = np.zeros(img.shape)
    for i in range(img.shape[2]):
        xmax = max(map(max, img[:,:,i]))  # largest value in this channel
        xmin = min(map(min, img[:,:,i]))  # smallest value in this channel
        ymax = 1  # upper bound of the target range
        ymin = 0  # lower bound of the target range
        for j in range(img.shape[0]):
            for k in range(img.shape[1]):
                re[j,k,i] = (ymax-ymin)*(img[j,k,i]-xmin)/(xmax-xmin)+ymin  # i.e. (img[j,k,i]-xmin)/(xmax-xmin)
    return re

def trans_reverse(img):
    # map each channel of img back from [0, 1] to [0, 255]
    re = np.zeros(img.shape)
    for i in range(img.shape[2]):
        xmax = max(map(max, img[:,:,i]))  # largest value in this channel
        xmin = min(map(min, img[:,:,i]))  # smallest value in this channel
        ymax = 255  # upper bound of the target range
        ymin = 0    # lower bound of the target range
        for j in range(img.shape[0]):
            for k in range(img.shape[1]):
                re[j,k,i] = round((ymax-ymin)*(img[j,k,i]-xmin)/(xmax-xmin)+ymin)
                # i.e. re[j,k,i] = round(255 * (img[j,k,i]-xmin)/(xmax-xmin))
    return re.astype(np.int32)
import matplotlib.pyplot as plt
import numpy as np
im1 = plt.imread('f:/1/fj1.jpg')
print(im1.shape)
im1_trans=trans(im1)
im1_trans_reverse=trans_reverse(im1_trans)
im1_trans_reverse
(313, 500, 3)
array([[[178, 177, 172],
        [176, 175, 170],
        [175, 174, 169],
        ...,
        [156, 153, 146],
        [156, 153, 146],
        [157, 154, 147]],

       [[176, 175, 170],
        [175, 174, 169],
        [173, 172, 167],
        ...,
        [152, 149, 142],
        [153, 150, 143],
        [154, 151, 144]],

       [[175, 174, 169],
        [173, 172, 167],
        [171, 170, 165],
        ...,
        [150, 147, 140],
        [151, 148, 141],
        [152, 149, 142]],

       ...,

       [[231, 227, 224],
        [230, 226, 223],
        [232, 228, 225],
        ...,
        [153, 150, 145],
        [155, 152, 147],
        [166, 163, 158]],

       [[171, 167, 164],
        [201, 197, 194],
        [218, 214, 211],
        ...,
        [199, 195, 192],
        [198, 194, 191],
        [204, 200, 197]],

       [[220, 216, 213],
        [235, 231, 228],
        [217, 213, 210],
        ...,
        [197, 193, 190],
        [203, 199, 196],
        [217, 213, 210]]])
plt.figure(figsize=(10,30))
plt.subplot(131)
plt.imshow(im1)
plt.subplot(132)
plt.imshow(im1_trans)
plt.subplot(133)
plt.imshow(im1_trans_reverse)
<matplotlib.image.AxesImage at 0x220ca303358>

(Figure: the original image, its 0-1 version, and the reconstructed 0-255 version side by side.)
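As an aside, the per-pixel Python loops above are slow; the same per-channel rescaling can be vectorized with NumPy (a sketch under the same assumption of an H x W x C array):

def trans_vec(img):
    # per-channel min-max scaling to [0, 1], vectorized over all pixels
    img = img.astype(np.float64)
    xmin = img.min(axis=(0, 1), keepdims=True)  # per-channel minima, shape (1, 1, C)
    xmax = img.max(axis=(0, 1), keepdims=True)  # per-channel maxima
    return (img - xmin) / (xmax - xmin)

def trans_reverse_vec(img):
    # per-channel rescaling back to integers in [0, 255]
    xmin = img.min(axis=(0, 1), keepdims=True)
    xmax = img.max(axis=(0, 1), keepdims=True)
    return np.round(255 * (img - xmin) / (xmax - xmin)).astype(np.int32)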

