KNN算法

Stannis

于 2021-10-26 21:46:51 发布

阅读量122

点赞数

分类专栏：数据分析 python 文章标签：算法 python 开发语言

本文链接：https://blog.csdn.net/Stannis/article/details/120962416

版权

数据分析同时被 2 个专栏收录

16 篇文章 1 订阅

订阅专栏

python

8 篇文章 0 订阅

订阅专栏

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Training samples: six points with two features each
x_data = np.array([
    [118, 20],
    [20, 56],
    [99, 78],
    [56, 34],
    [90, 10],
    [88, 45],
])
x_data

array([[118,  20],
       [ 20,  56],
       [ 99,  78],
       [ 56,  34],
       [ 90,  10],
       [ 88,  45]])

# Class labels for the six training samples: three 'A' followed by three 'B'
y_data = np.array(['A'] * 3 + ['B'] * 3)
y_data

array(['A', 'A', 'A', 'B', 'B', 'B'], dtype='<U1')

# Query point to classify
x_test = np.array((30, 20))
x_test

array([30, 20])

# Plot the two labelled classes and the query point.
# Rows 0-2 of x_data are labelled 'A' and rows 3-5 are labelled 'B' in y_data.
plt.scatter(x_data[:3, 0], x_data[:3, 1], c='b', label='A类')
plt.scatter(x_data[3:, 0], x_data[3:, 1], c='g', label='B类')
plt.scatter(x_test[0], x_test[1], c='hotpink', label='测试数据')
# Without legend() the label= arguments above are never displayed.
plt.legend()
# NOTE(review): the CJK labels may render as empty boxes unless a font with
# Chinese glyphs is configured for matplotlib — confirm on the target machine.
plt.show()

在这里插入图片描述

欧氏距离：$d(x, y) = \sqrt{\sum_i (x_i - y_i)^2}$，下面分步计算测试点到每个训练样本的距离。

x_data

array([[118,  20],
       [ 20,  56],
       [ 99,  78],
       [ 56,  34],
       [ 90,  10],
       [ 88,  45]])

# Element-wise squared difference between the query point and every training row
diffMat = np.square(x_test - x_data)
diffMat

array([[7744,    0],
       [ 100, 1296],
       [4761, 3364],
       [ 676,  196],
       [3600,  100],
       [3364,  625]], dtype=int32)

# Row-wise total of the squared differences (one value per row of diffMat)
diffMat.sum(axis=1)

array([7744, 1396, 8125,  872, 3700, 3989], dtype=int32)

# Euclidean distance: square root of the per-row sum of squared differences
distance = np.sqrt(np.sum(diffMat, axis=1))
distance

array([88.        , 37.36308338, 90.13878189, 29.52964612, 60.8276253 ,
       63.1585307 ])

# Indices of the training samples ordered from nearest to farthest
sort_distance = np.argsort(distance)
sort_distance

array([3, 1, 4, 5, 0, 2], dtype=int64)

排序结果的第一个索引是 3，即第四个样本（下标从 0 开始）距离测试点最近。

# Labels of the k = 5 training samples closest to the query point
k = 5
classCount = [y_data[idx] for idx in sort_distance[:k]]

classCount

['B', 'A', 'B', 'B', 'A']

统计 k 个近邻中出现次数最多的类别，作为测试点的预测类别

from collections import Counter

# Majority vote: the label with the highest count among the neighbours wins
label_counts = Counter(classCount)
y_test = label_counts.most_common(1)[0][0]
y_test

'B'

sklearn实现KNN算法

from sklearn.neighbors  import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the telecom churn table; subscriberID is read as a string column
df = pd.read_csv(
    r'E:\telecom_churn.csv',
    dtype={'subscriberID': str},
    float_precision=None,
    skipinitialspace=True,
)

df.head()

	subscriberID	churn	gender	AGE	edu_class	incomeCode	duration	peakMinAv	peakMinDiff	negTrend	nrProm	prom	curPlan	avgplan	planChange	posPlanChange	call_10086
0	19164958.000000	1.0	0.0	20.0	2.0	12.0	16.0	113.666667	-8.0	1.0	0.0	0.0	1.0	1.0	0.0	0.0	0.0
1	39244924.000000	1.0	1.0	20.0	0.0	21.0	5.0	274.000000	-371.0	1.0	2.0	1.0	3.0	2.0	2.0	1.0	1.0
2	39578413.000000	1.0	0.0	11.0	1.0	47.0	3.0	392.000000	-784.0	1.0	0.0	0.0	3.0	3.0	0.0	0.0	1.0
3	40992265.000000	1.0	0.0	43.0	0.0	4.0	12.0	31.000000	-76.0	1.0	2.0	1.0	3.0	3.0	0.0	0.0	1.0
4	43061957.000000	1.0	1.0	60.0	0.0	9.0	14.0	129.333333	-334.0	1.0	0.0	0.0	3.0	3.0	0.0	0.0	0.0

# (rows, columns) of the loaded table
df.shape

(3463, 20)

# Continuous-valued columns used as model inputs
var = [
    'AGE',
    'duration',
    'peakMinAv',
    'peakMinDiff',
    'nrProm',
    'call_10086',
]

# Feature matrix (independent variables)
x = df[var]
x.head()

	AGE	duration	peakMinAv	peakMinDiff	nrProm	call_10086
0	20.0	16.0	113.666667	-8.0	0.0	0.0
1	20.0	5.0	274.000000	-371.0	2.0	1.0
2	11.0	3.0	392.000000	-784.0	0.0	1.0
3	43.0	12.0	31.000000	-76.0	2.0	1.0
4	60.0	14.0	129.333333	-334.0	0.0	0.0

# Target labels: the 'churn' column of df
y = df['churn']
y.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: churn, dtype: float64

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1234,
)

# Build a 5-nearest-neighbour classifier
# NOTE(review): KNN is distance-based and the feature columns are on very
# different scales (e.g. peakMinDiff vs nrProm); consider standardising the
# features before fitting — confirm whether the unscaled fit is intentional.
model = KNeighborsClassifier(n_neighbors=5)

# Fit on the training split
model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

# Predict a label for every row of the held-out x_test
prediction = model.predict(x_test)

# Display the predicted labels
prediction

array([1., 0., 0., ..., 1., 0., 1.])

# Accuracy of the predictions against the held-out labels
metrics.accuracy_score(y_test,prediction)

0.7064485081809432

# Candidate values of K to evaluate: 3 through 99 inclusive
K = np.arange(start=3, stop=100)
K

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
       88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

# Fit one classifier per candidate K and record its accuracy on the test split.
# NOTE(review): K is tuned on the same split that is used to report accuracy,
# so the best score is optimistic; a validation split or cross-validation
# would give a fairer estimate — confirm whether this is acceptable here.
accuracy = []
for k in K:
    # fit() returns the estimator itself, so construction and training chain
    model = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    prediction = model.predict(x_test)
    score = metrics.accuracy_score(y_test, prediction)
    accuracy.append(score)

# Accuracy as a function of K
plt.plot(K, accuracy)
plt.show()

在这里插入图片描述

# Position of the highest accuracy in the sweep results
argmax = np.array(accuracy).argmax()

argmax

# Best K value: the candidate at that position
K[argmax]

# Accuracy reached at the best K. Indexing with argmax instead of a
# hard-coded 13 keeps this correct if the data, split or K range changes.
accuracy[argmax]

0.7189605389797883

Stannis

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
KNN算法

import numpy as npimport pandas as pdimport matplotlib.pyplot as pltx_data = np.array([[118,20],[20,56],[99,78],[56,34],[90,10],[88,45]])x_dataarray([[118, 20], [ 20, 56], [ 99, 78], [ 56, 34], [ 90, 10], [ 88
复制链接

扫一扫