KNN算法

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Training samples: six 2-D points, one row per sample.
x_data = np.array([
    [118, 20],
    [20, 56],
    [99, 78],
    [56, 34],
    [90, 10],
    [88, 45],
])
x_data
array([[118,  20],
       [ 20,  56],
       [ 99,  78],
       [ 56,  34],
       [ 90,  10],
       [ 88,  45]])
# Class label of each training sample: the first three rows are 'A', the last three 'B'.
y_data = np.array(['A'] * 3 + ['B'] * 3)
y_data
array(['A', 'A', 'A', 'B', 'B', 'B'], dtype='<U1')
# Test sample: the single 2-D point to be classified.
x_test = np.array((30, 20))
x_test
array([30, 20])
# Scatter the two training classes and the test point.
# The first three rows of x_data are class A and the last three are class B
# (matching y_data), hence the fixed row slices.
plt.scatter(x_data[:3, 0], x_data[:3, 1], c='b', label='A类')
plt.scatter(x_data[3:, 0], x_data[3:, 1], c='g', label='B类')
plt.scatter(x_test[0], x_test[1], c='hotpink', label='测试数据')
# The label= arguments above are only displayed once a legend is drawn.
# NOTE(review): the labels contain CJK characters; the default matplotlib font
# may lack those glyphs — confirm a CJK-capable font is configured.
plt.legend()
plt.show()

（图：A 类、B 类训练样本与测试点的散点图）

欧氏距离

x_data
array([[118,  20],
       [ 20,  56],
       [ 99,  78],
       [ 56,  34],
       [ 90,  10],
       [ 88,  45]])
# Squared per-coordinate difference between the test point and every training sample
# (x_test is broadcast across the rows of x_data).
diffMat = np.square(x_test - x_data)
diffMat
array([[7744,    0],
       [ 100, 1296],
       [4761, 3364],
       [ 676,  196],
       [3600,  100],
       [3364,  625]], dtype=int32)
diffMat.sum(axis=1)
array([7744, 1396, 8125,  872, 3700, 3989], dtype=int32)
# Euclidean distance to each training sample: sum the squared differences per row, then take the root.
distance = np.sqrt(np.sum(diffMat, axis=1))
distance
array([88.        , 37.36308338, 90.13878189, 29.52964612, 60.8276253 ,
       63.1585307 ])
# Indices of the training samples ordered from nearest to farthest.
sort_distance = np.argsort(distance)
sort_distance
array([3, 1, 4, 5, 0, 2], dtype=int64)

排序结果的第一个索引是 3，即第四个训练样本（下标为 3）离测试点最近。

# Collect the labels of the k nearest training samples, closest first.
k = 5
classCount = [y_data[idx] for idx in sort_distance[:k]]
classCount
['B', 'A', 'B', 'B', 'A']

求出现次数最多的类别（多数表决）

from collections import Counter
# Majority vote: the predicted class is the label that occurs most often among the neighbours.
label_counts = Counter(classCount)
y_test = label_counts.most_common(1)[0][0]
y_test
'B'

sklearn实现KNN算法

from sklearn.neighbors  import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Load the churn dataset; subscriberID is read as text and spaces after
# delimiters are skipped.
df = pd.read_csv(
    r'E:\telecom_churn.csv',
    dtype={'subscriberID': str},
    float_precision=None,
    skipinitialspace=True,
)
df.head()
subscriberIDchurngenderAGEedu_classincomeCodedurationfetonpeakMinAvpeakMinDiffposTrendnegTrendnrPrompromcurPlanavgplanplanChangeposPlanChangenegPlanChangecall_10086
019164958.0000001.00.020.02.012.016.00.0113.666667-8.00.01.00.00.01.01.00.00.00.00.0
139244924.0000001.01.020.00.021.05.00.0274.000000-371.00.01.02.01.03.02.02.01.00.01.0
239578413.0000001.00.011.01.047.03.00.0392.000000-784.00.01.00.00.03.03.00.00.00.01.0
340992265.0000001.00.043.00.04.012.00.031.000000-76.00.01.02.01.03.03.00.00.00.01.0
443061957.0000001.01.060.00.09.014.00.0129.333333-334.00.01.00.00.03.03.00.00.00.00.0
df.shape
(3463, 20)
# Continuous-valued columns used as model inputs.
var = ['AGE', 'duration', 'peakMinAv', 'peakMinDiff', 'nrProm', 'call_10086']
# Feature matrix (independent variables): only the columns listed above.
x = df.loc[:, var]
x.head()
AGEdurationpeakMinAvpeakMinDiffnrPromcall_10086
020.016.0113.666667-8.00.00.0
120.05.0274.000000-371.02.01.0
211.03.0392.000000-784.00.01.0
343.012.031.000000-76.02.01.0
460.014.0129.333333-334.00.00.0
# Target labels: the churn column.
y = df.loc[:, 'churn']
y.head()
0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: churn, dtype: float64
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1234,
)
# Initialise a classifier that votes among the 5 nearest neighbours.
# NOTE(review): the feature columns are passed in unscaled; if they are on
# different scales, the large-magnitude ones will dominate the distance —
# consider standardising them first.
model = KNeighborsClassifier(n_neighbors=5)
# Fit the model on the training split.
model.fit(x_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
prediction = model.predict(x_test)
prediction
array([1., 0., 0., ..., 1., 0., 1.])
metrics.accuracy_score(y_test,prediction)
0.7064485081809432
# Candidate numbers of neighbours to try: every integer from 3 to 99 inclusive.
K = np.arange(start=3, stop=100)
K
array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
       88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])
# Fit one classifier per candidate k and record its accuracy on the held-out split.
# NOTE(review): k is chosen using the same x_test/y_test split that reports
# accuracy, so the best score found here is optimistic — consider
# cross-validation on the training data instead.
accuracy = []
for k in K:
    # Build and fit the model for this k (fit returns the estimator itself).
    model = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    prediction = model.predict(x_test)
    score = metrics.accuracy_score(y_test, prediction)
    accuracy.append(score)
# Accuracy as a function of k.
plt.plot(K, accuracy)
plt.show()

（图：准确率随 K 值变化的折线图）

# Position of the highest accuracy in the list (the first one if there are ties).
argmax = np.argmax(accuracy)
argmax
13
#最佳K值
K[argmax]
16
# Accuracy achieved at the best k; indexed by argmax rather than a hard-coded
# position so it stays correct if the data or the candidate range changes.
accuracy[argmax]
0.7189605389797883
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值