import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Training samples: six two-feature points, one per row.
x_data = np.array([
    [118, 20],
    [20, 56],
    [99, 78],
    [56, 34],
    [90, 10],
    [88, 45],
])
x_data
array([[118, 20],
[ 20, 56],
[ 99, 78],
[ 56, 34],
[ 90, 10],
[ 88, 45]])
# Class label of each training sample, in the same order as the rows of x_data.
y_data = np.array(list('AAABBB'))
y_data
array(['A', 'A', 'A', 'B', 'B', 'B'], dtype='<U1')
# Test sample: the single point whose class is to be predicted.
x_test = np.asarray([30, 20])
x_test
array([30, 20])
# Scatter plot of the training samples (rows 0-2 in blue, rows 3-5 in green)
# together with the test point.
plt.scatter(x_data[:3, 0], x_data[:3, 1], c='b', label='A类')
plt.scatter(x_data[3:, 0], x_data[3:, 1], c='g', label='B类')
plt.scatter(x_test[0], x_test[1], c='hotpink', label='测试数据')
# The label= arguments above are only displayed once a legend is drawn.
plt.legend()
plt.show()
欧氏距离
x_data
array([[118, 20],
[ 20, 56],
[ 99, 78],
[ 56, 34],
[ 90, 10],
[ 88, 45]])
# Squared per-coordinate difference between the test point and every training
# sample (x_test is broadcast across the rows of x_data).
diffMat = np.square(x_test - x_data)
diffMat
array([[7744, 0],
[ 100, 1296],
[4761, 3364],
[ 676, 196],
[3600, 100],
[3364, 625]], dtype=int32)
diffMat.sum(axis=1)
array([7744, 1396, 8125, 872, 3700, 3989], dtype=int32)
# Euclidean distance from the test point to each training sample:
# add up the squared differences along each row, then take the square root.
squared_dist = diffMat.sum(axis=1)
distance = np.sqrt(squared_dist)
distance
array([88. , 37.36308338, 90.13878189, 29.52964612, 60.8276253 ,
63.1585307 ])
# Indices of the training samples ordered from nearest to farthest.
sort_distance = np.argsort(distance)
sort_distance
array([3, 1, 4, 5, 0, 2], dtype=int64)
排序结果的第一个索引是 3，即第四个样本距离测试点最近
# Labels of the k training samples closest to the test point.
k = 5
# sort_distance is ordered nearest-first, so its first k entries are the
# k nearest neighbours.
classCount = [y_data[idx] for idx in sort_distance[:k]]
classCount
['B', 'A', 'B', 'B', 'A']
求最多类别
from collections import Counter
# Majority vote: the label that occurs most often among the neighbours wins.
label_votes = Counter(classCount)
top_label, _top_votes = label_votes.most_common(1)[0]
y_test = top_label
y_test
'B'
sklearn实现KNN算法
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Load the telecom churn data set; subscriberID is read as text rather than
# being parsed as a number.
df = pd.read_csv(
    r'E:\telecom_churn.csv',
    dtype={'subscriberID': str},
    float_precision=None,
    skipinitialspace=True,
)
df.head()
subscriberID | churn | gender | AGE | edu_class | incomeCode | duration | feton | peakMinAv | peakMinDiff | posTrend | negTrend | nrProm | prom | curPlan | avgplan | planChange | posPlanChange | negPlanChange | call_10086 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 19164958.000000 | 1.0 | 0.0 | 20.0 | 2.0 | 12.0 | 16.0 | 0.0 | 113.666667 | -8.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 39244924.000000 | 1.0 | 1.0 | 20.0 | 0.0 | 21.0 | 5.0 | 0.0 | 274.000000 | -371.0 | 0.0 | 1.0 | 2.0 | 1.0 | 3.0 | 2.0 | 2.0 | 1.0 | 0.0 | 1.0 |
2 | 39578413.000000 | 1.0 | 0.0 | 11.0 | 1.0 | 47.0 | 3.0 | 0.0 | 392.000000 | -784.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 40992265.000000 | 1.0 | 0.0 | 43.0 | 0.0 | 4.0 | 12.0 | 0.0 | 31.000000 | -76.0 | 0.0 | 1.0 | 2.0 | 1.0 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 43061957.000000 | 1.0 | 1.0 | 60.0 | 0.0 | 9.0 | 14.0 | 0.0 | 129.333333 | -334.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 |
df.shape
(3463, 20)
# Continuous-valued columns used as model inputs
var = ['AGE','duration','peakMinAv','peakMinDiff','nrProm','call_10086']
# Independent variables (feature matrix)
x = df[var]
x.head()
AGE | duration | peakMinAv | peakMinDiff | nrProm | call_10086 | |
---|---|---|---|---|---|---|
0 | 20.0 | 16.0 | 113.666667 | -8.0 | 0.0 | 0.0 |
1 | 20.0 | 5.0 | 274.000000 | -371.0 | 2.0 | 1.0 |
2 | 11.0 | 3.0 | 392.000000 | -784.0 | 0.0 | 1.0 |
3 | 43.0 | 12.0 | 31.000000 | -76.0 | 2.0 | 1.0 |
4 | 60.0 | 14.0 | 129.333333 | -334.0 | 0.0 | 0.0 |
# Target label: the churn column
y = df['churn']
y.head()
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
Name: churn, dtype: float64
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1234,
)
# Build a 5-nearest-neighbour classifier ...
model = KNeighborsClassifier(n_neighbors=5)
# ... and fit it on the training split.
model.fit(x_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
prediction = model.predict(x_test)
prediction
array([1., 0., 0., ..., 1., 0., 1.])
metrics.accuracy_score(y_test,prediction)
0.7064485081809432
# Candidate values of k to evaluate: every integer from 3 up to 99.
K = np.arange(3, 100, 1)
K
array([ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])
# Fit one classifier per candidate k and record its accuracy on the held-out
# split.
# NOTE(review): k is chosen on the same split that is used to report accuracy,
# so the best score is optimistic; a separate validation split or
# cross-validation would give a fairer estimate.
accuracy = []
for k in K:
    # Build the model for this k
    model = KNeighborsClassifier(n_neighbors=k)
    # Fit on the training split
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    accuracy.append(metrics.accuracy_score(y_test, prediction))
# Accuracy as a function of k
plt.plot(K, accuracy)
plt.show()
# Position of the highest accuracy within the sweep.
argmax = np.argmax(accuracy)
argmax
13
#最佳K值
K[argmax]
16
# Accuracy reached at the best k; indexed with argmax instead of a literal
# copied from an earlier run, so it stays correct if the data or split changes.
accuracy[argmax]
0.7189605389797883