1. 导入鸢尾花数据集
import numpy as np
import pandas as pd
from sklearn import datasets
# Load the iris CSV; the first row of the file holds the column names.
data = pd.read_csv("data/iris.csv", header=0)
data.head()
data.sample(10)

# Encode the species names as small non-negative integers so they can be
# counted with np.bincount later on.
species_codes = {
    "Iris-versicolor": 0,
    "Iris-virginica": 1,
    "Iris-setosa": 2,
}
data["Name"] = data["Name"].map(species_codes)

# Inspect which rows repeat an earlier row, then drop the repeats in place.
duplicate_mask = data.duplicated()
duplicate_mask
duplicate_mask.any()
data.drop_duplicates(inplace=True)

# Row count and per-class counts after de-duplication.
len(data)
data["Name"].value_counts()
0 50
1 49
2 48
Name: Name, dtype: int64
# Broadcasting demo: a length-3 vector minus a 2x3 matrix is computed row by
# row, giving a 2x3 result.
x = np.array([1, 2, 3])
y = np.array([[4, 5, 6], [7, 8, 9]])
diff = x - y
diff
# ndarray.sum() adds up every element and returns a single scalar ...
diff.sum()
# ... whereas the builtin sum() iterates over the rows and adds them
# element-wise, returning one total per column.
sum(diff)
array([-9, -9, -9])
2. KNN分类
class KNN:
    """K-nearest-neighbours classifier using Euclidean distance.

    Labels passed to ``fit`` must be non-negative integers because the
    votes are tallied with ``np.bincount``.
    """

    def __init__(self, k):
        """Store the number of neighbours to consult.

        Args:
            k: Number of nearest training samples that vote on each
                prediction. Must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            # A negative k would silently slice the wrong neighbours and
            # k == 0 would fail later inside argmax with an unclear error.
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, X, y):
        """Memorise the training set (KNN has no real training step).

        Args:
            X: Array-like, two-dimensional; one row of features per sample.
            y: Array-like, one-dimensional; the label of each sample.
        """
        self.X = np.asarray(X)
        self.y = np.asarray(y)

    def _nearest(self, x):
        """Return (indices, distances) of the k training rows closest to x."""
        dis = np.sqrt(np.sum((x - self.X) ** 2, axis=1))
        index = dis.argsort()[:self.k]
        return index, dis[index]

    def predict(self, X):
        """Predict labels by an unweighted majority vote of the k neighbours.

        Args:
            X: Array-like, two-dimensional; one row of features per query.

        Returns:
            A NumPy array with one predicted label per row of ``X``. Ties
            go to the smallest label because ``argmax`` returns the first
            maximum.
        """
        X = np.asarray(X)
        result = []
        for x in X:
            index, _ = self._nearest(x)
            count = np.bincount(self.y[index])
            result.append(count.argmax())
        return np.asarray(result)

    def predict2(self, X):
        """Predict labels by a distance-weighted vote of the k neighbours.

        Each neighbour votes with weight ``1 / distance``. When a query
        coincides with one or more of its neighbours (distance 0), only
        those exact matches vote, each with weight 1; this avoids a
        division by zero and the resulting infinite weights.

        Args:
            X: Array-like, two-dimensional; one row of features per query.

        Returns:
            A NumPy array with one predicted label per row of ``X``.
        """
        X = np.asarray(X)
        result = []
        for x in X:
            index, dis = self._nearest(x)
            exact = dis == 0
            if exact.any():
                # Exact matches dominate any finite weight, so let them
                # vote alone instead of computing 1 / 0.
                weights = exact.astype(float)
            else:
                weights = 1 / dis
            count = np.bincount(self.y[index], weights=weights)
            result.append(count.argmax())
        return np.asarray(result)
3. train_test_split
# Split the frame into one sub-frame per encoded class label.
t0, t1, t2 = (data[data["Name"] == code] for code in (0, 1, 2))

# Shuffle every class with a fixed seed so the split is reproducible.
t0, t1, t2 = (
    group.sample(len(group), random_state=666) for group in (t0, t1, t2)
)

# The first 40 shuffled rows of each class form the training set and the
# remaining rows the test set; the last column is the label, the others
# are the features.
per_class = (t0, t1, t2)
train_X = pd.concat([g.iloc[:40, :-1] for g in per_class], axis=0)
train_y = pd.concat([g.iloc[:40, -1] for g in per_class], axis=0)
test_X = pd.concat([g.iloc[40:, :-1] for g in per_class], axis=0)
test_y = pd.concat([g.iloc[40:, -1] for g in per_class], axis=0)

# Fit the hand-written classifier and compare its predictions with the
# held-out labels.
my_knn = KNN(k=3)
my_knn.fit(train_X, train_y)
result = my_knn.predict(test_X)
display(result)
display(test_y)
# Accuracy: fraction of test rows whose prediction equals the true label.
sum(result == test_y) / len(test_y)
array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2], dtype=int64)
93 0
77 0
86 0
96 0
59 0
56 0
80 0
95 0
52 0
94 0
127 1
136 1
147 1
109 1
106 1
130 1
146 1
102 1
145 1
38 2
49 2
9 2
6 2
30 2
47 2
2 2
46 2
Name: Name, dtype: int64
0.9259259259259259
# Repeat the evaluation with the distance-weighted vote.
result2 = my_knn.predict2(test_X)
hits = result2 == test_y
# Number of correct predictions, then the accuracy.
display(np.sum(hits))
display(sum(hits) / len(test_y))
25
0.9259259259259259
4. 可视化显示
import matplotlib as mpl
import matplotlib.pyplot as plt
# Use the SimHei font so the Chinese axis labels and title can be drawn;
# turning off axes.unicode_minus makes minus signs use the ASCII hyphen
# (presumably because the font lacks the Unicode minus glyph).
mpl.rcParams.update({
    "font.family": "SimHei",
    "axes.unicode_minus": False,
})
data.sample(10)
|     | SepalLength | SepalWidth | PetalLength | PetalWidth | Name |
|---|---|---|---|---|---|
| 48 | 5.3 | 3.7 | 1.5 | 0.2 | 2 |
| 76 | 6.8 | 2.8 | 4.8 | 1.4 | 0 |
| 115 | 6.4 | 3.2 | 5.3 | 2.3 | 1 |
| 35 | 5.0 | 3.2 | 1.2 | 0.2 | 2 |
| 16 | 5.4 | 3.9 | 1.3 | 0.4 | 2 |
| 13 | 4.3 | 3.0 | 1.1 | 0.1 | 2 |
| 130 | 7.4 | 2.8 | 6.1 | 1.9 | 1 |
| 135 | 7.7 | 3.0 | 6.1 | 2.3 | 1 |
| 20 | 5.4 | 3.4 | 1.7 | 0.2 | 2 |
| 45 | 4.8 | 3.0 | 1.4 | 0.3 | 2 |
plt.figure(figsize=(20, 10))

# Training samples: the first 40 rows of each shuffled class, one colour
# per species.
for frame, colour, species in (
    (t0, 'r', "Iris-versicolor"),
    (t1, 'b', "Iris-virginica"),
    (t2, 'g', "Iris-setosa"),
):
    plt.scatter(
        x=frame["SepalLength"][:40],
        y=frame["PetalLength"][:40],
        color=colour,
        label=species,
    )

# Test samples, separated into correctly and incorrectly classified rows.
right = test_X[result == test_y]
wrong = test_X[result != test_y]
for subset, colour, shape, caption in (
    (right, 'c', 'x', "right"),
    (wrong, 'm', '>', "wrong"),
):
    plt.scatter(
        subset["SepalLength"],
        subset["PetalLength"],
        color=colour,
        marker=shape,
        label=caption,
    )

plt.xlabel("花萼长度")
plt.ylabel("花瓣长度")
plt.title("KNN分类结果显示")
plt.legend(loc="best")
plt.show()
# Weighted bincount: every entry adds its weight (instead of 1) to the bin
# named by its value, so bin 0 gets 0.1 + 0.2 and bin 1 gets 0.3 + 0.2 + 0.4.
np.bincount([1,0,0,1,1],weights=[0.3,0.1,0.2,0.2,0.4])
array([0.3, 0.9])
# Unweighted bincount: counts how often each value occurs (two 0s, three 1s).
np.bincount([1,0,0,1,1])
array([2, 3], dtype=int64)