KNN鸢尾花分类
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
import numpy as np
# Load iris as a (features, labels) pair. return_X_y is keyword-only in
# current scikit-learn releases, so it has to be passed by name.
X, y = datasets.load_iris(return_X_y=True)
X = X[:, :2]  # keep only the first two feature columns so the data can be drawn in 2-D
plt.scatter(X[:, 0], X[:, 1], c=y)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)

# Build a dense grid of query points covering the plotted area.
x1 = np.linspace(4, 8, 100)  # horizontal axis: 4 to 8
y1 = np.linspace(2, 4.5, 80)  # vertical axis: 2 to 4.5
X1, Y1 = np.meshgrid(x1, y1)
X1 = X1.reshape(-1, 1)
Y1 = Y1.reshape(-1, 1)
X_test = np.concatenate([X1, Y1], axis=1)  # shape is (8000, 2)

from matplotlib.colors import ListedColormap
lc1 = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])  # light colours for the grid points
lc2 = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])  # saturated colours for the samples

# Colour every grid point by its predicted class, then draw the real samples on top.
y_ = knn.predict(X_test)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap=lc1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=lc2)
KNN参数的筛选
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import cross_val_score
# return_X_y must be passed by keyword; current scikit-learn releases reject
# the positional form load_iris(True).
X, y = datasets.load_iris(return_X_y=True)

# Baseline: default KNeighborsClassifier scored by accuracy with 6-fold cross-validation.
knn = KNeighborsClassifier()
score = cross_val_score(knn, X, y, scoring="accuracy", cv=6)
print(score)
# Cross-validated error rate (1 - mean accuracy) for each candidate k.
# The candidates are defined once so the loop and the plot cannot drift apart.
# Original note: the upper end is about sqrt(150) — presumably 150 is the
# number of samples; confirm.
k_values = range(1, 14)
errors = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X, y, scoring="accuracy", cv=6).mean()
    errors.append(1 - score)

import matplotlib.pyplot as plt
plt.plot(list(k_values), errors)
# Compare the two neighbour-weighting schemes at a fixed k of 12.
weights = ["uniform", "distance"]
for weighting in weights:
    knn = KNeighborsClassifier(n_neighbors=12, weights=weighting)
    fold_scores = cross_val_score(knn, X, y, scoring="accuracy", cv=6)
    print(fold_scores.mean())
0.98
0.9733333333333333
# Mean 6-fold accuracy for every (k, weighting) pair, keyed as "<weighting><k>".
result = {}
for k in range(1, 14):
    for weighting in weights:
        knn = KNeighborsClassifier(n_neighbors=k, weights=weighting)
        fold_scores = cross_val_score(knn, X, y, scoring="accuracy", cv=6)
        result[f"{weighting}{k}"] = fold_scores.mean()
result
{'uniform1': 0.96,
'distance1': 0.96,
'uniform2': 0.94,
'distance2': 0.96,
'uniform3': 0.9666666666666667,
'distance3': 0.9666666666666667,
'uniform4': 0.9666666666666667,
'distance4': 0.9666666666666667,
'uniform5': 0.9666666666666667,
'distance5': 0.9666666666666667,
'uniform6': 0.9666666666666667,
'distance6': 0.96,
'uniform7': 0.9733333333333333,
'distance7': 0.9733333333333333,
'uniform8': 0.9666666666666667,
'distance8': 0.9666666666666667,
'uniform9': 0.9733333333333333,
'distance9': 0.9733333333333333,
'uniform10': 0.96,
'distance10': 0.96,
'uniform11': 0.9733333333333333,
'distance11': 0.9733333333333333,
'uniform12': 0.98,
'distance12': 0.9733333333333333,
'uniform13': 0.9733333333333333,
'distance13': 0.9733333333333333}
# Position of the highest mean accuracy. Dicts keep insertion order, so the
# same position indexes the matching key.
best_index = np.array(list(result.values())).argmax()
best_index
# Use the computed position instead of a hard-coded 22, so the lookup stays
# correct when the scores change.
list(result)[best_index]
22
'uniform12'
KNN癌症诊断
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Read the tab-separated cancer table and discard its "ID" column.
cancer = pd.read_csv("/Users/zhucan/Desktop/cancer.csv", sep="\t")
cancer.drop(columns="ID", inplace=True)

y = cancer["Diagnosis"]
# Features are every column after the first — presumably "Diagnosis" is
# column 0 once "ID" is gone; confirm against the file.
X = cancer.iloc[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Exhaustive search over k, the neighbour weighting and the Minkowski power p,
# scored by accuracy with 6-fold cross-validation on the training split.
params = {
    "n_neighbors": list(range(1, 30)),
    "weights": ["distance", "uniform"],
    "p": [1, 2],
}
knn = KNeighborsClassifier()
gcv = GridSearchCV(knn, params, scoring="accuracy", cv=6)
gcv.fit(X_train, y_train)
gcv.best_estimator_
gcv.best_score_
gcv.best_params_

y_ = gcv.predict(X_test)
gcv.score(X_test, y_test)  # original note: at this point gcv acts as gcv.best_estimator_
# Show the confusion matrix: true labels as rows, predicted labels as columns.
pd.crosstab(index=y_test, columns=y_, rownames=["True"], colnames=["Predict"])
KNeighborsClassifier(n_neighbors=4, p=1, weights='distance')
0.9516666666666667
{'n_neighbors': 4, 'p': 1, 'weights': 'distance'}
0.9385964912280702
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# confusion_matrix takes (y_true, y_pred). The original call had them swapped,
# which transposes the matrix (rows became predictions instead of true labels).
confusion_matrix(y_test, y_)
# Per-class precision / recall / f1, with the two classes labelled "B" and "M".
print(classification_report(y_test, y_, target_names=["B", "M"]))
B(良性):precision = 78/(78+5) ≈ 0.94,recall = 78/(78+2) ≈ 0.97
M(恶性):precision = 29/(29+2) ≈ 0.94,recall = 29/(29+5) ≈ 0.85
结论:模型找出健康样本(B)的能力比找出患病样本(M)的能力更强(recall 0.97 对 0.85)
KNN数据归一化操作
#归一化操作
# Min-max normalisation: (value - min) / (max - min).
col_min = X.min()
X_norm1 = (X - col_min) / (X.max() - col_min)
# NOTE(review): min and max are taken over all rows before the split below, so
# the held-out rows influence the scaling. Consider deriving the scaling from
# the training split only.
X_train, X_test, y_train, y_test = train_test_split(X_norm1, y, test_size=0.2)

# Same parameter grid as for the unscaled data, searched with 6-fold CV.
params = {
    "n_neighbors": list(range(1, 30)),
    "weights": ["distance", "uniform"],
    "p": [1, 2],
}
knn = KNeighborsClassifier()
gcv = GridSearchCV(knn, params, scoring="accuracy", cv=6)
gcv.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
y_ = gcv.predict(X_test)
accuracy_score(y_test, y_)
0.9649122807017544
# Alternative approaches
# Standardisation: subtract the mean and divide by the standard deviation.
# ddof=0 (population std) is requested explicitly because pandas defaults to
# ddof=1, which would not match what StandardScaler computes below.
X_norm2 = (X - X.mean()) / X.std(ddof=0)
from sklearn.preprocessing import MinMaxScaler, StandardScaler
mms = MinMaxScaler()
mms.fit(X)
X2 = mms.transform(X)  # same values as the manual (X - min) / (max - min) normalisation
ss = StandardScaler()
X3 = ss.fit_transform(X)
X3  # same values as the manual standardisation in X_norm2
sklearn中数据拆分
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.model_selection import KFold,StratifiedKFold #KFold、StratifiedKFold将数据分成多少份
data = np.random.randint(0, 10, size=(8, 2))
target = np.array([0, 0, 1, 0, 1, 1, 1, 0])
# train and test are index arrays; the indices are all that is needed to pull out the rows.
# The splitter gets its own name so the imported KFold class is not overwritten
# and can still be instantiated later.
kfold = KFold(n_splits=4)
for train, test in kfold.split(data, target):
    print(target[train], target[test])
[1 0 1 1 1 0] [0 0]
[0 0 1 1 1 0] [1 0]
[0 0 1 0 1 0] [1 1]
[0 0 1 0 1 1] [1 0]
# Split into 4 folds; each fold keeps the same class proportions as the full target.
sKFold = StratifiedKFold(n_splits=4)
for train_idx, test_idx in sKFold.split(data, target):
    train_labels = target[train_idx]
    test_labels = target[test_idx]
    print(train_labels, test_labels)
[0 0 1 1 1 0] [0 1]
[0 1 0 1 1 0] [0 1]
[0 0 1 1 1 0] [0 1]
[0 0 1 0 1 1] [1 0]
#train_test_split,KFold,StratifiedKFold作用都是将数据拆分
str类型数据的转变与训练预测
# Load the salary table and drop the columns that are not used as features.
data = pd.read_csv("/Users/zhucan/Desktop/salary.txt")
data.drop(labels=["final_weight", "education", "capital_gain", "capital_loss"], axis=1, inplace=True)
# Features are every column except the last. Take a copy so the column
# assignments done later act on an independent frame, not a slice of data.
X = data.iloc[:, 0:-1].copy()
# Select the label column by name: iloc only accepts integer positions, so
# data.iloc["salary"] raises instead of returning the column.
y = data["salary"]
# Convert the str values in the data to int/float so the algorithm can compute with them.
# Available approaches: map, apply, transform.
# u holds the distinct workclass values; it is read by the lookups that follow.
u = X["workclass"].unique()
u
array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
dtype=object)
# Position of 'Local-gov' within u: argwhere returns the matching indices and [0,0] picks the first one.
np.argwhere(u=='Local-gov')[0,0]
4
def convert(x):
    """Map a value to a number: its position in the module-level array ``u``.

    Raises IndexError when ``x`` does not occur in ``u``, because the match
    array is then empty.
    """
    return np.argwhere(u == x)[0, 0]


X["workclass"] = X["workclass"].map(convert)

cols = ['marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for col in cols:
    u = X[col].unique()
    # Build the value -> position table once per column. The original
    # redefined convert() on every pass and scanned u again for every row.
    # The resulting codes are the same positions within u.
    codes = {value: position for position, value in enumerate(u)}
    X[col] = X[col].map(codes)
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.model_selection import KFold,StratifiedKFold
# Load the salary table, preview it, and drop the columns that are not used as features.
data = pd.read_csv("/Users/zhucan/Desktop/salary.txt")
data.head()
data.drop(labels=["final_weight", "education", "capital_gain", "capital_loss"], axis=1, inplace=True)
# Features are every column except the last. Take a copy so the column
# assignments done later act on an independent frame, not a slice of data
# (avoids pandas' SettingWithCopyWarning).
X = data.iloc[:, 0:-1].copy()
y = data["salary"]
u = X["workclass"].unique()


def convert(x):
    """Return the position of ``x`` in the module-level array ``u``.

    Raises IndexError when ``x`` does not occur in ``u``, because the match
    array is then empty.
    """
    return np.argwhere(u == x)[0, 0]


X["workclass"] = X["workclass"].map(convert)

cols = ['marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
for col in cols:
    u = X[col].unique()
    # Build the value -> position table once per column. The original
    # redefined convert() on every pass and scanned u again for every row.
    # The resulting codes are the same positions within u.
    codes = {value: position for position, value in enumerate(u)}
    X[col] = X[col].map(codes)
# Average accuracy of a default KNN over a manual K-fold loop.
n_splits = 10  # single source for the fold count and the averaging divisor
kFold = KFold(n_splits=n_splits)
knn = KNeighborsClassifier()
accuracy = 0
for train, test in kFold.split(X, y):
    # split() yields integer positions, so select rows with iloc. Label-based
    # .loc / y[...] only gives the same rows when the index is the default 0..n-1.
    knn.fit(X.iloc[train], y.iloc[train])
    acc = knn.score(X.iloc[test], y.iloc[test])
    accuracy += acc / n_splits
print(accuracy)
0.7973345728987424