Section I: Load packages
#Section 1: Load package
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
# Use 200 dpi both for on-screen figures and for figures written to disk.
plt.rcParams.update({'figure.dpi': 200, 'savefig.dpi': 200})

# Default font applied to all figure text.
font = dict(family='Times New Roman', weight='light')
plt.rc("font", **font)
Section II: Load the data and split it into training and test sets
# Section 2: Load the data and split it into train/test sets
iris = datasets.load_iris()

# Only feature columns 2 and 3 are used (the plot section labels them
# petal length and petal width), so the problem stays two-dimensional.
petal_columns = [2, 3]
X = iris.data[:, petal_columns]
y = iris.target

# Hold out 30% of the samples for testing; the labels are passed as
# `stratify` and the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)
print('Label counts in y:', np.bincount(y))
Section III: Train perceptron model
# Section 3: Train perceptron model
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

# NOTE(review): n_iter_no_change is the early-stopping patience, not the cap
# on the number of epochs (that is max_iter) -- confirm which one was intended.
ppn = Perceptron(n_iter_no_change=40, eta0=0.1, random_state=1)
ppn.fit(X_train_std, y_train)

# Evaluate on the held-out split.
y_pred = ppn.predict(X_test_std)
misclassified = (y_test != y_pred).sum()
print('Misclassified samples: %d' % misclassified)

# Accuracy is printed twice: once via accuracy_score and once via the
# classifier's own score() method.
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
print('Accuracy: %.2f' % ppn.score(X_test_std, y_test))
Section IV: Visualize decision boundary
此小节包含两个部分,分别为“plot_decision_regions”函数的定义和调用及可视化参数调整。
第一部分:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features and overlay the samples.

    Args:
        X: 2-D array of samples; column 0 is plotted on the x-axis and
            column 1 on the y-axis.
        y: 1-D array of class labels aligned with the rows of ``X``.
        classifier: Fitted object exposing ``predict`` for an ``(n, 2)`` array.
        test_idx: Optional row indices of ``X`` (range, list or array) to
            highlight as test samples. ``None`` or an empty sequence
            draws no highlight.
        resolution: Step of the grid on which the classifier is evaluated.

    Returns:
        None. Everything is drawn on the current matplotlib axes.

    Note:
        Only five marker/colour pairs are defined, so at most five distinct
        classes can be drawn.
    """
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Evaluate the classifier on a dense grid that extends one unit beyond
    # the data on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                         np.arange(y_min, y_max, resolution))
    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=cmap)

    # NOTE(review): this draws every sample once with the default colormap
    # before the per-class markers are drawn on top -- confirm the underlay
    # is wanted.
    plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.8)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    # One scatter call per class so each class gets its own marker, colour
    # and legend entry.
    for idx, cl in enumerate(classes):
        mask = y == cl
        plt.scatter(x=X[mask, 0],
                    y=X[mask, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=cl,
                    edgecolors='black')

    # Highlight test samples.
    # An explicit None/empty check is used because `if test_idx:` raises on a
    # multi-element NumPy index array.
    if test_idx is not None:
        test_rows = np.asarray(test_idx)
        if test_rows.size:
            X_test = X[test_rows, :]
            # 'none' yields hollow circles; an empty string is not a valid
            # colour specification.
            plt.scatter(X_test[:, 0], X_test[:, 1], c='none',
                        edgecolor='black', alpha=1.0, linewidth=1,
                        marker='o', s=100, label='test set')
第二部分:
#Section 4: Visualize decision boundary
from Perceptron_Sklearn.visualize import plot_decision_regions
# Stack the standardized splits so a single plot shows every sample:
# training rows come first, test rows follow.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

# The test rows are the tail of the stacked array. Their index range is
# derived from the split sizes rather than hard-coded, so it stays correct
# if test_size or the dataset changes.
plot_decision_regions(
    X=X_combined_std,
    y=y_combined,
    classifier=ppn,
    test_idx=range(len(X_train_std), len(X_combined_std)),
)
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.show()
小结:
基于此,以花瓣(petal)的长度和宽度两个特征构造划分空间。使用 scikit-learn(sklearn)包中的感知机 Perceptron 进行训练,并借助 plot_decision_regions 函数进行可视化后,可以看到决策边界将特征空间划分为3个区域。不同于自编的 Perceptron,sklearn 的感知机支持多分类,这主要得益于 OvR(One-vs-Rest,一对其余)策略:每次将其中一个类别视为正类,其余所有类别统一视为负类,分别训练一个二分类器。
参考文献
Sebastian Raschka, Vahid Mirjalili. Python机器学习第二版. 南京:东南大学出版社,2018.