# - 用PCA对小麦数据的特征进行降维,降到二维
# - 将用PCA降维后的数据送入ANN(人工神经网络)进行训练
# - 用训练好的ANN进行分类
#
# 将txt文件转成pandas表格
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import numpy as np
# Load the whitespace-delimited seeds table into a 2-D float array.
# numpy.loadtxt opens and closes the file itself and splits each line on any
# run of whitespace, so a doubled tab no longer produces an empty field that
# cannot be converted to float.  ndmin=2 keeps the result two-dimensional
# even if the file contains a single row.
ls = numpy.loadtxt('seeds_dataset.txt', dtype=float, ndmin=2)
print(ls)
# Wrap the loaded array in a pandas DataFrame
ls_pd = pd.DataFrame(ls)
# 把数据打乱
from sklearn.utils import shuffle

# Randomly permute the rows; the later train/test split is taken without
# shuffling, so this is the only place the sample order is randomized.
ls_pd = shuffle(ls_pd)

# Column 7 is used as the class label from here on.
y = ls_pd[7]
# 对数据进行预处理
from sklearn.preprocessing import StandardScaler

# Standardize every feature column to zero mean and unit variance.
# Column 7 is the label, so the features are columns 0..6.  `.loc` slices are
# label-based and inclusive at both ends, hence 0:6 (the previous 1:6
# silently left out column 0).
sc = StandardScaler()
data_std = sc.fit_transform(ls_pd.loc[:, 0:6])
# PCA特征降维
def plot_PCA(*data):
    """Project samples onto two principal components and scatter-plot them.

    Args:
        *data: exactly two positional values ``(X, y)`` -- the feature
            matrix and the per-sample labels used to color the points.

    Returns:
        The samples transformed by a 2-component PCA fitted on ``X``.
    """
    X, y = data
    # Work on a plain array so the boolean masks below select the
    # transformed rows by position.
    labels = np.asarray(y)
    # One color per distinct label.  zip() below stops at the shorter
    # sequence, so at most ten distinct labels are drawn.
    colors = (
        (1, 0, 0), (0, 1, 0), (0, 0, 1), (0.5, 0.5, 0), (0, 0.5, 0.5),
        (0.5, 0, 0.5), (0.4, 0.6, 0), (0.6, 0.4, 0), (0, 0.6, 0.4),
        (0.5, 0.3, 0.2),
    )

    plt.figure()
    # Fit the PCA on X and map X into the 2-D component space.
    X_r = PCA(n_components=2).fit_transform(X)
    for label, color in zip(np.unique(labels), colors):
        position = labels == label
        plt.scatter(X_r[position, 0], X_r[position, 1],
                    label="target= %d" % label, color=color)
    # The scatter calls set labels; draw the legend so they are visible.
    plt.legend()
    plt.suptitle("PCA")
    plt.show()
    return X_r
# Reduce the standardized features to two PCA components (this call also shows the scatter plot)
reduced_x = plot_PCA(data_std,y)
# 进行one-hot编码
# One-hot encode the labels into rows of length 3: the label value minus one
# is the position that is set to 1.
y_onehot = []
for raw_label in y:
    encoded = [0] * 3
    encoded[int(raw_label) - 1] = 1
    y_onehot.append(encoded)
print(y_onehot)
# 对数据集进行分割
from sklearn.model_selection import train_test_split

# The one-hot labels are kept as a NumPy array from here on.
y_onehot = np.array(y_onehot)

# Split features, raw labels and one-hot labels in a single call so all three
# are cut at the same row.  shuffle=False keeps the current row order: the
# first 80% of the rows become the training set and the rest the test set.
(x_train, x_test,
 y_train, y_test,
 y_train_onehot, y_test_onehot) = train_test_split(
    reduced_x, y, y_onehot, test_size=0.2, shuffle=False)
# ANN分类
from tensorflow import keras
# 建立模型
# Fully connected classifier: four ReLU hidden layers followed by a softmax
# output layer.
network_layers = [
    # First hidden layer; it also declares the input width (2 features).
    keras.layers.Dense(500, activation='relu', input_shape=[2]),
]
for hidden_units in (500, 250, 250):
    network_layers.append(keras.layers.Dense(hidden_units, activation='relu'))
# Three output nodes, one per class.
network_layers.append(keras.layers.Dense(3, activation='softmax'))

model = keras.Sequential(network_layers)
# 编译模型
# Compile the model: categorical cross-entropy loss, Adam optimizer, accuracy reported as the metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# 拟合模型
model.fit(x_train,y_train_onehot,batch_size = 32,epochs=20)# train for 20 epochs with a batch size of 32
# 用模型进行预测
# Predict on the test rows: each output row holds one score per class.
class_scores = model.predict(x_test)
# The column with the highest score is the predicted class; add 1 to turn the
# 0-based column position back into the 1-based label values.
y_pre = class_scores.argmax(axis=1) + 1
# 模型评估
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score
# NOTE(review): both metrics are computed but never printed or used in the visible code.
acu = accuracy_score(y_test, y_pre) # accuracy on the test split
recall = recall_score(y_test, y_pre, average="macro") # macro-averaged recall on the test split
# 画出分类结果
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features, plus the samples.

    Args:
        X: 2-D array of samples; columns 0 and 1 are the plotted axes.
        y: per-sample class labels, aligned with the rows of ``X``.
        classifier: object whose ``predict`` returns one row of per-class
            scores for each input row; the arg-max column plus one is taken
            as the predicted label.
        test_idx: optional row position; rows from this position onward are
            circled as the test set.  ``None`` disables the highlight.
        resolution: step of the grid on which the classifier is evaluated.
    """
    # Use a plain array so the boolean masks below index X by position.
    y = np.asarray(y)
    classes = np.unique(y)

    # Marker and color per class (up to five classes).
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(classes)])

    # Grid covering the data range with a margin of 1 on every side.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # Flatten the grid into one (x1, x2) row per grid point, classify every
    # point, then fold the labels back into the grid shape for contourf.
    grid_points = np.array([xx1.ravel(), xx2.ravel()]).T
    Z = classifier.predict(grid_points).argmax(axis=1) + 1
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Plot the samples of each class with their own marker and color.
    print(classes)
    for idx, cl in enumerate(classes):
        # `color=` rather than `c=`: a single RGBA tuple passed as `c` can be
        # mistaken for a sequence of four scalar values.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=cmap(idx),
                    marker=markers[idx], label=cl)

    # Highlight the test samples with hollow circles.  Compare against None
    # so that test_idx=0 (every row is a test row) is honored.
    if test_idx is not None:
        X_test = X[test_idx:, :]
        # facecolors='none' draws unfilled markers; an empty color string is
        # not a valid color.
        plt.scatter(X_test[:, 0], X_test[:, 1],
                    facecolors='none', edgecolors='0',
                    alpha=1.0, linewidths=1, marker='o',
                    s=55, label='test set')
# Plot the decision regions over all samples and circle the test rows.
# The split was taken without shuffling, so the test rows are exactly those
# after the training rows; derive the boundary from the split itself instead
# of hard-coding a row number.
plot_decision_regions(reduced_x, y,
                      classifier=model, test_idx=len(x_train))
plt.legend(loc='upper left')
plt.tight_layout()  # compact layout so labels and axes do not overlap
plt.show()
# 处理后的结果如下图: