1、数据获取
数据下载:
https://archive.ics.uci.edu/ml/index.php
https://sonj.me/projects/2018/09/05/poisonous-mushroom-classification.html
2、数据清理
a) 查看数据行列情况,判断是否有空行,进行删除;
b) 查看空值情况,按照自己分析的需求及考虑进行删除或填充操作;
import pandas as pd
import pandas as pd
import numpy as np
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# Load the mushroom table from CSV; `mushroom` is the DataFrame the rest of
# the script works on.
csv_path = 'dataset.csv'
mushroom = pd.DataFrame(pd.read_csv(csv_path, sep=',', decimal='.'))
print(type(mushroom))
print(mushroom)

# Inspect the table: overall shape, number of rows that contain at least one
# null cell, and the null count of every column.
# NOTE(review): isnull() only sees cells pandas parsed as NaN; if the file
# marks missing entries with a placeholder such as '?', they are not counted
# here - confirm against the data.
null_mask = mushroom.isnull()
print("数列行列情况:", mushroom.shape)
print("存在空行数目:", null_mask.any(axis=1).sum())
print("空值情况:\n", null_mask.sum())

# Preview every column after the first one (these are used as features below).
print(mushroom.iloc[:, 1:])
将数据数值化
# Build the feature matrix X (every column after the first) and the target y
# (the "class" column), then turn the categorical letters into integers.
# The explicit copy keeps the encoding from writing back into `mushroom` and
# avoids pandas' chained-assignment warning.
X = mushroom.iloc[:, 1:].copy()
y = mushroom["class"]

# Encode each feature column by order of first appearance: the first distinct
# value seen becomes 0, the next 1, and so on. Iterating over X.columns
# removes the hard-coded column count of 22.
# pd.factorize codes missing values as -1.
for column in X.columns:
    codes, _ = pd.factorize(X[column])
    X[column] = codes
# print(X.head())

# Target encoding: 'e' -> 0, 'p' -> 1. Any other label becomes NaN.
y = y.map({'e': 0, 'p': 1})
3、使用决策树,KNN以及朴素贝叶斯方法进行预测,并给出至少两种方法的评价结果
4、对不同的分类模型结果运用多种度量指标进行评估,分析说明预测错误的风险。
混淆矩阵评价函数
# 定义混淆矩阵绘制函数
import itertools
# Plot true labels against predicted labels as a heat map.
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a heat map and print the recall of class 1.

    Args:
        cm: Square confusion matrix (rows = true label, columns = predicted
            label). Anything ``np.asarray`` accepts; it must have at least
            two rows because the recall is read from row 1.
        classes: Tick labels, one per class, in matrix order.
        title: Title drawn above the plot.
        cmap: Matplotlib colormap used for the cells.

    Side effects:
        Prints the recall of class 1 (``cm[1, 1] / cm[1].sum()``) and shows
        the figure with ``plt.show()``.
    """
    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells whose count is above half of the maximum get white text, the
    # remaining cells get black text.
    threshold = cm.max() / 2.
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(col, row, cm[row, col],
                 horizontalalignment='center',
                 color="white" if cm[row, col] > threshold else "black")

    plt.tight_layout()
    plt.ylabel("True label")
    plt.xlabel('Predicted label')

    # Recall of class 1 = true positives / all samples whose true label is 1.
    # Report NaN instead of dividing by zero when class 1 has no true samples.
    positives = cm[1].sum()
    recall = cm[1, 1] / positives if positives else float("nan")
    print("召回率:", recall)
    plt.show()
决策树:
# Decision tree: hold out 20% of the rows as a test set (fixed seed so the
# split is reproducible).
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Build the tree using entropy (information gain) as the split criterion.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(x_train, y_train)

# Accuracy on the held-out test set.
score = clf.score(x_test, y_test)
print("决策树的准确率为:", score)

# Render the fitted tree to a PNG file.
feature_name = [
    'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
    'stalk-surface-below-ring', 'stalk-color-above-ring',
    'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
    'ring-type', 'spore-print-color', 'population', 'habitat',
]
import graphviz  # kept: the original imported it here; other code may rely on it
# class_names must follow ascending class order. The target is encoded
# e -> 0 and p -> 1, so class 0 is the non-poisonous one ('无毒') and class 1
# the poisonous one ('有毒'). The original list had the two names swapped,
# which mislabelled every leaf of the exported tree.
dot_data = tree.export_graphviz(
    clf,
    feature_names=feature_name,
    class_names=['无毒', '有毒'],
    filled=True,
    rounded=True,
)
# The original also built graphviz.Source(dot_data) into `graph`, but the
# value was overwritten on the very next line, so only the pydotplus graph
# is kept.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.get_nodes()[7].set_fillcolor("#FFF2DD")
graph.write_png('mushroot.png')

# Confusion matrix on the test set; the helper also prints the recall.
y_ = clf.predict(x_test)
cm = confusion_matrix(y_test, y_)
plot_confusion_matrix(cm, classes=[0, 1])
决策树效果:(未解决中文乱码问题)
KNN:
# KNN: the classifier was used without ever being imported, which raised
# NameError; import it here, next to its only use.
from sklearn.neighbors import KNeighborsClassifier

# Same 80/20 split and seed as the other models, so results are comparable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# 3-nearest-neighbours classifier.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x_train, y_train)

# Accuracy on the held-out test set.
score = neigh.score(x_test, y_test)
print("KNN的准确率为:", score)

# Confusion matrix on the test set; the helper also prints the recall.
y_ = neigh.predict(x_test)
cm = confusion_matrix(y_test, y_)
plot_confusion_matrix(cm, classes=[0, 1])
朴素贝叶斯:
# Naive Bayes: the classifier was used without ever being imported, which
# raised NameError; import it here, next to its only use.
from sklearn.naive_bayes import GaussianNB

# Same 80/20 split and seed as the other models, so results are comparable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# NOTE(review): the features are integer codes of categorical values, while
# GaussianNB models each feature as a normal distribution; CategoricalNB may
# be a better fit - confirm before changing, since it would alter the
# reported results.
GaN = GaussianNB()
GaN.fit(x_train, y_train)

# Accuracy on the held-out test set.
score = GaN.score(x_test, y_test)
print("朴素贝叶斯分类准确率:", score)

# Confusion matrix on the test set; the helper also prints the recall.
y_ = GaN.predict(x_test)
cm = confusion_matrix(y_test, y_)
plot_confusion_matrix(cm, classes=[0, 1])
实验结果分析:
由实验结果可知,在测试集上,决策树算法和KNN算法的准确率及召回率均为1.0,即测试样本全部预测正确,结果具有很好的说服力。朴素贝叶斯算法的准确率为93.3%,召回率为88.3%;从这两项指标来看,朴素贝叶斯分类算法可以用于预测,但与决策树及KNN算法相比,其预测能力仍然不足。
由于分析对象是蘑菇是否有毒,预测错误的风险并不对称:把有毒蘑菇误判为无毒(漏报)可能导致中毒,其代价远高于把无毒蘑菇误判为有毒,因此对于食用物来说应优先提高有毒类别的召回率(即查全率)。三种模型中,决策树和KNN的准确率和召回率都达到了最大值,可以很有说服力地作为预测模型;相比之下,朴素贝叶斯的召回率只有88.3%,即约11.7%的有毒蘑菇会被漏判。既然已有更好的预测模型,可以不采用朴素贝叶斯方法。