1.题干
植物物种的分类:据统计,目前仅被植物学家记录的植物物种就有 25 万种之多。植物物种的正确分类对保护和研究植物多样性具有重要意义。这里以植物叶片数据集为研究对象,希望基于叶片特征,通过机器学习自动实现植物物种的分类。数据集(文件名:叶子形状.csv)是 990 张植物叶片灰度图像(见图)的转换数据。描述植物叶片的边缘(margin)、形状(shape)、纹理(texture)这三类特征的数值型变量各有 64 个(共 192 个输入变量)。此外,还有 1 个分类型变量记录了每张叶片所属的植物物种(species),总共有 193 个变量。请首先建立单棵决策树(分类树)的分类模型,然后采用各种集成算法进行分类预测,并基于单棵决策树和集成学习的误差对比,选出最优预测模型。
2.数据格式
3.代码
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
# Use the SimHei font for sans-serif text so the Chinese labels used in the
# plots can be rendered (the font must be installed on the machine).
plt.rcParams['font.sans-serif'] = ['SimHei']
def printf(n, strf):
    """Print a section banner: a rule of ``n`` dashes, then ``strf`` in bold.

    The banner is surrounded by one blank line above and one below.

    Args:
        n: Number of ``-`` characters in the separator line.
        strf: Heading text; it is wrapped in ANSI bold escape codes.
    """
    print()
    print('-' * n)
    # \033[1m switches bold on, \033[0m resets the terminal attributes.
    print(f"\033[1m{strf}\033[0m")
    print()
# Load the leaf dataset and show the first rows as a quick sanity check.
data = pd.read_csv('叶子形状.csv')
printf(100, '查看数据大致情况')
print(data.head())

# Every column except the label is used as an input feature.
# NOTE(review): if the CSV also carries an identifier column, it should be
# dropped here as well -- confirm against the actual file.
X = data.drop(columns=['species'])
y = data['species']

# Hold out 30% of the rows for testing. The seed is fixed so that the split,
# and therefore the model comparison built on it, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Baseline: a single decision-tree classifier.
# random_state fixes the tree's internal randomness so the reported accuracy
# is repeatable from run to run.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Score the fitted tree on both the training and the held-out data.
y_train_pred_dt = dt_model.predict(X_train)
y_test_pred_dt = dt_model.predict(X_test)
train_accuracy_dt = accuracy_score(y_train, y_train_pred_dt)
test_accuracy_dt = accuracy_score(y_test, y_test_pred_dt)
print(f"单棵决策树 - 训练准确率: {train_accuracy_dt}")
print(f"单棵决策树 - 测试准确率: {test_accuracy_dt}")
# Random forest classifier with 100 trees.
# random_state fixes the bootstrap sampling and feature selection so the
# reported accuracy is repeatable from run to run.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the fitted forest on both the training and the held-out data.
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
print(f"随机森林 - 训练准确率: {train_accuracy_rf}")
print(f"随机森林 - 测试准确率: {test_accuracy_rf}")
# Gradient boosting classifier with 100 boosting stages.
# random_state fixes the estimator's internal randomness so the reported
# accuracy is repeatable from run to run.
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Score the fitted model on both the training and the held-out data.
y_train_pred_gb = gb_model.predict(X_train)
y_test_pred_gb = gb_model.predict(X_test)
train_accuracy_gb = accuracy_score(y_train, y_train_pred_gb)
test_accuracy_gb = accuracy_score(y_test, y_test_pred_gb)
print(f"梯度提升 - 训练准确率: {train_accuracy_gb}")
print(f"梯度提升 - 测试准确率: {test_accuracy_gb}")
# AdaBoost classifier with 100 boosting rounds and the library's default
# base learner.
# random_state fixes the estimator's internal randomness so the reported
# accuracy is repeatable from run to run.
ab_model = AdaBoostClassifier(n_estimators=100, random_state=42)
ab_model.fit(X_train, y_train)

# Score the fitted model on both the training and the held-out data.
y_train_pred_ab = ab_model.predict(X_train)
y_test_pred_ab = ab_model.predict(X_test)
train_accuracy_ab = accuracy_score(y_train, y_train_pred_ab)
test_accuracy_ab = accuracy_score(y_test, y_test_pred_ab)
print(f"AdaBoost - 训练准确率: {train_accuracy_ab}")
print(f"AdaBoost - 测试准确率: {test_accuracy_ab}")
# Collect the per-model scores in one fixed order for plotting and ranking.
models = ['Decision Tree', 'Random Forest', 'Gradient Boosting', 'AdaBoost']
train_accuracies = [train_accuracy_dt, train_accuracy_rf, train_accuracy_gb, train_accuracy_ab]
test_accuracies = [test_accuracy_dt, test_accuracy_rf, test_accuracy_gb, test_accuracy_ab]

# Draw one marked line per data split, with the model names on the x-axis.
plt.figure(figsize=(12, 6))
for series, series_label in (
    (train_accuracies, '训练准确率'),
    (test_accuracies, '测试准确率'),
):
    plt.plot(models, series, label=series_label, marker='o')
plt.ylabel('准确率')
plt.title('不同模型的准确率对比')
plt.legend()
plt.show()

# The best model is the one with the highest test accuracy; on a tie the
# earliest entry in `models` wins.
best_model_index = max(range(len(test_accuracies)), key=test_accuracies.__getitem__)
best_model_name = models[best_model_index]
best_model_accuracy = test_accuracies[best_model_index]
print(f"最优预测模型: {best_model_name},测试准确率: {best_model_accuracy}")
值得注意的是,AdaBoost 分类器在训练集和测试集上的准确率都显著偏低。这是因为其默认的基学习器是深度为 1 的决策树桩,对于类别数很多的分类问题来说过于简单。故下面通过网格搜索与交叉验证的方法对其进行调参:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Reload the data and build a seeded 70/30 split for the tuning experiment.
data = pd.read_csv('叶子形状.csv')
X = data.drop(columns=['species'])
y = data['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the booster as well as the split; otherwise repeated searches can
# return different scores and a different "best" parameter set.
ab_model = AdaBoostClassifier(random_state=42)

# Newer scikit-learn releases call the weak-learner parameter `estimator`,
# older ones `base_estimator`; use whichever name the installed version has
# so the grid is accepted by both.
base_estimator_key = 'estimator' if 'estimator' in ab_model.get_params() else 'base_estimator'

# Search space: weak-learner depth, number of boosting rounds, learning rate
# and boosting algorithm.
param_grid = {
    base_estimator_key: [DecisionTreeClassifier(max_depth=4), DecisionTreeClassifier(max_depth=3)],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1],
    # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases;
    # remove it from this list if the installed version rejects it.
    'algorithm': ['SAMME', 'SAMME.R'],
}

# Exhaustive search over the grid with 4-fold cross-validation.
grid_search = GridSearchCV(ab_model, param_grid, cv=4, verbose=1)
def fit_model_and_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    The model is fitted in place, so any previous fit of ``model`` is
    replaced.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        A ``(train_accuracy, test_accuracy)`` tuple, each value computed
        with ``accuracy_score``.
    """
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    return train_accuracy, test_accuracy
# Run the cross-validated search over the whole parameter grid.
grid_search.fit(X_train, y_train)

# Take the winning configuration and score it on the training and held-out
# splits (the helper fits the model on the training split before scoring).
best_model = grid_search.best_estimator_
train_accuracy, test_accuracy = fit_model_and_evaluate(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Report the selected hyper-parameters and the resulting accuracies.
print("最佳超参数组合: ", grid_search.best_params_)
print(f"AdaBoost - 训练准确率: {train_accuracy}")
print(f"AdaBoost - 测试准确率: {test_accuracy}")
结果为:
由于划分数据集时指定了随机数种子(random_state=42),训练集与测试集的划分可以复现;若要使调参结果也完全可复现,还需为 AdaBoostClassifier 同样指定 random_state。(注:此处仅提供调参方法,不代表所得参数为全局最优。)