文章目录
💨💨💨💨
个人简介:💦💦💦简介:大三在读,分享一些学习笔记和心得体会
💞💞💞兴趣领域:Python,人工智能,算法和数据结构
👁🗨👁🗨👁🗨格言:逆水行舟,不进则退!
✅✅✅目的:一起进步!
哈喽!大家好呀😁😁我是【南蓬幽】,也可以叫我小幽,小友,小呦等等
如果有不对的地方,欢迎大家指正另外如果觉得不错的话请三连支持一下博主呦💖💖💖
🎉🎉🎉🎉🎉🎊🎊欢迎大佬们呀🎊🎊🎉🎉🎉🎉🎉🎉
🥇🥇🥇1.使用sklearn的随机森林算法对样本数据进行分类,要求:
🥈🥈🥈(1) 导入乳腺癌数据集
# First, import the built-in datasets module
from sklearn.datasets import load_breast_cancer
# Then load the breast cancer dataset (a Bunch with .data / .target)
cancer = load_breast_cancer()
🏆🏆🏆(2) 对比随机森林算法和决策树的分类效果;
# Define a decision tree classifier object as the comparison baseline
dt = DecisionTreeClassifier(random_state=0)
# Define a random forest classifier object
rf = RandomForestClassifier(random_state=0)
dt.fit(x_train,y_train)
rf.fit(x_train,y_train)
score_dt = dt.score(x_test,y_test)
score_rf = rf.score(x_test,y_test)
使用cross_val_score进行交叉验证,其中:
cv为份数,即将数据集划分为n份,依次取每一份做测试集,其他n-1份做训练集,
# Repeat 10-fold cross-validation and collect the mean accuracy of each
# run.  cross_val_score returns one accuracy per fold; we keep the mean.
# (Lists are initialized here so the snippet runs on its own; the
# original excerpt relied on them being defined earlier.)
rf_scores = []
dt_scores = []
for i in range(10):
    rf_score = cross_val_score(RandomForestClassifier(n_estimators=25), cancer.data,
                               cancer.target, cv=10).mean()
    rf_scores.append(rf_score)
    dt_score = cross_val_score(DecisionTreeClassifier(), cancer.data, cancer.target, cv=10).mean()
    dt_scores.append(dt_score)
🏀🏀🏀(3) 测试弱分类器个数n_estimators对分类精度的影响。
# Measure how the number of weak learners (n_estimators) affects the
# mean 10-fold cross-validation accuracy.
rf_scores = []
for i in range(1, 50):
    rf = RandomForestClassifier(n_estimators=i)
    rf_score = cross_val_score(rf, cancer.data, cancer.target, cv=10).mean()
    rf_scores.append(rf_score)
完整代码:
# Import the built-in datasets module
from sklearn.datasets import load_breast_cancer
# Import the decision tree and random forest classifier classes
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Import the model-validation helpers
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt

# Load the breast cancer dataset
cancer = load_breast_cancer()
# Split into train/test sets; test_size is the fraction held out for testing
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3)

# Decision tree classifier used as the comparison baseline
dt = DecisionTreeClassifier(random_state=0)
# Random forest classifier
rf = RandomForestClassifier(random_state=0)
dt.fit(x_train, y_train)
rf.fit(x_train, y_train)
score_dt = dt.score(x_test, y_test)
score_rf = rf.score(x_test, y_test)
# Print the hold-out accuracies
print('Single Tree : ', score_dt)
print('Random Forest : ', score_rf)

dt_scores = []
rf_scores = []
# Cross-validate with cross_val_score, where cv is the number of folds:
# the data is split into cv parts, each part serving once as the test set
# while the remaining cv-1 parts train the model.  cross_val_score returns
# the list of per-fold accuracies; we keep the mean of each repetition
# (each repetition differs because the estimators are not seeded).
for i in range(10):
    rf_score = cross_val_score(RandomForestClassifier(n_estimators=25), cancer.data,
                               cancer.target, cv=10).mean()
    rf_scores.append(rf_score)
    dt_score = cross_val_score(DecisionTreeClassifier(), cancer.data, cancer.target, cv=10).mean()
    dt_scores.append(dt_score)

# Plot the score comparison curves
plt.figure()
plt.title('Random Forest VS Decision Tree')
plt.xlabel('Index')
plt.ylabel('Accuracy')
plt.plot(range(10), rf_scores, label='Random Forest')
plt.plot(range(10), dt_scores, label='Decision Tree')
plt.legend()
plt.show()

# Observe how the number of weak learners affects classification accuracy
rf_scores = []
for i in range(1, 50):
    rf = RandomForestClassifier(n_estimators=i)
    rf_score = cross_val_score(rf, cancer.data, cancer.target, cv=10).mean()
    rf_scores.append(rf_score)
plt.figure()
plt.title('Random Forest')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.plot(range(1, 50), rf_scores)
plt.show()
可视化结果:
⚽⚽⚽⚽2.使用sklearn的AdaBoost和GradientBoost对乳腺癌数据进行分类,要求:
两个方法的用法相同,下面以其中一个(AdaBoost)为例说明,另一个(Gradient Boost)仅附完整代码
🎖🎖🎖(1) 导入乳腺癌数据集
# First, import the built-in datasets module
from sklearn.datasets import load_breast_cancer
# Then load the breast cancer dataset (a Bunch with .data / .target)
cancer = load_breast_cancer()
🎲🎲🎰(2) 测试learning_rate参数对分类效果的影响;
# Test how the learning_rate parameter affects classification accuracy:
# sweep the rate, refit, and record the hold-out score each time.
abc_scores = []
for i in np.arange(0.1, 1, 0.05):
    abc.learning_rate = i  # update the hyperparameter, then refit
    abc.fit(x_train, y_train)
    abc_score = abc.score(x_test, y_test)
    abc_scores.append(abc_score)
👓👓🕶(3) 测试n_estimators参数对分类效果的影响。
# Test how the n_estimators parameter affects classification accuracy.
# BUG FIX: the original assigned to `estimators_`, the fitted list of
# weak learners that fit() recreates, so the loop never changed the
# model; `n_estimators` is the actual hyperparameter.
abc_scores = []
for i in range(1, 50):
    abc.n_estimators = i
    abc.fit(x_train, y_train)
    abc_score = abc.score(x_test, y_test)
    abc_scores.append(abc_score)
完整代码:
1.AdaBoost
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
import numpy as np

# Load the breast cancer dataset and hold out 30% for testing
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=1)

# Plain decision tree as the baseline; AdaBoost uses decision trees as
# its weak learners
dt = DecisionTreeClassifier()
abc = AdaBoostClassifier(DecisionTreeClassifier(), algorithm='SAMME.R', n_estimators=20, learning_rate=0.1)
abc.fit(x_train, y_train)
dt.fit(x_train, y_train)
score_abc = abc.score(x_test, y_test)
score_dt = dt.score(x_test, y_test)
# Print the hold-out accuracies
print('Ada Boost : ', score_abc)
print('Decision Tree : ', score_dt)

# Test how the learning_rate parameter affects classification accuracy
abc_scores = []
learning_rates = np.arange(0.1, 1, 0.05)
for i in learning_rates:
    abc.learning_rate = i  # update the hyperparameter, then refit
    abc.fit(x_train, y_train)
    abc_score = abc.score(x_test, y_test)
    abc_scores.append(abc_score)
# Plot the results (x-axis shows the actual learning rates, matching the
# axis label; the original plotted against list indices)
plt.figure()
plt.title('AdaBoost')
plt.xlabel('learning_rate')
plt.ylabel('Accuracy')
plt.plot(learning_rates, abc_scores)
plt.show()

# Test how the n_estimators parameter affects classification accuracy.
# BUG FIX: the original assigned to `estimators_`, the fitted list of
# weak learners that fit() recreates, so the loop never changed the
# model; `n_estimators` is the actual hyperparameter.
abc_scores = []
for i in range(1, 50):
    abc.n_estimators = i
    abc.fit(x_train, y_train)
    abc_score = abc.score(x_test, y_test)
    abc_scores.append(abc_score)
# Plot the results
plt.figure()
plt.title('AdaBoost')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.plot(range(1, 50), abc_scores)
plt.show()

# Compare the two models with repeated 10-fold cross-validation
abc_scores = []
dt_scores = []
for i in range(20):
    abc_score = cross_val_score(abc, cancer.data, cancer.target, cv=10).mean()
    abc_scores.append(abc_score)
    dt_score = cross_val_score(dt, cancer.data, cancer.target, cv=10).mean()
    dt_scores.append(dt_score)
# Plot the score comparison curves
plt.figure()
plt.title('AdaBoost VS Decision Tree')
plt.xlabel('Index')
plt.ylabel('Accuracy')
plt.plot(range(20), dt_scores, label='Decision Tree')
plt.plot(range(20), abc_scores, label='AdaBoost')
plt.legend()
plt.show()
可视化结果:
2.Gradient Boost
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
import numpy as np

# Load the breast cancer dataset and hold out 30% for testing
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=1)

# Gradient boosting classifier and a plain decision tree as the baseline
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
dt = DecisionTreeClassifier()
gbc.fit(x_train, y_train)
dt.fit(x_train, y_train)
score_gbc = gbc.score(x_test, y_test)
score_dt = dt.score(x_test, y_test)
# Print the hold-out accuracies
print('Gradient Boost : ', score_gbc)
print('Decision Tree : ', score_dt)

# Test how the learning_rate parameter affects classification accuracy
gbc_scores = []
learning_rates = np.arange(0.1, 1, 0.05)
for i in learning_rates:
    gbc.learning_rate = i  # update the hyperparameter, then refit
    gbc.fit(x_train, y_train)
    gbc_score = gbc.score(x_test, y_test)
    gbc_scores.append(gbc_score)
# Plot the results (x-axis shows the actual learning rates, matching the
# axis label; the original plotted against list indices)
plt.figure()
plt.title('Gradient Boost')
plt.xlabel('learning_rate')
plt.ylabel('Accuracy')
plt.plot(learning_rates, gbc_scores)
plt.show()

# Test how the n_estimators parameter affects classification accuracy.
# BUG FIX: the original assigned to `estimators_`, the fitted array of
# trees that fit() recreates, so the loop never changed the model;
# `n_estimators` is the actual hyperparameter.
gbc_scores = []
for i in range(1, 50):
    gbc.n_estimators = i
    gbc.fit(x_train, y_train)
    gbc_score = gbc.score(x_test, y_test)
    gbc_scores.append(gbc_score)
# Plot the results
plt.figure()
plt.title('Gradient Boost')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
plt.plot(range(1, 50), gbc_scores)
plt.show()

gbc_scores = []
dt_scores = []
# Compare the two models with repeated 10-fold cross-validation
for i in range(20):
    gbc_score = cross_val_score(gbc, cancer.data, cancer.target, cv=10).mean()
    gbc_scores.append(gbc_score)
    dt_score = cross_val_score(dt, cancer.data, cancer.target, cv=10).mean()
    dt_scores.append(dt_score)
# Plot the score comparison curves
plt.figure()
plt.title('Gradient Boost VS Decision Tree')
plt.xlabel('Index')
plt.ylabel('Accuracy')
plt.plot(range(20), dt_scores, label='Decision Tree')
plt.plot(range(20), gbc_scores, label='Gradient Boost')
plt.legend()
plt.show()