数学建模老哥 - Python Basics and Machine Learning (8): Optimization and Model Selection

These are personal study notes for the Bilibili course by 数学建模老哥: "8 Python优化模型选择" (bilibili).

Dataset source: 数学建模老哥-python基础和机器学习(四)数据导入+数据理解+数据可视化 (CSDN blog).

Contents

2.4.6 Model Optimization

2.4.6.1 Ensemble Algorithms

2.4.6.1.1 Bagging Algorithms

2.4.6.1.1.1 Bagged Decision Trees

2.4.6.1.1.2 Random Forest

2.4.6.1.1.3 Extra Trees

2.4.6.1.1.4 Full Code

2.4.6.1.2 Boosting Algorithms

2.4.6.1.2.1 AdaBoost

2.4.6.1.2.2 Stochastic Gradient Boosting

2.4.6.1.2.3 Full Code

2.4.6.1.3 Voting

2.4.6.2 Algorithm Hyperparameter Tuning

2.4.6.2.1 Hyperparameter Tuning for Machine Learning Algorithms

2.4.6.2.2 Grid Search Tuning

2.4.6.2.3 Random Search Tuning

2.4.6.2.4 Summary

2.4.6.2.5 Full Code


2.4.6 Model Optimization

2.4.6.1 Ensemble Algorithms

2.4.6.1.1 Bagging Algorithms

2.4.6.1.1.1 Bagged Decision Trees

# Bagged decision trees
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
# Base model: a decision tree
cart = DecisionTreeClassifier()
num_tree = 100
# BaggingClassifier is an ensemble classifier: it combines the predictions of many
# base classifiers (here, decision trees) to improve the model's accuracy and stability.
# estimator=cart sets the base classifier to the decision tree created above;
# n_estimators=num_tree sets the number of base classifiers (100 trees).
model = BaggingClassifier(estimator=cart, n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

Output:

0.7578263841421736
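
A side note on the bagging mechanism: because every tree sees only a bootstrap sample of the rows, the left-out ("out-of-bag") rows can score the ensemble without a separate cross-validation loop. A minimal sketch, reusing X, Y and the classes imported above; oob_score=True and the fitted oob_score_ attribute are standard BaggingClassifier features:

# Out-of-bag estimate (sketch; reuses X, Y from the block above)
bag = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, oob_score=True, random_state=7)
bag.fit(X, Y)
print(bag.oob_score_)  # OOB accuracy, typically close to the CV mean
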
2.4.6.1.1.2 Random Forest

# Random forest
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_tree = 100
# Each split considers a random subset of 3 features
max_features = 3
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

Output:

0.759107997265892
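
After fitting, a random forest also exposes per-feature importance scores, which is often useful when deciding which inputs matter. A minimal sketch, reusing X, Y and names from the block above; feature_importances_ is a standard attribute of a fitted RandomForestClassifier:

# Feature importances (sketch; reuses X, Y, names from the block above)
rf = RandomForestClassifier(n_estimators=100, max_features=3, random_state=7)
rf.fit(X, Y)
for name, score in zip(names[:8], rf.feature_importances_):
    print('%s: %.3f' % (name, score))
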
2.4.6.1.1.3 Extra Trees (Extremely Randomized Trees)

# Extra trees (extremely randomized trees)
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_tree = 100
max_features = 7  # extra trees are usually given (almost) all the features; here 7 of the 8
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

Output:

0.7682501708817498
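
If the intent is truly "use every feature", scikit-learn accepts max_features=None, which means all 8 columns of this dataset. A minimal sketch of that variant, reusing X, Y and kfold from the block above:

# Extra trees over all features (sketch)
et = ExtraTreesClassifier(n_estimators=100, max_features=None, random_state=7)
print(cross_val_score(et, X, Y, cv=kfold).mean())
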
2.4.6.1.1.4 Full Code
# Bagged decision trees
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
# Base model: a decision tree
cart = DecisionTreeClassifier()
num_tree = 100
# BaggingClassifier is an ensemble classifier: it combines the predictions of many
# base classifiers (here, decision trees) to improve the model's accuracy and stability.
# estimator=cart sets the base classifier to the decision tree created above;
# n_estimators=num_tree sets the number of base classifiers (100 trees).
model = BaggingClassifier(estimator=cart, n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

# Random forest
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_tree = 100
# Each split considers a random subset of 3 features
max_features = 3
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

# Extra trees (extremely randomized trees)
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_tree = 100
max_features = 7  # extra trees are usually given (almost) all the features; here 7 of the 8
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

2.4.6.1.2 Boosting Algorithms

2.4.6.1.2.1 AdaBoost

# AdaBoost
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_tree = 30
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

Output:

0.7552802460697198
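
AdaBoost builds its trees sequentially and re-weights the samples that earlier trees misclassified, so the tree count interacts with the learning rate. A minimal sketch, reusing X, Y and kfold from the block above; learning_rate is a standard AdaBoostClassifier parameter, and values below 1.0 shrink each tree's contribution:

# More trees with a smaller learning rate (sketch; settings are illustrative)
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=7)
print(cross_val_score(ada, X, Y, cv=kfold).mean())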

2.4.6.1.2.2 Stochastic Gradient Boosting

# Stochastic gradient boosting
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_tree = 100
model = GradientBoostingClassifier(n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

Output:

0.7578947368421053
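
Strictly speaking, the code above is plain gradient boosting: GradientBoostingClassifier only becomes "stochastic" when each tree is fit on a random subsample of the rows, and the default is subsample=1.0. A minimal sketch of the stochastic variant, reusing X, Y and kfold from the block above; subsample is a standard parameter:

# Stochastic variant: each tree sees 80% of the rows (sketch; 0.8 is illustrative)
sgb = GradientBoostingClassifier(n_estimators=100, subsample=0.8, random_state=7)
print(cross_val_score(sgb, X, Y, cv=kfold).mean())
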
2.4.6.1.2.3 Full Code
# AdaBoost
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_tree = 30
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

# Stochastic gradient boosting
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
num_tree = 100
model = GradientBoostingClassifier(n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())

2.4.6.1.3 Voting

# Voting ensemble
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
# Base models: logistic regression, a decision tree, and an SVM
models = []
model_logistic = LogisticRegression()
models.append(('logistic', model_logistic))
model_cart = DecisionTreeClassifier()
models.append(('cart', model_cart))
model_svc = SVC()
models.append(('svm', model_svc))
ensemble_model = VotingClassifier(estimators=models)
# Evaluate the ensemble with cross-validation
result = cross_val_score(ensemble_model, X, Y, cv=kfold)
print(result.mean())

Output:

0.773479152426521
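
By default VotingClassifier uses hard voting: each base model casts one vote for a class label. Averaging predicted probabilities instead (voting='soft') can work better when the base models output calibrated probabilities. A minimal sketch, reusing X, Y and kfold from the block above; note that SVC needs probability=True to expose predict_proba:

# Soft voting (sketch; reuses the imports and data from the block above)
soft_models = [('logistic', LogisticRegression()), ('cart', DecisionTreeClassifier()), ('svm', SVC(probability=True))]
soft_ensemble = VotingClassifier(estimators=soft_models, voting='soft')
print(cross_val_score(soft_ensemble, X, Y, cv=kfold).mean())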

2.4.6.2 Algorithm Hyperparameter Tuning

2.4.6.2.1 Hyperparameter Tuning for Machine Learning Algorithms
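
The two tuning strategies implemented below are grid search, which evaluates every combination in a fixed parameter grid, and random search, which samples a fixed number of candidate settings from parameter distributions.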

2.4.6.2.2 Grid Search Tuning

# Grid search tuning
from pandas import read_csv  # read the CSV file
from sklearn.linear_model import Ridge  # Ridge regression model
from sklearn.model_selection import GridSearchCV  # grid search tool
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Convert the DataFrame to a NumPy array for easier slicing
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
# Initialize the Ridge regression model
model = Ridge()
# Define the parameter grid to search over.
# Here we search over Ridge's alpha, the regularization strength:
# larger alpha means stronger regularization.
param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0]}
# Initialize the GridSearchCV object:
# estimator is the model to tune, param_grid is the grid to search
grid = GridSearchCV(estimator=model, param_grid=param_grid)
# fit() evaluates every parameter combination in param_grid with
# cross-validation and keeps the best one
grid.fit(X, Y)
# grid.best_score_ is the best cross-validation score found;
# grid.best_estimator_ is the best model, from which we can read the best parameters
print('Best score: %.2f' % grid.best_score_)
print('Best alpha: %s' % grid.best_estimator_.alpha)

Output:

Best score: 0.28
Best alpha: 1
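
GridSearchCV runs cross-validation internally (5 folds by default) and records the score of every candidate. A minimal sketch of two commonly used options, reusing X, Y and param_grid from the block above; cv, scoring, best_params_ and cv_results_ are all standard GridSearchCV features:

# Explicit CV folds and scoring metric (sketch)
grid = GridSearchCV(estimator=Ridge(), param_grid=param_grid, cv=10, scoring='r2')
grid.fit(X, Y)
print(grid.best_params_)  # the best parameter combination as a dict
print(grid.cv_results_['mean_test_score'])  # mean CV score per candidate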

2.4.6.2.3 Random Search Tuning

# Random search tuning
from pandas import read_csv
from sklearn.linear_model import Ridge  # Ridge regression model
from sklearn.model_selection import RandomizedSearchCV  # randomized search tool
from scipy.stats import uniform  # uniform distribution for the search space
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Convert the DataFrame to a NumPy array for easier slicing
array = data.values
X = array[:, 0:8]  # feature matrix with 8 features
Y = array[:, 8]  # target array
# Initialize the Ridge regression model
model = Ridge()
# Define the parameter distribution to sample from.
# uniform(loc=0, scale=10) samples alpha uniformly from [0, 10).
# The appropriate range for alpha depends on the data; widen or narrow it as needed.
param_grid = {'alpha': uniform(loc=0, scale=10)}
# Initialize the RandomizedSearchCV object:
# estimator is the model to tune,
# param_distributions is the dictionary of parameter distributions,
# n_iter is the number of random samples to draw,
# random_state makes the sampling reproducible
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, random_state=7)
# fit() evaluates the sampled parameter settings with cross-validation
grid.fit(X, Y)
# grid.best_score_ is the best cross-validation score among the sampled settings;
# grid.best_estimator_ is the best model, from which we can read the best parameters
print('Best score: %.3f' % grid.best_score_)  # three decimals for a more precise score
print('Best alpha: %s' % grid.best_estimator_.alpha)

Output:

Best score: 0.276
Best alpha: 9.064232691643387
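
When a parameter such as alpha can plausibly span several orders of magnitude, sampling it on a log scale usually covers the space better than a flat uniform range. A minimal sketch, reusing X, Y from the block above; scipy.stats.loguniform is a standard distribution (scipy >= 1.4), and the bounds here are illustrative:

# Log-scale sampling for alpha (sketch)
from scipy.stats import loguniform
param_dist = {'alpha': loguniform(1e-3, 1e2)}
search = RandomizedSearchCV(estimator=Ridge(), param_distributions=param_dist, n_iter=100, random_state=7)
search.fit(X, Y)
print(search.best_estimator_.alpha)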

2.4.6.2.4 Summary
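
In short: grid search exhaustively evaluates every combination in a fixed grid, which is reliable but grows combinatorially with the number of parameters; random search evaluates only n_iter sampled settings, so it scales to large or continuous search spaces at the cost of possibly missing the exact optimum. On this dataset the two approaches found nearly the same best score (0.28 vs. 0.276).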

2.4.6.2.5 Full Code

# Grid search tuning
from pandas import read_csv  # read the CSV file
from sklearn.linear_model import Ridge  # Ridge regression model
from sklearn.model_selection import GridSearchCV  # grid search tool
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Convert the DataFrame to a NumPy array for easier slicing
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
# Initialize the Ridge regression model
model = Ridge()
# Define the parameter grid to search over.
# Here we search over Ridge's alpha, the regularization strength:
# larger alpha means stronger regularization.
param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0]}
# Initialize the GridSearchCV object:
# estimator is the model to tune, param_grid is the grid to search
grid = GridSearchCV(estimator=model, param_grid=param_grid)
# fit() evaluates every parameter combination in param_grid with
# cross-validation and keeps the best one
grid.fit(X, Y)
# grid.best_score_ is the best cross-validation score found;
# grid.best_estimator_ is the best model, from which we can read the best parameters
print('Best score: %.2f' % grid.best_score_)
print('Best alpha: %s' % grid.best_estimator_.alpha)

# Random search tuning
from pandas import read_csv
from sklearn.linear_model import Ridge  # Ridge regression model
from sklearn.model_selection import RandomizedSearchCV  # randomized search tool
from scipy.stats import uniform  # uniform distribution for the search space
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Convert the DataFrame to a NumPy array for easier slicing
array = data.values
X = array[:, 0:8]  # feature matrix with 8 features
Y = array[:, 8]  # target array
# Initialize the Ridge regression model
model = Ridge()
# Define the parameter distribution to sample from.
# uniform(loc=0, scale=10) samples alpha uniformly from [0, 10).
# The appropriate range for alpha depends on the data; widen or narrow it as needed.
param_grid = {'alpha': uniform(loc=0, scale=10)}
# Initialize the RandomizedSearchCV object:
# estimator is the model to tune,
# param_distributions is the dictionary of parameter distributions,
# n_iter is the number of random samples to draw,
# random_state makes the sampling reproducible
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, random_state=7)
# fit() evaluates the sampled parameter settings with cross-validation
grid.fit(X, Y)
# grid.best_score_ is the best cross-validation score among the sampled settings;
# grid.best_estimator_ is the best model, from which we can read the best parameters
print('Best score: %.3f' % grid.best_score_)  # three decimals for a more precise score
print('Best alpha: %s' % grid.best_estimator_.alpha)
