该笔记为个人学习笔记,看的课程是B站-数学建模老哥:8 Python优化模型选择_哔哩哔哩_bilibili
数据集来源:数学建模老哥-python基础和机器学习(四)数据导入+数据理解+数据可视化-CSDN博客
目录
2.4.6模型优化
2.4.6.1集成算法
2.4.6.1.1袋装算法
2.4.6.1.1.1袋装决策树
# Bagged decision trees: many trees trained on bootstrap samples, majority vote.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the Pima Indians diabetes dataset (8 feature columns + class label).
csv_path = 'pima.csv'
columns = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
frame = read_csv(csv_path, names=columns)
values = frame.values
features = values[:, 0:8]
labels = values[:, 8]

# 10-fold cross-validation with a fixed seed for reproducibility.
folds = KFold(n_splits=10, random_state=7, shuffle=True)

# Base learner: a single CART decision tree; the bagging ensemble combines the
# predictions of 100 such trees to improve accuracy and stability.
base_tree = DecisionTreeClassifier()
bagging = BaggingClassifier(estimator=base_tree, n_estimators=100, random_state=7)

# Report the mean accuracy over the 10 folds.
scores = cross_val_score(bagging, features, labels, cv=folds)
print(scores.mean())
输出结果:
0.7578263841421736
2.4.6.1.1.2随机森林
# Random forest: bagged decision trees with a random feature subset per split.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the Pima Indians diabetes dataset (8 feature columns + class label).
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]

# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

# NOTE: the original created an unused `cart = DecisionTreeClassifier()` here.
# RandomForestClassifier builds its own trees internally, so it was removed.
num_tree = 100
max_features = 3  # number of features considered at each split
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)

# Evaluate the model with cross-validation and report the mean accuracy.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
输出结果:
0.759107997265892
2.4.6.1.1.3极端随机树
# Extremely randomized trees (Extra-Trees): like a random forest, but split
# thresholds are also chosen at random, further decorrelating the trees.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the Pima Indians diabetes dataset (8 feature columns + class label).
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]

# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

# NOTE: the original created an unused `cart = DecisionTreeClassifier()` here.
# ExtraTreesClassifier builds its own trees internally, so it was removed.
num_tree = 100
max_features = 7  # the notes use (nearly) all 8 features for extra trees
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)

# Evaluate the model with cross-validation and report the mean accuracy.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
输出结果:
0.7682501708817498
2.4.6.1.1.4总代码
# Bagged decision trees: combine many trees trained on bootstrap samples.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
# First 8 columns are features, the 9th column is the class label.
X = array[:, 0:8]
Y = array[:, 8]
# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
# Base model -- a single CART decision tree.
cart = DecisionTreeClassifier()
num_tree = 100
# BaggingClassifier is an ensemble classifier: it combines the predictions of
# several base classifiers (decision trees here) to improve overall accuracy and
# stability. estimator=cart sets the base classifier created above;
# n_estimators=num_tree is the number of base classifiers, here 100 trees.
model = BaggingClassifier(estimator=cart, n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
# Random forest: bagged decision trees with a random feature subset per split.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the Pima Indians diabetes dataset (8 feature columns + class label).
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]

# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

# NOTE: the original created an unused `cart = DecisionTreeClassifier()` here.
# RandomForestClassifier builds its own trees internally, so it was removed.
num_tree = 100
max_features = 3  # number of features considered at each split
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)

# Evaluate the model with cross-validation and report the mean accuracy.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
# Extremely randomized trees (Extra-Trees): like a random forest, but split
# thresholds are also chosen at random, further decorrelating the trees.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the Pima Indians diabetes dataset (8 feature columns + class label).
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]

# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

# NOTE: the original created an unused `cart = DecisionTreeClassifier()` here.
# ExtraTreesClassifier builds its own trees internally, so it was removed.
num_tree = 100
max_features = 7  # the notes use (nearly) all 8 features for extra trees
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)

# Evaluate the model with cross-validation and report the mean accuracy.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
2.4.6.1.2提升算法
2.4.6.1.2.1AdaBoost
# AdaBoost: boosting ensemble that reweights samples toward previous mistakes.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier # NOTE(review): unused import in this snippet
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
# First 8 columns are features, the 9th column is the class label.
X = array[:, 0:8]
Y = array[:, 8]
# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
# 30 boosting rounds (weak learners).
num_tree =30
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
输出结果:
0.7552802460697198
2.4.6.1.2.2随机梯度提升
# Stochastic gradient boosting: trees are added sequentially, each one fit to
# the errors of the ensemble built so far.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the Pima Indians diabetes dataset (8 feature columns + class label).
dataset = read_csv('pima.csv', names=['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'])
matrix = dataset.values
X_data = matrix[:, 0:8]
y_data = matrix[:, 8]

# 10-fold cross-validation with a fixed seed for reproducibility.
cv_split = KFold(n_splits=10, random_state=7, shuffle=True)

# 100 boosting stages.
booster = GradientBoostingClassifier(n_estimators=100, random_state=7)

# Report the mean accuracy over the folds.
fold_scores = cross_val_score(booster, X_data, y_data, cv=cv_split)
print(fold_scores.mean())
输出结果:
0.7578947368421053
2.4.6.1.2.3总代码
# AdaBoost: boosting ensemble that reweights samples toward previous mistakes.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier # NOTE(review): unused import in this snippet
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
# First 8 columns are features, the 9th column is the class label.
X = array[:, 0:8]
Y = array[:, 8]
# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
# 30 boosting rounds (weak learners).
num_tree =30
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
# Stochastic gradient boosting: trees are added sequentially, each one fit to
# the errors of the ensemble built so far.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier # NOTE(review): unused import in this snippet
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
# First 8 columns are features, the 9th column is the class label.
X = array[:, 0:8]
Y = array[:, 8]
# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
# 100 boosting stages.
num_tree =100
model = GradientBoostingClassifier(n_estimators=num_tree, random_state=seed)
# Evaluate the model with cross-validation.
result = cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
2.4.6.1.3投票算法
# Voting ensemble: combine several different classifiers by majority vote.
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression # NOTE(review): LinearRegression is unused

# Load the Pima Indians diabetes dataset (8 feature columns + class label).
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]

# 10-fold cross-validation with a fixed seed for reproducibility.
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)

# Build the list of (name, estimator) pairs that will vote.
# NOTE: the original also created a standalone unused `cart = DecisionTreeClassifier()`;
# it was removed because VotingClassifier only uses the estimators listed below.
models = []
model_logistic = LogisticRegression()
models.append(('logistic', model_logistic))
model_cart = DecisionTreeClassifier()
models.append(('cart', model_cart))
model_svc = SVC()
models.append(('svm', model_svc))
ensemble_model = VotingClassifier(estimators=models)

# Evaluate the ensemble with cross-validation and report the mean accuracy.
result = cross_val_score(ensemble_model, X, Y, cv=kfold)
print(result.mean())
输出结果:
0.773479152426521
2.4.6.2算法调参
2.4.6.2.1机器学习算法调参
2.4.6.2.2网格搜索优化调参
# Grid-search hyper-parameter tuning
from pandas import read_csv # read CSV files
from sklearn.linear_model import Ridge # Ridge regression model
from sklearn.model_selection import GridSearchCV # exhaustive grid-search tool
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Convert the DataFrame to a NumPy array for easier slicing.
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
# Initialize the Ridge regression model.
# NOTE(review): Ridge is a regressor but 'class' is a 0/1 label, so the
# cross-validation score below is R^2 -- which is why it looks low (~0.28).
model = Ridge()
# Parameter grid to sweep.
# We sweep Ridge's alpha; alpha is the regularization strength (larger alpha =
# stronger regularization). It is NOT the inverse of the strength.
param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0]}
# Initialize the GridSearchCV object:
# estimator is the model to tune; param_grid lists the values to try.
grid = GridSearchCV(estimator=model, param_grid=param_grid)
# Fit: evaluates every parameter combination in param_grid via cross-validation
# and keeps the best one.
grid.fit(X, Y)
# grid.best_score_ is the highest cross-validation score found;
# grid.best_estimator_ is the refit best model, from which we read the best alpha.
print('最高得分: %.2f' % grid.best_score_)
print('最优参数: %s' % grid.best_estimator_.alpha)
输出结果:
最高得分: 0.28
最优参数: 1
2.4.6.2.3随机搜索优化调参
# Randomized-search hyper-parameter tuning
from pandas import read_csv
from sklearn.linear_model import Ridge # Ridge regression model
from sklearn.model_selection import RandomizedSearchCV # randomized search tool
from scipy.stats import uniform # uniform distribution defining the search space
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Convert the DataFrame to a NumPy array for easier slicing.
array = data.values
X = array[:, 0:8] # feature matrix with 8 features
Y = array[:, 8] # target variable array
# Initialize the Ridge regression model.
model = Ridge()
# Parameter distribution to sample from.
# uniform(loc=0, scale=10) samples alpha uniformly from the interval [0, 10).
# Widen or narrow the range depending on the data.
param_grid = {'alpha': uniform(loc=0, scale=10)} # adjust the range as needed
# Initialize the RandomizedSearchCV object:
# estimator: the model to tune;
# param_distributions: the parameter-distribution dictionary;
# n_iter: how many random settings to sample;
# random_state: fixed for reproducible results.
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, random_state=7)
# Fit: trains and cross-validates each sampled parameter setting.
grid.fit(X, Y)
# grid.best_score_ is the best cross-validation score among the sampled settings;
# grid.best_estimator_ is the refit best model, from which we read the best alpha.
print('最高得分: %.3f' % grid.best_score_) # three decimals for a finer score
print('最优参数: %s' % grid.best_estimator_.alpha) # print the best alpha value
输出结果:
最高得分: 0.276
最优参数: 9.064232691643387
2.4.6.2.4总结
2.4.6.2.5总代码
# Grid-search hyper-parameter tuning
from pandas import read_csv # read CSV files
from sklearn.linear_model import Ridge # Ridge regression model
from sklearn.model_selection import GridSearchCV # exhaustive grid-search tool
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Convert the DataFrame to a NumPy array for easier slicing.
array = data.values
X = array[:, 0:8]
Y = array[:, 8]
# Initialize the Ridge regression model.
# NOTE(review): Ridge is a regressor but 'class' is a 0/1 label, so the
# cross-validation score below is R^2 -- which is why it looks low (~0.28).
model = Ridge()
# Parameter grid to sweep.
# We sweep Ridge's alpha; alpha is the regularization strength (larger alpha =
# stronger regularization). It is NOT the inverse of the strength.
param_grid = {'alpha': [1, 0.1, 0.01, 0.001, 0]}
# Initialize the GridSearchCV object:
# estimator is the model to tune; param_grid lists the values to try.
grid = GridSearchCV(estimator=model, param_grid=param_grid)
# Fit: evaluates every parameter combination in param_grid via cross-validation
# and keeps the best one.
grid.fit(X, Y)
# grid.best_score_ is the highest cross-validation score found;
# grid.best_estimator_ is the refit best model, from which we read the best alpha.
print('最高得分: %.2f' % grid.best_score_)
print('最优参数: %s' % grid.best_estimator_.alpha)
# Randomized-search hyper-parameter tuning
from pandas import read_csv
from sklearn.linear_model import Ridge # Ridge regression model
from sklearn.model_selection import RandomizedSearchCV # randomized search tool
from scipy.stats import uniform # uniform distribution defining the search space
filename = 'pima.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Convert the DataFrame to a NumPy array for easier slicing.
array = data.values
X = array[:, 0:8] # feature matrix with 8 features
Y = array[:, 8] # target variable array
# Initialize the Ridge regression model.
model = Ridge()
# Parameter distribution to sample from.
# uniform(loc=0, scale=10) samples alpha uniformly from the interval [0, 10).
# Widen or narrow the range depending on the data.
param_grid = {'alpha': uniform(loc=0, scale=10)} # adjust the range as needed
# Initialize the RandomizedSearchCV object:
# estimator: the model to tune;
# param_distributions: the parameter-distribution dictionary;
# n_iter: how many random settings to sample;
# random_state: fixed for reproducible results.
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, random_state=7)
# Fit: trains and cross-validates each sampled parameter setting.
grid.fit(X, Y)
# grid.best_score_ is the best cross-validation score among the sampled settings;
# grid.best_estimator_ is the refit best model, from which we read the best alpha.
print('最高得分: %.3f' % grid.best_score_) # three decimals for a finer score
print('最优参数: %s' % grid.best_estimator_.alpha) # print the best alpha value