Hyperparameter Tuning, Part 1
Knowledge review
1. Grid search
2. Random search (brief introduction, not a focus: it is rarely used in practice, so feel free to skip it; a short sketch appears after the grid-search sections below)
3. Bayesian optimization (two implementation styles, and how to avoid its built-in cross-validation requirement; a sketch of the no-CV style follows the homework note)
4. The time library's timing utilities, so later readers can see how long the code takes to run
Homework: for the other models on the credit data, such as LightGBM and KNN, try Bayesian optimization and grid search.
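To make point 3 concrete: BayesSearchCV, used later in this section, always cross-validates internally. scikit-optimize also exposes a functional API, gp_minimize, where you write the objective yourself, so you can score on a single hold-out validation split and avoid cross-validation entirely. The following is a minimal sketch, assuming scikit-optimize is installed; the synthetic dataset exists only to make the snippet self-contained.
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from skopt.utils import use_named_args
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Illustrative data only; in this notebook you would reuse X_train / y_train instead
X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_demo, y_demo, test_size=0.2, random_state=42)

space = [Integer(3, 20, name='n_neighbors'),
         Categorical(['uniform', 'distance'], name='weights')]

@use_named_args(space)
def objective(**params):
    model = KNeighborsClassifier(**params).fit(X_tr, y_tr)
    # gp_minimize minimizes, so return the negative hold-out accuracy
    return -accuracy_score(y_val, model.predict(X_val))

result = gp_minimize(objective, space, n_calls=20, random_state=42)
print("Best hold-out accuracy:", -result.fun)
print("Best parameter values:", result.x)
The same pattern works for LightGBM: swap in the estimator and its search space.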
Import the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Configure a Chinese-capable font (so Chinese labels render in plots)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
data = pd.read_csv('data.csv')
Encode the discrete features
discrete_features = data.select_dtypes(include=['object']).columns.tolist()
home_ownership_mapping = {
'Own Home': 1,
'Rent': 2,
'Have Mortgage': 3,
'Home Mortgage': 4
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)
years_in_job_mapping = {
'< 1 year': 1,
'1 year': 2,
'2 years': 3,
'3 years': 4,
'4 years': 5,
'5 years': 6,
'6 years': 7,
'7 years': 8,
'8 years': 9,
'9 years': 10,
'10+ years': 11
}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)
data = pd.get_dummies(data, columns=['Purpose'])
data2 = pd.read_csv("data.csv")  # re-read the raw data to compare column names
list_final = []  # empty list to collect the feature names added by one-hot encoding
for i in data.columns:
    if i not in data2.columns:
        list_final.append(i)  # columns present only in data are the new one-hot features
for i in list_final:
    data[i] = data[i].astype(int)  # convert the new one-hot columns to int
term_mapping = {
'Short Term': 0,
'Long Term': 1
}
data['Term'] = data['Term'].map(term_mapping)
data.rename(columns={'Term': 'Long Term'}, inplace=True)  # rename the column to match the new 0/1 encoding
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()  # collect the numeric column names into a list
# Fill missing values in the continuous features with each column's mode
for feature in continuous_features:
    mode_value = data[feature].mode()[0]  # the most frequent value in this column
    data[feature] = data[feature].fillna(mode_value)  # fill the column's missing values with it
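If the median is preferred for the numeric columns (it is more robust to outliers than the mode), a one-line variant of the same loop, interchangeable with the version above:
# Alternative: fill numeric missing values with each column's median instead of its mode
for feature in continuous_features:
    data[feature] = data[feature].fillna(data[feature].median())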
Split the dataset
from sklearn.model_selection import train_test_split
X = data.drop(['Credit Default'], axis=1)  # features; axis=1 drops a column
y = data['Credit Default']  # label
# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.neighbors import KNeighborsClassifier  # K-nearest-neighbors classifier
import lightgbm as lgb  # LightGBM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # classification metrics
from sklearn.metrics import classification_report, confusion_matrix  # classification report and confusion matrix
import warnings
warnings.filterwarnings("ignore")  # silence all warnings
Model training
KNN
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)
print("\nKNN 分类报告:")
print(classification_report(y_test, knn_pred))
print("KNN 混淆矩阵:")
print(confusion_matrix(y_test, knn_pred))
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)
print("KNN 模型评估指标:")
print(f"准确率: {knn_accuracy:.4f}")
print(f"精确率: {knn_precision:.4f}")
print(f"召回率: {knn_recall:.4f}")
print(f"F1 值: {knn_f1:.4f}")
LightGBM
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
print("\nLightGBM 分类报告:")
print(classification_report(y_test, lgb_pred))
print("LightGBM 混淆矩阵:")
print(confusion_matrix(y_test, lgb_pred))
lgb_accuracy = accuracy_score(y_test, lgb_pred)
lgb_precision = precision_score(y_test, lgb_pred)
lgb_recall = recall_score(y_test, lgb_pred)
lgb_f1 = f1_score(y_test, lgb_pred)
print("LightGBM 模型评估指标:")
print(f"准确率: {lgb_accuracy:.4f}")
print(f"精确率: {lgb_precision:.4f}")
print(f"召回率: {lgb_recall:.4f}")
print(f"F1 值: {lgb_f1:.4f}")
Hyperparameter tuning
Grid search
KNN
print("\n--- 2. 网格搜索优化KNN (训练集 -> 测试集) ---")
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# Define the KNN parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],         # number of neighbors
    'weights': ['uniform', 'distance'],  # how neighbor votes are weighted
    'p': [1, 2],                         # distance metric (1 = Manhattan, 2 = Euclidean)
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
# Create the grid-search object
grid_search = GridSearchCV(
estimator=KNeighborsClassifier(),
param_grid=param_grid,
cv=5,
n_jobs=-1,
scoring='accuracy'
)
# Run the grid search
grid_search.fit(X_train, y_train)
# Report the best parameters and evaluate on the test set
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
best_pred = best_model.predict(X_test)
print("\n网格搜索优化后的 KNN 在测试集上的分类报告:")
print(classification_report(y_test, best_pred))
print("网格搜索优化后的 KNN 在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, best_pred))
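Beyond best_params_, GridSearchCV keeps the full cross-validated results for every candidate in its cv_results_ attribute. A small sketch of viewing the top candidates as a DataFrame (the column keys are standard scikit-learn ones):
# Inspect all candidate parameter combinations, best-ranked first
results_df = pd.DataFrame(grid_search.cv_results_)
cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(results_df[cols].sort_values('rank_test_score').head())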
LightGBM
print("\n--- 网格搜索优化 LightGBM (训练集 -> 测试集) ---")
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix
import time
# Define the LightGBM parameter grid (only parameters LightGBM supports)
param_grid = {
    'num_leaves': [20, 30, 40],       # number of leaves (controls model complexity)
    'max_depth': [3, 5, 7],           # maximum tree depth
    'learning_rate': [0.05, 0.1],     # learning rate
    'n_estimators': [50, 100, 200],   # number of trees
    'subsample': [0.8, 1.0],          # row subsampling ratio (guards against overfitting)
    'feature_fraction': [0.8, 1.0]    # feature subsampling ratio (guards against overfitting)
}
# Initialize the LightGBM model (binary objective and a fixed random seed)
lgb_model = LGBMClassifier(
    objective='binary',  # binary classification (use 'multiclass' for multi-class tasks)
    random_state=42
)
# Create the grid-search object
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=5,               # 5-fold cross-validation
    n_jobs=-1,          # use all CPU cores
    scoring='accuracy'  # optimization target (could also be 'f1', 'precision', etc.)
)
# Run the grid search and time it
start_time = time.time()
grid_search.fit(X_train, y_train)  # the search only runs when fit is called
end_time = time.time()
print(f"Grid search took {end_time - start_time:.2f} seconds")
# Report the best parameters
print("Best parameters:", grid_search.best_params_)
# Predict with the best model
best_lgb = grid_search.best_estimator_
y_pred = best_lgb.predict(X_test)
# Evaluate
print("\nClassification report for the grid-search-tuned LightGBM:")
print(classification_report(y_test, y_pred))
print("Confusion matrix for the grid-search-tuned LightGBM:")
print(confusion_matrix(y_test, y_pred))
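Point 2 of the knowledge review mentions random search, which the rest of this section skips. For completeness, here is a minimal sketch using scikit-learn's RandomizedSearchCV; the distributions below are illustrative assumptions, not tuned recommendations.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Random search samples a fixed number of combinations instead of enumerating a grid
param_distributions = {
    'num_leaves': randint(10, 100),        # integers sampled uniformly from [10, 100)
    'learning_rate': uniform(0.01, 0.29),  # floats sampled uniformly from [0.01, 0.30)
    'n_estimators': randint(50, 500),
}
random_search = RandomizedSearchCV(
    estimator=LGBMClassifier(objective='binary', random_state=42),
    param_distributions=param_distributions,
    n_iter=20,          # number of sampled combinations
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    random_state=42
)
random_search.fit(X_train, y_train)
print("Random search best parameters:", random_search.best_params_)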
Bayesian optimization
KNN
print("\n--- 贝叶斯优化 KNN (训练集 -> 测试集) ---")
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import time
# Define the search space (only parameters KNN supports)
search_space = {
    'n_neighbors': Integer(3, 20),                    # number of neighbors: 3 to 20
    'weights': Categorical(['uniform', 'distance']),  # weighting scheme
    'p': Integer(1, 2),                               # distance metric: 1 (Manhattan) or 2 (Euclidean)
    'algorithm': Categorical(['auto', 'kd_tree', 'ball_tree', 'brute'])
}
# Create the Bayesian-search object
bayes_search = BayesSearchCV(
    estimator=KNeighborsClassifier(),
    search_spaces=search_space,
    n_iter=32,           # number of iterations (at least 20 to 50 is advisable)
    cv=5,                # 5-fold cross-validation
    n_jobs=-1,           # use all CPU cores
    scoring='accuracy',  # optimization target (could also be 'f1', 'precision', etc.)
    random_state=42      # random seed for reproducibility
)
# Run the Bayesian optimization and time it
start_time = time.time()
bayes_search.fit(X_train, y_train)  # the search only runs when fit is called
end_time = time.time()
print(f"Bayesian optimization took {end_time - start_time:.2f} seconds")
# Report the best parameters
print("Best parameters:", bayes_search.best_params_)
# Predict with the best model
best_knn = bayes_search.best_estimator_
y_pred = best_knn.predict(X_test)
# Evaluate
print("\nClassification report for the Bayesian-optimized KNN:")
print(classification_report(y_test, y_pred))
print("Confusion matrix for the Bayesian-optimized KNN:")
print(confusion_matrix(y_test, y_pred))
LightGBM
print("\n--- 贝叶斯优化 LightGBM (训练集 -> 测试集) ---")
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix
import time
# Define the search space (parameters LightGBM supports)
search_space = {
    'num_leaves': Integer(10, 100),                   # number of leaves (controls model complexity)
    'max_depth': Integer(3, 10),                      # maximum tree depth (works together with num_leaves)
    'learning_rate': Real(0.01, 0.3, 'log-uniform'),  # learning rate (log-uniform prior)
    'n_estimators': Integer(50, 500),                 # number of trees
    'subsample': Real(0.6, 1.0),                      # row subsampling ratio (guards against overfitting)
    'feature_fraction': Real(0.6, 1.0),               # feature subsampling ratio (guards against overfitting)
    'reg_alpha': Real(0, 1.0),                        # L1 regularization weight
    'reg_lambda': Real(0, 1.0)                        # L2 regularization weight
}
# Initialize the LightGBM model (binary objective and a fixed random seed)
lgb_model = LGBMClassifier(
    objective='binary',  # binary classification (for multi-class, use 'multiclass' and set num_class)
    random_state=42,
    verbosity=-1         # silence LightGBM's training log
)
# Create the Bayesian-search object
bayes_search = BayesSearchCV(
    estimator=lgb_model,
    search_spaces=search_space,
    n_iter=50,           # number of iterations (at least 50 to 100 is advisable)
    cv=5,                # 5-fold cross-validation
    n_jobs=-1,           # use all CPU cores
    scoring='accuracy',  # optimization target (could also be 'f1', 'roc_auc', etc.)
    random_state=42      # for reproducibility
)
# Run the Bayesian optimization and time it
start_time = time.time()
bayes_search.fit(X_train, y_train)  # the search only runs when fit is called
end_time = time.time()
print(f"Bayesian optimization took {end_time - start_time:.2f} seconds")
# Report the best parameters
print("Best parameters:", bayes_search.best_params_)
# Predict with the best model
best_lgb = bayes_search.best_estimator_
y_pred = best_lgb.predict(X_test)
# Evaluate
print("\nClassification report for the Bayesian-optimized LightGBM:")
print(classification_report(y_test, y_pred))
print("Confusion matrix for the Bayesian-optimized LightGBM:")
print(confusion_matrix(y_test, y_pred))
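A quick sanity check worth running after any of the searches above: compare the best cross-validated score against the test-set score to spot overfitting to the CV folds. A minimal snippet using attributes that both GridSearchCV and BayesSearchCV expose:
# best_score_ is the mean cross-validated accuracy of the best parameter set
print(f"Best CV accuracy: {bayes_search.best_score_:.4f}")
print(f"Test-set accuracy: {accuracy_score(y_test, y_pred):.4f}")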
The screenshot cut off the best parameters, so here they are in full:
Best parameters: OrderedDict([('feature_fraction', 1.0), ('learning_rate', 0.01), ('max_depth', 10), ('n_estimators', 500), ('num_leaves', 10), ('reg_alpha', 1.0), ('reg_lambda', 1.0), ('subsample', 0.6)])
@浙大疏锦行