Python打卡 DAY 11-CSDN博客

本文链接：https://blog.csdn.net/2401_86098690/article/details/147641220
超参数调整专题1
知识点回顾
1. 网格搜索
2. 随机搜索（简单介绍；非重点，实战中很少用到，可以不了解）
3. 贝叶斯优化（2种实现逻辑，以及如何避开必须用交叉验证的问题）
4. time库的计时模块，方便后人查看代码运行时长
今日作业：
对于信贷数据的其他模型（如 LightGBM 和 KNN），尝试使用贝叶斯优化和网格搜索进行调参
import pandas as pd    
import numpy as np     
import matplotlib.pyplot as plt   
import seaborn as sns  

# Plot configuration: select the SimHei font family and turn off the Unicode
# minus glyph (presumably so Chinese labels and negative tick values render
# correctly -- confirm SimHei is installed on the target machine).
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# ---------------------------------------------------------------------------
# Data loading and preprocessing.
# Reads data.csv, encodes the string columns as numbers and fills missing
# values in the int64/float64 columns with the column mode.
# ---------------------------------------------------------------------------
data = pd.read_csv('data.csv')

# Names of the string-typed (object dtype) columns in the raw file.
discrete_features = data.select_dtypes(include=['object']).columns.tolist()

# Label-encode 'Home Ownership'. Values missing from the mapping become NaN
# (Series.map semantics) and are filled by the mode-imputation step below.
home_ownership_mapping = {
    'Own Home': 1,
    'Rent': 2,
    'Have Mortgage': 3,
    'Home Mortgage': 4
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)

# Label-encode 'Years in current job' as an ordered scale from 1 to 11.
years_in_job_mapping = {
    '< 1 year': 1,
    '1 year': 2,
    '2 years': 3,
    '3 years': 4,
    '4 years': 5,
    '5 years': 6,
    '6 years': 7,
    '7 years': 8,
    '8 years': 9,
    '9 years': 10,
    '10+ years': 11
}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# One-hot encode 'Purpose'. The column set is recorded beforehand so the newly
# created dummy columns can be identified without re-reading the CSV.
original_columns = set(data.columns)
data = pd.get_dummies(data, columns=['Purpose'])
list_final = [col for col in data.columns if col not in original_columns]
for col in list_final:
    # get_dummies may return boolean columns; cast them to 0/1 integers.
    data[col] = data[col].astype(int)

# Binary-encode 'Term' and rename it so the column name states what 1 means.
term_mapping = {
    'Short Term': 0,
    'Long Term': 1
}
data['Term'] = data['Term'].map(term_mapping)
data = data.rename(columns={'Term': 'Long Term'})

# Fill missing values in every int64/float64 column with that column's mode.
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

for feature in continuous_features:
    mode_value = data[feature].mode()[0]
    # Assign the result back instead of calling fillna(inplace=True) on
    # data[feature]: the chained in-place form is deprecated and leaves `data`
    # unchanged when pandas Copy-on-Write is enabled.
    data[feature] = data[feature].fillna(mode_value)
 
# Split the data into training, validation and test sets, since the model is
# meant to be evaluated twice (validation and test).
# train_test_split produces only two parts per call, so it is called twice.
from sklearn.model_selection import train_test_split

y = data['Credit Default']                 # labels
X = data.drop(columns=['Credit Default'])  # features: every other column

# 8:1:1 split. First hold out 20% as a temporary set, then cut that
# temporary set in half to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)
import lightgbm as lgb  # LightGBM classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # classifier evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix  # classification report and confusion matrix
import warnings  # used to silence warnings
import time
warnings.filterwarnings("ignore")  # suppress all warnings

# Baseline: LightGBM with default hyperparameters, trained on the training set
# and evaluated on the test set. Training plus prediction is timed.
print("--- 1. 默认参数LightGBM (训练集 -> 测试集) ---")
start_time = time.time()  # record the start time
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)  # train on the training set
lgb_pred = lgb_model.predict(X_test)  # predict on the test set
end_time = time.time()  # record the end time
print(f"训练与预测耗时: {end_time - start_time:.4f} 秒")
# Report on lgb_pred, the predictions computed above; the headers name
# LightGBM because that is the model being evaluated.
print("\n默认LightGBM 在测试集上的分类报告：")
print(classification_report(y_test, lgb_pred))
print("默认LightGBM 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, lgb_pred))
print("\n--- 2. 网格搜索优化LightGBM (训练集 -> 测试集) ---")
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is evaluated.
param_grid = dict(
    n_estimators=[50, 100, 200],      # number of trees
    max_depth=[-1, 10, 20, 30],       # maximum tree depth, -1 means unlimited
    num_leaves=[20, 31, 40],          # maximum number of leaves per tree
    learning_rate=[0.01, 0.1, 0.2],   # learning rate
)

# Exhaustive search scored by accuracy with 5-fold cross-validation,
# parallelised across all available CPU cores (n_jobs=-1).
grid_search = GridSearchCV(
    lgb.LGBMClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Time the search. Model construction and fitting are both handled inside the
# GridSearchCV object, so a single fit() on the training set runs everything.
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
print(f"网格搜索耗时: {end_time - start_time:.4f} 秒")
print("最佳参数: ", grid_search.best_params_)  # best parameter combination found

# Evaluate the best estimator found by the search on the test set.
best_model = grid_search.best_estimator_
best_pred = best_model.predict(X_test)
print("\n网格搜索优化后的LightGBM分类器 在测试集上的分类报告：")
print(classification_report(y_test, best_pred))
print("网格搜索优化后的LightGBM分类器 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, best_pred))
# --- 3. Bayesian optimisation of the LightGBM classifier ---
# The banner names LightGBM and uses section number 3, matching the model
# actually tuned here and the two sections printed before it.
print("\n--- 3. 贝叶斯优化LightGBM (训练集 -> 测试集) ---")
from skopt import BayesSearchCV
from skopt.space import Integer, Real

# Search space: ranges to sample from rather than a fixed grid.
search_space = {
    'n_estimators': Integer(50, 200),  # number of trees
    'max_depth': Integer(-1, 30),  # maximum tree depth, -1 means unlimited
    'num_leaves': Integer(20, 50),  # maximum number of leaves per tree
    'learning_rate': Real(0.01, 0.2, prior='log-uniform'),  # learning rate, sampled log-uniformly
}

# Build the Bayesian search object.
bayes_search = BayesSearchCV(
    estimator=lgb.LGBMClassifier(random_state=42),
    search_spaces=search_space,
    n_iter=32,  # number of parameter settings sampled; adjust as needed
    cv=5,  # 5-fold cross-validation; per the original author's note this must not be 1
    n_jobs=-1,
    scoring='accuracy',
    random_state=42,  # seed the optimiser so repeated runs sample the same points
)

start_time = time.time()
# Run the Bayesian search on the training set.
bayes_search.fit(X_train, y_train)
end_time = time.time()
print(f"贝叶斯优化耗时: {end_time - start_time:.4f} 秒")
print("最佳参数: ", bayes_search.best_params_)

# Evaluate the best estimator found by the search on the test set.
best_model = bayes_search.best_estimator_
best_pred = best_model.predict(X_test)
print("\n贝叶斯优化后的LightGBM分类器 在测试集上的分类报告：")
print(classification_report(y_test, best_pred))
print("贝叶斯优化后的LightGBM分类器 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, best_pred))
@浙大疏锦行