机器学习——集成学习进阶

最新推荐文章于 2025-06-06 22:44:25 发布

醋酸洋红就是我

最新推荐文章于 2025-06-06 22:44:25 发布

阅读量259

点赞数

分类专栏：机器学习文章标签：机器学习集成学习人工智能

本文链接：https://blog.csdn.net/qq_40527560/article/details/131431572

版权

机器学习专栏收录该内容

12 篇文章

订阅专栏

XGBoost

极端梯度提升树
在这里插入图片描述

目标函数

在这里插入图片描述

XGBoost回归树构建方法

在这里插入图片描述

XGBoost和GBDT的区别

在这里插入图片描述

api介绍

在这里插入图片描述

XGBoost案例

在决策树中的机器学习部分代码进行修改

# 4. Train an XGBoost model
# 4.1 Baseline model with default hyper-parameters
from xgboost import XGBClassifier

# fit() returns the estimator itself, so construction and training are chained.
xg = XGBClassifier().fit(x_train, y_train)

# Mean accuracy on the held-out test split
xg.score(x_test, y_test)

0.7832699619771863

# 4.2 Tune max_depth

depth_range = range(10)
score = []

for depth in depth_range:
    # One freshly trained classifier per candidate depth
    xg = XGBClassifier(eta=1, gamma=0, max_depth=depth).fit(x_train, y_train)

    accuracy = xg.score(x_test, y_test)
    print(accuracy)
    score.append(accuracy)

0.6311787072243346
0.7908745247148289
0.7870722433460076
0.7832699619771863
0.7870722433460076
0.7908745247148289
0.7908745247148289
0.7946768060836502
0.7908745247148289
0.7946768060836502

# 4.3 Visualise the tuning result
import matplotlib.pyplot as plt

# Test accuracy as a function of max_depth
plt.plot(list(depth_range), score)
plt.show()

在这里插入图片描述

otto案例——xgboost实现

在这里插入图片描述

# Split the data with StratifiedShuffleSplit

from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
features = X_resampled.values
train_index, test_index = next(sss.split(features, y_resampled))
print(len(train_index))
print(len(test_index))

x_train, x_val = features[train_index], features[test_index]
y_train, y_val = y_resampled[train_index], y_resampled[test_index]

# Standardise the features
from sklearn.preprocessing import StandardScaler

# Statistics are learned from the training split only and reused for validation.
scaler = StandardScaler().fit(x_train)

x_train_scaled, x_val_scaled = (
    scaler.transform(part) for part in (x_train, x_val)
)

# Dimensionality reduction with PCA
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep enough components for 90% of the variance.
pca = PCA(n_components=0.9)

x_train_pca = pca.fit_transform(x_train_scaled)
x_val_pca = pca.transform(x_val_scaled)

# Plot how much variance is retained as components are added
cumulative_variance = pca.explained_variance_ratio_.cumsum()
plt.plot(cumulative_variance)

plt.xlabel("元素数量")
plt.ylabel("表达信息百分占比")

plt.show()

# Baseline model
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(x_train_pca, y_train)

# Log loss needs class probabilities, so use predict_proba rather than predict
y_pre_proba = xgb.predict_proba(x_val_pca)

# Evaluate with log loss
from sklearn.metrics import log_loss
# The `eps` argument was deprecated and later removed from scikit-learn's
# log_loss, so it is not passed; normalize=True (the mean loss) is kept.
log_loss(y_val, y_pre_proba, normalize=True)

# get_params is a method: call it to see the hyper-parameters actually in use
xgb.get_params()

# Model tuning
# Step 1: find the best number of boosting rounds (n_estimators)
scores_ne = []
n_estimators = [100, 200, 300, 400, 500, 550, 600, 700]

for nes in n_estimators:
    print("n_estimators:", nes)
    # `nthread` is a deprecated alias of `n_jobs`; the original passed both
    # with contradictory values (-1 and 4). A single n_jobs=4 keeps the
    # explicit thread cap. `seed` is likewise replaced by `random_state`.
    xgb = XGBClassifier(max_depth=3,
                        learning_rate=0.1,
                        n_estimators=nes,
                        objective="multi:softprob",
                        n_jobs=4,
                        min_child_weight=1,
                        subsample=1,
                        colsample_bytree=1,
                        random_state=42)

    xgb.fit(x_train_pca, y_train)
    y_pre = xgb.predict_proba(x_val_pca)
    score = log_loss(y_val, y_pre)
    scores_ne.append(score)

    print("每次测试的logloss值是:{}".format(score))

# Plot the validation log loss for every candidate
plt.plot(n_estimators, scores_ne, "o-")

plt.xlabel("n_estimators")
plt.ylabel("log_loss")
plt.show()

# Lower log loss is better, hence argmin
print("最优的n_estimators值是:{}".format(n_estimators[np.argmin(scores_ne)]))

# Step 2: find the best max_depth
scores_md = []
max_depths = [1, 3, 5, 6, 7]

# The best n_estimators from the previous sweep does not change inside the
# loop, so look it up once.
best_n_estimators = n_estimators[np.argmin(scores_ne)]

for md in max_depths:
    print("max_depth:", md)
    # `nthread`/`seed` are deprecated aliases of `n_jobs`/`random_state`; the
    # original passed both n_jobs=-1 and nthread=4, which contradict each
    # other. A single n_jobs=4 keeps the explicit thread cap.
    xgb = XGBClassifier(max_depth=md,
                        learning_rate=0.1,
                        n_estimators=best_n_estimators,
                        objective="multi:softprob",
                        n_jobs=4,
                        min_child_weight=1,
                        subsample=1,
                        colsample_bytree=1,
                        random_state=42)

    xgb.fit(x_train_pca, y_train)
    y_pre = xgb.predict_proba(x_val_pca)
    score = log_loss(y_val, y_pre)
    scores_md.append(score)

    print("每次测试的logloss值是:{}".format(score))

# Plot the validation log loss for every candidate
plt.plot(max_depths, scores_md, "o-")

plt.xlabel("max_depths")
plt.ylabel("log_loss")
plt.show()

# Lower log loss is better, hence argmin
print("最优的max_depths值是:{}".format(max_depths[np.argmin(scores_md)]))

# Final model with the chosen hyper-parameters
# `nthread` and `seed` are deprecated aliases; use n_jobs / random_state.
xgb = XGBClassifier(learning_rate=0.1,
                    n_estimators=550,
                    max_depth=3,
                    min_child_weight=3,
                    subsample=0.7,
                    colsample_bytree=0.7,
                    n_jobs=4,
                    random_state=42,
                    objective='multi:softprob')

# NOTE(review): the tuning loops were run on the PCA-reduced features
# (x_train_pca), but this model is fitted on the scaled, un-reduced features.
# Left as in the original; confirm that this is intended.
xgb.fit(x_train_scaled, y_train)

y_pre = xgb.predict_proba(x_val_scaled)

# `eps` is no longer accepted by scikit-learn's log_loss, so it is not passed.
print("测试数据的log_loss值为 : {}".format(log_loss(y_val, y_pre, normalize=True)))

lightGBM

主要基于以下方面优化，提升整体特性：
在这里插入图片描述

参数介绍

在这里插入图片描述

调参建议

lightGBM案例

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Load the data
iris = load_iris()
data = iris.data
target = iris.target

# Basic data processing: hold out 20% for evaluation
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

# Model training
# Baseline model
gbm = lgb.LGBMRegressor(objective="regression", learning_rate=0.05, n_estimators=20)

# LightGBM 4 removed the `early_stopping_rounds` argument of fit(); the
# early_stopping callback gives the same behaviour and also works on older
# releases. Training stops when the L1 metric on eval_set has not improved
# for 3 consecutive rounds.
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1",
        callbacks=[lgb.early_stopping(stopping_rounds=3)])
gbm.score(X_test, y_test)

在这里插入图片描述

# Train through a grid search
estimators = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    "learning_rate": [0.01, 0.1, 1],
    # Was misspelled "n_estmators", so the number of boosting rounds was
    # never actually varied by the search.
    "n_estimators": [20, 40, 60, 80]
}
# 5-fold cross-validation over every combination in param_grid
gbm = GridSearchCV(estimators, param_grid, cv=5)
gbm.fit(X_train, y_train)

在这里插入图片描述

# Best hyper-parameter combination found by the grid search above
gbm.best_params_

{'learning_rate': 0.1, 'n_estmators': 20}

# Refit with the hyper-parameters reported by the grid search
gbm = lgb.LGBMRegressor(objective="regression", learning_rate=0.1, n_estimators=20)

# fit() no longer accepts `early_stopping_rounds` in LightGBM 4; use the
# early_stopping callback (stop after 3 rounds without L1 improvement).
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="l1",
        callbacks=[lgb.early_stopping(stopping_rounds=3)])
gbm.score(X_test, y_test)

在这里插入图片描述

《绝地求生》玩家排名预测

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
train = pd.read_csv("./data/train_V2.csv")

# Basic data processing
# Missing-value handling
# Per-column flag, True for each column that contains at least one NaN.
# (np.any(train.isnull()) collapses everything into a single boolean and
# cannot show which column is affected; the article reports that only
# winPlacePerc is.)
train.isnull().any()

# Inspect the rows whose target is missing
train[train["winPlacePerc"].isnull()]

# Drop them by condition rather than by the hard-coded row label 2744604,
# so the step still works if the file or its row order changes.
train = train.dropna(subset=["winPlacePerc"])

# Feature normalisation
# Number of players taking part in each match
count = train.groupby("matchId")["matchId"].transform("count")

train["playersJoined"] = count
train["playersJoined"].sort_values()

plt.figure(figsize=(20, 8))
# Pass the Series as `x` explicitly: in recent seaborn releases the first
# positional argument is `data`, and the other parameters are keyword-only.
sns.countplot(x=train["playersJoined"])
plt.grid()
plt.show()

在这里插入图片描述

# Same count plot, restricted to matches with at least 75 players
plt.figure(figsize=(20, 8))
# `x=` is required in recent seaborn releases, where the first positional
# argument is `data`.
sns.countplot(x=train[train["playersJoined"] >= 75]["playersJoined"])
plt.grid()
plt.show()

在这里插入图片描述

# Scale selected features by match size
# The factor is 1 for a 100-player match and grows as fewer players joined.
scale = (100 - train["playersJoined"]) / 100 + 1
for feature in ("kills", "damageDealt", "maxPlace", "matchDuration"):
    train[feature + "Norm"] = train[feature] * scale

# Compare the normalised features with the raw ones on the first 11 rows
to_show = [
    'Id',
    'kills', 'killsNorm',
    'damageDealt', 'damageDealtNorm',
    'maxPlace', 'maxPlaceNorm',
    'matchDuration', 'matchDurationNorm',
]
train[to_show].iloc[0:11]

# Combine related variables: healing items plus boost items used
train["healsandboosts"] = train["heals"].add(train["boosts"])

# Outlier handling
# Players who scored kills without moving at all
train["totalDistance"] = train["rideDistance"] + train["walkDistance"] + train["swimDistance"]
train["killwithoutMoving"] = (train["kills"] > 0) & (train["totalDistance"] == 0)
train.drop(train.loc[train["killwithoutMoving"]].index, inplace=True)

# Strict upper bounds: more than 10 road kills, more than 30 kills in a match
for column, limit in (("roadKills", 10), ("kills", 30)):
    train.drop(train.loc[train[column] > limit].index, inplace=True)

# Suspicious headshot rate: every kill a headshot, with more than 9 kills.
# Rows where the division yields NaN get a rate of 0.
train["headshot_rate"] = (train["headshotKills"] / train["kills"]).fillna(0)
all_headshots = (train["headshot_rate"] == 1) & (train["kills"] > 9)
train.drop(train.loc[all_headshots].index, inplace=True)

# Inclusive upper bounds, applied in order: longest kill distance, travel
# distances, weapons picked up, healing items used
for column, limit in (
    ("longestKill", 1000),
    ("walkDistance", 10000),
    ("rideDistance", 20000),
    ("swimDistance", 20000),
    ("weaponsAcquired", 80),
    ("heals", 80),
):
    train.drop(train.loc[train[column] >= limit].index, inplace=True)

# Categorical features
# One-hot encode the match type
train["matchType"].unique()
train = pd.get_dummies(train, columns=["matchType"])
matchType_encoding = train.filter(regex="matchType")

# Replace groupId and matchId with integer category codes
id_columns = ["groupId", "matchId"]
for id_column in id_columns:
    train[id_column] = train[id_column].astype("category")
    train[id_column + "_cat"] = train[id_column].cat.codes
# The raw identifiers are no longer needed once the codes exist
train.drop(columns=id_columns, inplace=True)

# Work on a random subset of 100000 rows
df_sample = train.sample(100000)

# Target first, then the feature matrix (target and row identifier removed)
y = df_sample["winPlacePerc"]
df = df_sample.drop(columns=["winPlacePerc", "Id"])

# Hold out 20% of the sample for validation
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2)

# Model training and evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# First attempt: random forest on all features
rf_params = {
    "n_estimators": 40,
    "min_samples_leaf": 3,
    "max_features": 'sqrt',
    "n_jobs": -1,
}
m1 = RandomForestRegressor(**rf_params)
m1.fit(X_train, y_train)

# Score and mean absolute error on the validation split
y_pre = m1.predict(X_valid)
m1.score(X_valid, y_valid)
mean_absolute_error(y_valid, y_pre)

# Second random-forest pass: start by ranking features by importance
m1.feature_importances_
imp_df = pd.DataFrame(
    {"cols": df.columns, "imp": m1.feature_importances_}
).sort_values("imp", ascending=False)
# Horizontal bar chart of the 20 most important features
imp_df.head(20).plot(x="cols", y="imp", figsize=(20, 8), kind="barh")

在这里插入图片描述

# Keep only features whose importance exceeds 0.005 and retrain
to_keep = imp_df.loc[imp_df["imp"] > 0.005, "cols"]
df_keep = df[to_keep]
X_train, X_valid, y_train, y_valid = train_test_split(df_keep, y, test_size=0.2)

m2 = RandomForestRegressor(
    n_estimators=40,
    min_samples_leaf=3,
    max_features='sqrt',
    n_jobs=-1,
)
m2.fit(X_train, y_train)

# Score and mean absolute error on the new validation split
y_pre = m2.predict(X_valid)
m2.score(X_valid, y_valid)
mean_absolute_error(y_valid, y_pre)

# Train with lightGBM, again on the full feature set
X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2)

# First attempt
import lightgbm as lgb
gbm = lgb.LGBMRegressor(objective="regression", num_leaves=31, learning_rate=0.05, n_estimators=20)

# LightGBM 4 removed `early_stopping_rounds` from fit(); the early_stopping
# callback is the supported equivalent (stop after 5 rounds without L1
# improvement on the validation set).
gbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="l1",
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

# Predict with the iteration that early stopping selected
y_pre = gbm.predict(X_valid, num_iteration=gbm.best_iteration_)
mean_absolute_error(y_valid, y_pre)

# Second tuning round: grid search over learning rate and number of trees
from sklearn.model_selection import GridSearchCV

estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = dict(
    learning_rate=[0.01, 0.1, 1],
    n_estimators=[40, 60, 80, 100, 200, 300],
)

# 5-fold cross-validation, candidates evaluated in parallel
gbm = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
gbm.fit(X_train, y_train)

# Validation MAE of the search result, then the winning combination
y_pre = gbm.predict(X_valid)
mean_absolute_error(y_valid, y_pre)
gbm.best_params_

# Third tuning round
# n_estimators

scores = []
n_estimators = [100, 300, 500, 800]

for nes in n_estimators:
    lgbm = lgb.LGBMRegressor(boosting_type='gbdt',
                             num_leaves=31,
                             max_depth=5,
                             learning_rate=0.1,
                             n_estimators=nes,
                             min_child_samples=20,
                             n_jobs=-1)

    # fit() no longer accepts `early_stopping_rounds` in LightGBM 4; a fresh
    # early_stopping callback is created for every fit.
    lgbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="l1",
             callbacks=[lgb.early_stopping(stopping_rounds=5)])

    y_pre = lgbm.predict(X_valid)

    mae = mean_absolute_error(y_valid, y_pre)

    scores.append(mae)
    print("本次结果输出的mae值是:\n", mae)

# Validation MAE per candidate; lower is better, hence argmin
plt.plot(n_estimators, scores, 'o-')
plt.ylabel("mae")
plt.xlabel("n_estimator")
print("best n_estimator {}".format(n_estimators[np.argmin(scores)]))

在这里插入图片描述

# max_depth

scores = []
max_depth = [3, 5, 7, 9, 11]

for md in max_depth:
    lgbm = lgb.LGBMRegressor(boosting_type='gbdt',
                             num_leaves=31,
                             max_depth=md,
                             learning_rate=0.1,
                             n_estimators=500,
                             min_child_samples=20,
                             n_jobs=-1)

    # fit() no longer accepts `early_stopping_rounds` in LightGBM 4; a fresh
    # early_stopping callback is created for every fit.
    lgbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="l1",
             callbacks=[lgb.early_stopping(stopping_rounds=5)])

    y_pre = lgbm.predict(X_valid)

    mae = mean_absolute_error(y_valid, y_pre)

    scores.append(mae)
    print("本次结果输出的mae值是:\n", mae)

# Validation MAE per candidate; lower is better, hence argmin
plt.plot(max_depth, scores, 'o-')
plt.ylabel("mae")
plt.xlabel("max_depths")
print("best max_depths {}".format(max_depth[np.argmin(scores)]))

在这里插入图片描述

# Validation MAE recorded for each max_depth candidate, in the order tried
scores

[0.058867698663447106,
0.0566209902947507,
0.05695850296967709,
0.057414793402343275,
0.0571923061736829]

机器学习——集成学习进阶

目录

XGBoost

目标函数

XGBoost回归树构建方法

XGBoost和GBDT的区别

api介绍

XGBoost案例

otto案例——xgboost实现

lightGBM

参数介绍

lightGBM案例

《绝地求生》玩家排名预测