LightGBM 简单实现

1. 数据处理

# Import pandas and scikit-learn's train/test splitting helper
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the flights dataset
flights = pd.read_csv('flights.csv')
# Down-sample to 1% of the rows (fixed seed so the sample is reproducible)
flights = flights.sample(frac=0.01, random_state=10)
# Keep 10 feature columns plus the ARRIVAL_DELAY label column.
# .copy() makes the subset an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
flights = flights[["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE",
                   "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT",
                   "AIR_TIME", "DEPARTURE_TIME", "DISTANCE",
                   "ARRIVAL_DELAY"]].copy()
# Binarise the label: only an arrival delay of more than 10 minutes counts
# as delayed (1), everything else is 0.
# NOTE(review): a missing ARRIVAL_DELAY compares as False and is therefore
# labelled 0 -- confirm that is intended for such rows.
flights["ARRIVAL_DELAY"] = (flights["ARRIVAL_DELAY"] > 10).astype(int)
# Categorical feature columns
cat_cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT",
            "ORIGIN_AIRPORT"]
# Encode each categorical column as integer codes; the +1 shifts the codes
# so that missing values (code -1) become 0 and real categories start at 1.
for item in cat_cols:
    flights[item] = flights[item].astype("category").cat.codes + 1
# Split into 70% train / 30% test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    flights.drop(["ARRIVAL_DELAY"], axis=1),
    flights["ARRIVAL_DELAY"],
    random_state=10, test_size=0.3)
# Print the shapes of the resulting splits
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

2. 建模

# Import LightGBM and the AUC metric used to score the predictions
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Wrap the training split in LightGBM's Dataset container
dtrain = lgb.Dataset(X_train, label=y_train)
# NOTE(review): no "objective" is set, so LightGBM falls back to its default
# objective even though the label is binary -- consider "objective": "binary".
# NOTE(review): num_leaves (500) exceeds what max_depth=5 allows (2**5 = 32
# leaves), so max_depth is the effective limit here.
params = {
    "max_depth": 5,
    "learning_rate": 0.05,
    "num_leaves": 500,
    "n_estimators": 300,
}

# Columns LightGBM should treat as categorical features
cate_features_name = ["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE",
                      "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
# Fit the LightGBM model (the original passed the undefined name `d_train`)
model_lgb = lgb.train(params, dtrain,
                      categorical_feature=cate_features_name)
# Predict on the test split and report the AUC
y_pred = model_lgb.predict(X_test)
print('AUC of testset based on LightGBM:', roc_auc_score(y_test, y_pred))

3. 示例

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/3/11 9:33
# file: test.py

# 代码示例
import sys

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor,XGBRFRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import BayesianRidge, MultiTaskElasticNetCV, ElasticNetCV, HuberRegressor, TheilSenRegressor, \
    RANSACRegressor
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error
# Build a synthetic regression problem: 1000 samples, 5 features (all
# informative) and 2 target columns, with a fixed seed.
X, y = make_regression(
    n_samples=1000,
    n_features=5,
    n_informative=5,
    n_targets=2,
    random_state=1,
)
# Training subset: everything except the last 20 rows
X2, y2 = X[:-20], y[:-20]
print(X.shape, y.shape)

def get_scaler(data):
    """Return a MinMaxScaler that has been fitted on *data*."""
    # fit() returns the scaler itself, so the fitted instance is returned
    return MinMaxScaler().fit(data)
# scaler = get_scaler(data=X2)
# X2 = scaler.transform(X2)

# Candidate models. Exactly one assignment to `model` is active; the others
# are kept commented out as alternatives to try.

# Gradient boosting (GBDT)
# model = MultiOutputRegressor(GradientBoostingRegressor())

# AdaBoost
# model = MultiOutputRegressor(AdaBoostRegressor())

# LightGBM (the active choice), one regressor fitted per target column
model = MultiOutputRegressor(estimator=LGBMRegressor())

# Support vector regression
# model = MultiOutputRegressor(SVR())
# MLP neural network (linear)
# model = MLPRegressor()
# Bayesian ridge regression
# model = MultiOutputRegressor(BayesianRidge())
# Elastic net
# model = MultiTaskElasticNetCV(cv=5)
# model = MultiOutputRegressor(ElasticNetCV(cv=5))  # slightly better than the line above

# Robust regression (suited to data with outliers)
# model = MultiOutputRegressor(HuberRegressor())
# model = MultiOutputRegressor(TheilSenRegressor())
# model = RANSACRegressor()
# RANSAC copes better with large outliers in the y direction (the most
# common case). Theil-Sen copes better with medium-sized outliers in the X
# direction, but that property disappears in high-dimensional settings.

# Random forest
# model = RandomForestRegressor()
# model = XGBRFRegressor()
# XGBoost
# model = XGBRegressor()


# Train the model on the training subset
model.fit(X2, y2)
print(model.get_params())
# Build a printable copy of the parameters: None and exact str/float/int
# values are kept as they are, every other value is replaced by its str().
# The check is on the exact type, so subclasses such as bool are stringified.
ss = {
    name: value if value is None or type(value) in (str, float, int) else str(value)
    for name, value in model.get_params().items()
}
print(ss)
# Hand-written sample input.
# NOTE(review): data_in is never used below and has 10 values per row --
# confirm whether it is still needed.
data_in = [[-2.02220122, 0.31563495, 0.82797464, -0.30620401, 0.16003707, -1.44411381,
            0.87616892, -0.50446586, 0.23009474, 0.76201118]]
# Predict on the last 10 rows of X
new_X = X[-10:]
# new_X = scaler.transform(new_X)
yhat = model.predict(new_X)
# Summarise the results: predictions, ground truth, then mean absolute error
print("----------预测值-----------")
print(yhat)
print("----------真实值-----------")
print(y[-10:])
print("----------评估-----------")
print(mean_absolute_error(y[-10:], yhat))
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Tony Einstein

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值