# 1. 数据处理 (Data processing)
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the flight records and keep a reproducible 1% random sample of the rows.
flights = pd.read_csv('flights.csv').sample(frac=0.01, random_state=10)

# Keep only the model inputs plus the target column (target last).
flights = flights[[
    "MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER",
    "DESTINATION_AIRPORT", "ORIGIN_AIRPORT", "AIR_TIME",
    "DEPARTURE_TIME", "DISTANCE", "ARRIVAL_DELAY",
]]

# Binarise the target: 1 when ARRIVAL_DELAY is greater than 10, otherwise 0.
# NOTE(review): a NaN compares as False, so rows with a missing ARRIVAL_DELAY
# (if the file contains any) are labelled 0 - confirm that this is intended.
flights["ARRIVAL_DELAY"] = (flights["ARRIVAL_DELAY"] > 10) * 1

# Replace the string-valued columns with integer category codes. pandas codes
# start at 0 and use -1 for missing values, so the +1 shift keeps every
# encoded value non-negative.
cat_cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT",
            "ORIGIN_AIRPORT"]
for col in cat_cols:
    codes = flights[col].astype("category").cat.codes
    flights[col] = codes + 1

# 70/30 train/test split with a fixed seed so the split is repeatable.
features = flights.drop(columns=["ARRIVAL_DELAY"])
target = flights["ARRIVAL_DELAY"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=10)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# 2. 建模 (Modelling)
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Columns LightGBM should treat as categorical rather than ordinal.
cate_features_name = ["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE",
                      "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]

# The categorical columns are declared on the Dataset itself; this is the
# documented place for them and does not rely on train() forwarding the
# argument.
dtrain = lgb.Dataset(X_train, label=y_train,
                     categorical_feature=cate_features_name)

params = {
    # The label is 0/1, so train a binary classifier; without an explicit
    # objective LightGBM falls back to its default (regression).
    "objective": "binary",
    "max_depth": 5,
    "learning_rate": 0.05,
    # With max_depth=5 a tree can have at most 2**5 = 32 leaves, so this
    # limit never binds; value kept from the original configuration.
    "num_leaves": 500,
}

# 300 boosting rounds (previously passed via the "n_estimators" params alias).
model_lgb = lgb.train(params, dtrain, num_boost_round=300)

# predict() returns one score per test row; AUC only needs their ranking.
y_pred = model_lgb.predict(X_test)
print('AUC of testset based on LightGBM: ', roc_auc_score(y_test, y_pred))
# 3. 示例 (Example)
import sys
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor,XGBRFRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import BayesianRidge, MultiTaskElasticNetCV, ElasticNetCV, HuberRegressor, TheilSenRegressor, \
RANSACRegressor
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_absolute_error
# Synthetic regression problem: 1000 samples, 5 features (all informative)
# and 2 targets, generated with a fixed seed.
X, y = make_regression(
    n_samples=1000,
    n_features=5,
    n_informative=5,
    n_targets=2,
    random_state=1,
)

# Everything except the final 20 rows is used for fitting.
X2, y2 = X[:-20], y[:-20]
print(X.shape, y.shape)
def get_scaler(data):
    """Return a new MinMaxScaler that has been fitted on ``data``.

    Args:
        data: Passed unchanged to ``MinMaxScaler.fit``.

    Returns:
        The fitted ``MinMaxScaler`` instance.
    """
    # fit() hands back the estimator itself, so the call can be chained.
    return MinMaxScaler().fit(data)
# MultiOutputRegressor wraps the LGBMRegressor so it can be fitted on a
# multi-column target (y2 has two target columns here).
model = MultiOutputRegressor(LGBMRegressor())

# Notes on alternative robust regressors:
# - RANSAC copes better with large outliers in the y direction (the most
#   common case).
# - Theil-Sen copes better with medium-sized outliers in the X direction, but
#   that property disappears in high-dimensional settings.

model.fit(X2, y2)

# Show the raw parameters first, then a printable variant in which every
# value that is not None and not exactly a str/float/int is replaced by its
# str() form (so bools and nested estimators become strings).
fitted_params = model.get_params()
print(fitted_params)
ss = {
    name: value if value is None or type(value) in (str, float, int) else str(value)
    for name, value in fitted_params.items()
}
print(ss)

# NOTE(review): data_in is not used anywhere in the visible code, and it holds
# 10 values while the model above is fitted on 5 features - confirm whether it
# is still needed.
data_in = [[-2.02220122, 0.31563495, 0.82797464, -0.30620401, 0.16003707, -1.44411381,
            0.87616892, -0.50446586, 0.23009474, 0.76201118]]

# Predict on the last 10 rows; they lie inside the 20 rows left out of fitting.
new_X = X[-10:]
yhat = model.predict(new_X)
print("----------预测值-----------", yhat, sep="\n")
print("----------真实值-----------", y[-10:], sep="\n")
print("----------评估-----------")
print(mean_absolute_error(y[-10:], yhat))