Basic LightGBM Usage

# pip install lightgbm==2.1.2

import lightgbm as lgb
import pandas as pd
from pandas import DataFrame
import gc
from sklearn.model_selection import train_test_split
from matplotlib import pyplot  

# Setting column dtypes and running gc manually reduces memory usage:
# training on 1.22 GB of data peaked at about 2.2 GB of memory.
# String columns with many repeated values can be declared as 'category'.
column_types = {"XH": 'category', "A": 'float32', "B": 'float32', "C": 'float32', "D": 'float32'}
path = 'data.csv'
df = pd.read_csv(path, dtype=column_types, encoding='gbk')
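To verify the savings from the explicit dtypes, you can inspect the frame's memory footprint; a minimal sketch (deep=True also counts the actual string storage):

# Report the DataFrame's total memory usage in MB
print(df.memory_usage(deep=True).sum() / 1024 ** 2, 'MB')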

# Target column y, feature columns X, grouping column xh
y = df.iloc[:, -1]
X = df.iloc[:, 0:-1]
xh = df.iloc[:, 0]

del df
gc.collect()

# Stratify on xh so its group proportions match between the training and validation sets
X_datat, X_datav, y_train, y_valid = train_test_split(X, y, test_size=0.25, stratify=xh)

del y
del X
del xh
gc.collect()

X_train = X_datat.iloc[:, 1:]
X_valid = X_datav.iloc[:, 1:]
X_valid_xh = X_datav.iloc[:, 0]

del X_datat
del X_datav
gc.collect()

# num_threads sets the number of training threads
gbm = lgb.LGBMRegressor(objective='regression', learning_rate=0.3, n_estimators=50, num_threads=8)
# Defaults: num_leaves=31, max_bin=255, histogram_pool_size=-1 (unlimited);
# lower max_bin and cap histogram_pool_size to reduce memory usage
# gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.3, n_estimators=100, num_threads=2, histogram_pool_size=512, max_bin=25, two_round=True)
# eval_metric: 'l2' = MSE, 'l1' = MAE
gbm.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='l1', early_stopping_rounds=50, verbose=True)
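After fitting with early stopping, the sklearn wrapper records which boosting round was best; a minimal sketch reading those attributes:

# Round chosen by early stopping and the corresponding validation scores
print('Best iteration:', gbm.best_iteration_)
print('Best score:', gbm.best_score_)  # dict keyed by eval set name and metric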

# Plot the evaluation curve
results = gbm.evals_result_
epochs = len(results['valid_0']['l1'])
x_axis = range(0, epochs)

fig, ax = pyplot.subplots()
ax.plot(x_axis, results['valid_0']['l1'], label='Test')
ax.legend()
pyplot.ylabel('Regression MAE')
pyplot.title('LightGBM regressor MAE')
pyplot.show()

# Feature importances
print('Feature importances:', list(gbm.feature_importances_))
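LightGBM also ships a plotting helper for importances; a minimal sketch using lgb.plot_importance (reuses the matplotlib import above):

# Bar chart of the top-10 features by split count
lgb.plot_importance(gbm, max_num_features=10)
pyplot.show()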

# Save and load the model
import pickle
pickle.dump(gbm, open("1001.model", "wb"))
gbm = pickle.load(open("1001.model", "rb"))
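Besides pickle, the underlying Booster can be stored in LightGBM's native text format; a minimal sketch (the file name '1001.txt' is illustrative):

# Save the trained booster as a portable text file
gbm.booster_.save_model('1001.txt')
# Reload it as a raw Booster; it predicts without the sklearn wrapper
booster = lgb.Booster(model_file='1001.txt')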

print('Start predicting...')
# Predict on the validation set, using the best iteration found by early stopping
valid_pre = gbm.predict(X_valid, num_iteration=gbm.best_iteration_)
redf = DataFrame(valid_pre, columns=['pre'])
l = pd.concat([X_valid_xh.reset_index(drop=True), y_valid.reset_index(drop=True), redf], axis=1)
l.to_csv('result_3.csv', index=False)

Cross-validation (grid search)

from sklearn.model_selection import GridSearchCV

estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)
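Alternatively, LightGBM's built-in lgb.cv runs k-fold cross-validation for a single parameter set; a minimal sketch (parameter values are illustrative; stratified=False is required for regression targets):

# 5-fold CV with early stopping; returns the per-round mean/std of the metric
params = {'objective': 'regression', 'metric': 'l1', 'learning_rate': 0.1, 'num_leaves': 31}
cv_results = lgb.cv(params, lgb.Dataset(X_train, label=y_train),
                    num_boost_round=200, nfold=5, stratified=False,
                    early_stopping_rounds=20)
print('Best number of rounds:', len(cv_results['l1-mean']))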

Parameter tuning reference: https://www.cnblogs.com/bjwu/p/9307344.html
