# pip install lightgbm==2.1.2
import lightgbm as lgb
import pandas as pd
from pandas import DataFrame
import gc
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
# Keep memory down by declaring column dtypes up front and manually
# garbage-collecting intermediates: a 1.22 GB CSV trains within ~2.2 GB RAM.
# String columns with many repeated values can be loaded as 'category'.
dtype_map = {"XH": 'category', "A": 'float32', "B": 'float32', "C": 'float32', "D": 'float32'}
frame = pd.read_csv('data.csv', dtype=dtype_map, encoding='gbk')
# Target y is the last column, features X are everything before it,
# and the first column (xh) is the grouping key used for stratification.
y = frame.iloc[:, -1]
X = frame.iloc[:, 0:-1]
xh = frame.iloc[:, 0]
del frame
gc.collect()
# Stratify on xh so train and validation keep the same group proportions.
X_datat, X_datav, y_train, y_valid = train_test_split(X, y, test_size=0.25, stratify=xh)
del X, y, xh
gc.collect()
# Drop the grouping column from both feature matrices, but keep a copy of
# it for the validation rows so predictions can be reported per group.
X_train = X_datat.iloc[:, 1:]
X_valid = X_datav.iloc[:, 1:]
X_valid_xh = X_datav.iloc[:, 0]
del X_datat, X_datav
gc.collect()
# num_threads controls training parallelism.
gbm = lgb.LGBMRegressor(objective='regression', learning_rate=0.3, n_estimators=50, num_threads=8)
# Memory-constrained alternative: shrink num_leaves (default 31), max_bin
# (default 255) or histogram_pool_size (default -1) to lower RAM usage, e.g.:
# gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.3, n_estimators=100, num_threads=2, histogram_pool_size=512, max_bin=25, two_round=True)
# Metric l1 is MAE (l2 would be MSE); stop after 50 rounds without improvement.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='l1',
    early_stopping_rounds=50,
    verbose=True,
)
# Plot the validation-set l1 (MAE) recorded at each boosting round.
results = gbm.evals_result_
epochs = len(results['valid_0']['l1'])
fig, ax = pyplot.subplots()
ax.plot(range(epochs), results['valid_0']['l1'], label='Test')
ax.legend()
pyplot.ylabel('Regres mae')
pyplot.title('lightgbm Regressor mae')
pyplot.show()
# Per-feature importance scores learned by the fitted model.
print('Feature importances:', list(gbm.feature_importances_))
# Persist the trained model to disk and reload it (round-trip check).
# NOTE(review): pickle is unsafe on untrusted files — only ever load
# model files you produced yourself.
import pickle

# Fix: the original passed bare open(...) handles to pickle and never
# closed them; context managers guarantee the files are closed.
with open("1001.model", "wb") as model_out:
    pickle.dump(gbm, model_out)
with open("1001.model", "rb") as model_in:
    gbm = pickle.load(model_in)
print('Start predicting...')
# Predict on the validation features at the best early-stopped iteration.
valid_pre = gbm.predict(X_valid, num_iteration=gbm.best_iteration_)
redf = DataFrame(valid_pre, columns=['pre'])
# Reset indices so group id, ground truth and prediction align row-wise.
l = pd.concat([X_valid_xh.reset_index(drop=True), y_valid.reset_index(drop=True), redf], axis=1)
l.to_csv('result_3.csv', index=False)
# Cross-validated grid search over learning rate and tree count.
# Fix: the original line here was the bare identifier "交叉验证" (a comment
# missing its leading '#'), which raised NameError at runtime, and
# GridSearchCV was used without ever being imported.
from sklearn.model_selection import GridSearchCV

estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)