# Approach 1: train LightGBM on the iris dataset via the native lgb.train API.
# NOTE: the assignment explicitly requires objective='regression' with
# {'l2', 'auc'} metrics, so predictions are continuous scores, not class
# labels, and the final score reported is the mean squared error.
import lightgbm as lgb
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier  # required by the assignment; unused in this native-API variant

# Load the iris data; features and labels live under separate keys.
iris = datasets.load_iris()
X = iris.data
Y = iris.target

# Split 70% train / 30% test as required.
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.3)

# Convert the splits into LightGBM's binary Dataset format for faster
# processing. The validation set references the training set so both use
# the same feature binning.
train_data = lgb.Dataset(train_x, train_y)
test_data = lgb.Dataset(test_x, test_y, reference=train_data)

# Training parameters exactly as the assignment specifies.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',      # gradient-boosted decision trees
    'objective': 'regression',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,      # fraction of features used per tree
    'bagging_fraction': 0.8,      # fraction of samples used per tree
}

# Train the model. FIX: the validation Dataset created above was previously
# never used — it is now passed via valid_sets so step 7 ("create validation
# data") actually takes effect during training.
model = lgb.train(params, train_data, num_boost_round=100,
                  valid_sets=[test_data])

# Predict continuous scores for the test set and report the final score (MSE).
pre = model.predict(test_x)
print('最终得分', mean_squared_error(test_y, pre))
# Approach 2: scikit-learn style API (LGBMRegressor)
# Approach 2: sklearn-style LGBMRegressor with early stopping, pickle
# persistence, feature-importance ranking, tree visualization, and a grid
# search over hyper-parameters.
import pickle
import warnings

import lightgbm as lgb
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

warnings.filterwarnings('ignore')

# FIX: load_boston was removed in scikit-learn 1.2 and now raises an error;
# the California housing data is the replacement the sklearn docs recommend.
housing = fetch_california_housing()
data = housing.data
target = housing.target
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

# Build and fit the model. FIX: `early_stopping_rounds` was removed as a
# fit() keyword in LightGBM 4.0 — early stopping is configured via the
# lgb.early_stopping callback instead.
gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31,
                        learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

# Persist the fitted model, then load it back. Context managers guarantee the
# file handles are closed; the .pkl extension reflects the binary pickle
# format (the old 'model.txt' name was misleading).
# NOTE: only unpickle files you trust — pickle can execute arbitrary code.
with open('model.pkl', 'wb') as f:
    pickle.dump(gbm, f)
with open('model.pkl', 'rb') as f:
    gbm = pickle.load(f)

# Predict on the held-out set using the best iteration found by early
# stopping, then report RMSE.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# Rank the features by importance and plot them as a horizontal bar chart.
feat_importance = pd.DataFrame({
    'column': gbm.feature_name_,
    'importance': list(gbm.feature_importances_),
})
feat_importance = feat_importance.sort_values(by='importance')
feat_importance.plot.barh(x='column', figsize=(10, 12))

# Visualize the last tree trained before early stopping kicked in.
lgb.create_tree_digraph(gbm, tree_index=gbm.best_iteration_ - 1)

# Grid search: try each candidate learning rate / estimator count, refit on
# the training data, and print the best parameter combination found.
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40],
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)