# lightGBM 分类实战 — LightGBM multiclass classification walk-through

import pandas as pd
import numpy as np
import os
import warnings
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold
from tqdm import tqdm
import lightgbm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


# Load the water-quality dataset and build a fixed train/validation split.
# NOTE: the label column is spelled 'lable' in the source CSV — the typo must
# be kept, since it is the actual column name on disk.
data = pd.read_csv('water_data.csv', index_col=0)

# Coerce known-numeric columns; unparseable entries become NaN.
# pd.to_numeric on the whole Series is vectorized — the original applied it
# element-wise via Series.apply, which is equivalent but much slower.
for col in ['TN', 'TEMP', 'COND', 'TURB']:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Shift labels from 1..6 down to 0..5 as LightGBM multiclass expects.
data['lable'] = data['lable'] - 1

# Deterministic positional split: first 3600 rows train, the rest validate.
data_train = data[:3600]
data_test = data[3600:]

X_train = data_train.iloc[:, :9]   # first 9 columns are the features
y_train = data_train['lable']

X_validation = data_test.iloc[:, :9]
y_validation = data_test['lable']

# Aliases consumed by the cross-validation loop below; the label is wrapped
# in a DataFrame so it can be concatenated column-wise with the features.
train_x = X_train
train_y = pd.DataFrame(y_train)
test = X_validation
train = pd.concat([train_x, train_y], axis=1)

# LightGBM hyper-parameters for 6-class classification,
# grouped: task / boosting / tree shape / sampling / regularization / runtime.
params = dict(
    objective='multiclass',
    num_class=6,
    boosting='gbdt',
    metric='multi_logloss',
    num_leaves=40,
    max_depth=-1,                  # -1 = no depth limit
    learning_rate=0.05,
    min_data_in_leaf=30,
    min_sum_hessian_in_leaf=6,
    feature_fraction=0.9,          # column subsampling per tree
    bagging_fraction=0.9,          # row subsampling ...
    bagging_freq=1,                # ... performed every iteration
    bagging_seed=11,
    lambda_l1=0.1,
    nthread=15,
    verbosity=-1,
    random_state=2019,
    # device='gpu'
)

# ---- 5-fold cross-validated LightGBM training ----
features = X_train.columns
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
# Out-of-fold class probabilities for every training row, shape (n_train, 6).
prob_oof = np.zeros((train_x.shape[0], 6))
# Test-set probabilities averaged over the 5 fold models, shape (n_test, 6).
test_pred_prob = np.zeros((test.shape[0], 6))
# Maximum boosting rounds per fold (early stopping may use fewer).
num_round=100
## train and predict
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print("fold {}".format(fold_ + 1))
    # NOTE(review): train_y is a single-column DataFrame here; LightGBM
    # accepts it but a 1-D Series/array is the documented label shape — confirm.
    trn_data = lightgbm.Dataset(train_x.iloc[trn_idx], label=train_y.iloc[trn_idx])
    val_data = lightgbm.Dataset(train_x.iloc[val_idx], label=train_y.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds were removed from
    # lightgbm.train in LightGBM 4.x; on modern versions use
    # callbacks=[log_evaluation(20), early_stopping(60)] instead — confirm
    # against the installed LightGBM version.
    clf = lightgbm.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=20,
                    early_stopping_rounds=60)
    # Out-of-fold prediction for this fold's validation rows, using the best
    # iteration found by early stopping.
    prob_oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)


    # Collect per-fold split-count feature importances for later inspection.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # Average each fold model's test predictions into the ensemble estimate.
    test_pred_prob += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# Final predicted class = argmax over the averaged probabilities (labels 0..5).
# Note: after the loop, `clf` holds only the LAST fold's model.
result = np.argmax(test_pred_prob, axis=1)
y_pred=result




# Render the first tree of the last fold's booster as a graphviz digraph
# (3-decimal precision, root on the left). Requires the graphviz package.
lightgbm.create_tree_digraph(clf, tree_index=0, show_info=None,
                             precision=3, orientation='horizontal')



# Same tree rendered through matplotlib instead of graphviz output.
lightgbm.plot_tree(clf, ax=None, tree_index=0, figsize=None, dpi=None,
                   show_info=None, precision=3, orientation='horizontal')

###
### Plot the recorded evaluation metric over boosting iterations.
# NOTE(review): lightgbm.plot_metric documents its first argument as a dict
# of recorded eval results (from a record_evaluation callback) or a fitted
# sklearn-API model; passing a raw Booster like `clf` may raise TypeError —
# confirm against the installed LightGBM version.
lightgbm.plot_metric(clf, 
                     metric=None, 
                     dataset_names=None, 
                     ax=None, xlim=None, 
                     ylim=None, 
                     title='Metric during training', 
                     xlabel='Iterations',
                     ylabel='auto', 
                     figsize=None, 
                     dpi=None, 
                     grid=True)

#特征重要度排序
# Feature importance ranking (split counts) for the last fold's booster.
lightgbm.plot_importance(clf, ax=None, height=0.2, xlim=None, ylim=None,
                         title='Feature importance',
                         xlabel='Feature importance', ylabel='Features',
                         importance_type='split', max_num_features=None,
                         ignore_zero=True, figsize=None, dpi=None,
                         grid=True, precision=3)



# Split-value histograms, one figure per feature.
# BUG FIX: plot_split_value_histogram's `feature` argument must be a single
# feature name or index; the original passed the whole `features` Index,
# which is rejected by Booster.get_split_value_histogram.
for feat in features:
    try:
        lightgbm.plot_split_value_histogram(clf,
                                            feat,
                                            bins=None,
                                            ax=None,
                                            width_coef=0.8,
                                            xlim=None,
                                            ylim=None,
                                            title='Split value histogram for feature with @index/name@ @feature@',
                                            xlabel='Feature split value',
                                            ylabel='Count',
                                            figsize=None,
                                            dpi=None,
                                            grid=True)
    except ValueError:
        # Best effort: a feature that was never used in any split has no
        # histogram; skip it rather than abort the whole plotting pass.
        print("skip split-value histogram: feature '{}' unused in splits".format(feat))



# 特征重要度排序
fea_ = clf.feature_importance()
fea_name = features
plt.figure(figsize=(10, 10))
plt.barh(fea_name,fea_,height =0.5)


# 模型评价
f1 = f1_score( y_pred,y_validation, average='macro')
print("f1=", f1)
acc = accuracy_score(y_pred, y_validation)
print("acc", acc)


#混淆矩阵
#混淆矩阵 (confusion matrix plot)
def plot_confusion_matrix(cm, savename, title='Confusion Matrix', class_names=None):
    """Render a confusion matrix as an annotated heat-map and save it as PNG.

    Parameters:
        cm: square array-like; rows = actual labels, columns = predicted labels.
        savename: output file path (written in PNG format).
        title: figure title.
        class_names: axis tick labels, one per class. Defaults to the
            module-level ``classes`` list, which the original implementation
            read implicitly — the parameter makes that dependency explicit
            while keeping existing callers working.
    """
    # Backward compatible fallback to the module-level label list.
    if class_names is None:
        class_names = classes

    plt.figure(figsize=(12, 8), dpi=100)
    np.set_printoptions(precision=2)

    # Write each non-trivial cell count into its grid position.
    ind_array = np.arange(len(class_names))
    x, y = np.meshgrid(ind_array, ind_array)
    for x_val, y_val in zip(x.flatten(), y.flatten()):
        c = cm[y_val][x_val]
        if c > 0.001:  # skip empty cells so the plot stays readable
            plt.text(x_val, y_val, "%0.0f" % (c,), color='red', fontsize=15, va='center', ha='center')

    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.binary)
    plt.title(title)
    plt.colorbar()
    xlocations = np.array(range(len(class_names)))
    plt.xticks(xlocations, class_names, rotation=90)
    plt.yticks(xlocations, class_names)
    plt.ylabel('Actual label')
    plt.xlabel('Predict label')

    # Offset minor ticks by half a cell so grid lines fall between cells.
    tick_marks = np.array(range(len(class_names))) + 0.5
    plt.gca().set_xticks(tick_marks, minor=True)
    plt.gca().set_yticks(tick_marks, minor=True)
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)

    # Save first, then show — show() can clear the figure on some backends.
    plt.savefig(savename, format='png')
    plt.show()
    
# Water-quality grade names for classes 0..5 ('劣V' = worse than grade V).
classes = ['I', 'II', 'III', 'IV', 'V', '劣V']

y_true = y_validation
# 获取混淆矩阵 → build the hold-out confusion matrix, then plot and save it.
cm = confusion_matrix(y_true, y_pred)
plot_confusion_matrix(cm, 'confusion_matrix.png', title='confusion matrix')


# Assemble an inspection table: original row index, true label, predicted label.
y_true_index = pd.DataFrame(list(y_true.index))
y_true_list = pd.DataFrame(list(y_true))
y_pred_list = pd.DataFrame(list(y_pred))
# One three-way column concat instead of two chained ones — same result.
result = pd.concat([y_true_index, y_true_list, y_pred_list], axis=1)

在这里插入图片描述
在这里插入图片描述

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值