import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
# Load the water-quality dataset; the first CSV column is the row index.
data = pd.read_csv('water_data.csv', index_col=0)
# Coerce the sensor columns to numeric; unparseable entries become NaN.
data['TN'] = pd.to_numeric(data['TN'], errors='coerce')
data['TEMP'] = pd.to_numeric(data['TEMP'], errors='coerce')
data['COND'] = pd.to_numeric(data['COND'], errors='coerce')
data['TURB'] = pd.to_numeric(data['TURB'], errors='coerce')
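# The coercion above silently turns unparseable readings into NaN; a quick
# per-column count (optional sketch) shows how many entries are affected:
print(data[['TN', 'TEMP', 'COND', 'TURB']].isna().sum())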
# 'lable' is the class column as it is (mis)spelled in the CSV; shift the
# labels from 1..6 to 0..5, since LightGBM expects 0-based class indices.
data['lable'] = data['lable'] - 1
# Chronological split: first 3600 rows for training, the rest held out.
data_train = data[:3600]
data_test = data[3600:]
X_train = data_train.iloc[:, :9]   # the first nine columns are the features
y_train = data_train['lable']
X_validation = data_test.iloc[:, :9]
y_validation = data_test['lable']
# X_train, X_validation, y_train, y_validation = train_test_split(data.iloc[:, :-1], data.iloc[:, -1], test_size=0.2, random_state=1234)
train_x = X_train
train_y = y_train   # keep as a Series: lightgbm.Dataset expects a 1-D label
test = X_validation
train = pd.concat([train_x, train_y], axis=1)
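# Sanity check on the chronological split (sketch): the macro-F1 computed
# below is only meaningful if both halves contain all six classes in
# roughly similar proportions.
print(y_train.value_counts(normalize=True).sort_index())
print(y_validation.value_counts(normalize=True).sort_index())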
params = {'num_leaves': 40,
          'min_data_in_leaf': 30,
          'objective': 'multiclass',
          'num_class': 6,
          'max_depth': -1,
          'learning_rate': 0.05,
          'min_sum_hessian_in_leaf': 6,
          'boosting': 'gbdt',
          'feature_fraction': 0.9,
          'bagging_freq': 1,
          'bagging_fraction': 0.9,
          'bagging_seed': 11,
          'lambda_l1': 0.1,
          'verbosity': -1,
          'nthread': 15,
          'metric': 'multi_logloss',
          'random_state': 2019,
          # 'device': 'gpu'
          }
features = X_train.columns
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
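# With imbalanced water-quality classes, the already-imported StratifiedKFold
# would keep per-fold label proportions; a drop-in alternative (sketch):
# folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
# ...and iterate with folds.split(train_x, train_y) below.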
prob_oof = np.zeros((train_x.shape[0], 6))      # out-of-fold class probabilities
test_pred_prob = np.zeros((test.shape[0], 6))   # test probabilities averaged over folds
num_round = 100
## train and predict
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x)):
    print("fold {}".format(fold_ + 1))
    trn_data = lightgbm.Dataset(train_x.iloc[trn_idx], label=train_y.iloc[trn_idx])
    val_data = lightgbm.Dataset(train_x.iloc[val_idx], label=train_y.iloc[val_idx])
    # record_evaluation captures the eval history needed for plot_metric below;
    # the callbacks replace the verbose_eval / early_stopping_rounds keyword
    # arguments removed in LightGBM 4.x.
    evals_result = {}
    clf = lightgbm.train(params,
                         trn_data,
                         num_round,
                         valid_sets=[trn_data, val_data],
                         valid_names=['train', 'valid'],
                         callbacks=[lightgbm.log_evaluation(20),
                                    lightgbm.early_stopping(60),
                                    lightgbm.record_evaluation(evals_result)])
    prob_oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    test_pred_prob += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
# The loop already averaged the five folds' test probabilities; take the most
# probable class per sample.
result = np.argmax(test_pred_prob, axis=1)
y_pred = result
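# The out-of-fold probabilities give a cross-validated estimate of performance
# before touching the hold-out set (sketch):
oof_pred = np.argmax(prob_oof, axis=1)
print("OOF macro-F1:", f1_score(train_y, oof_pred, average='macro'))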
# Visualize the first tree of the last fold's booster. create_tree_digraph
# returns a graphviz Digraph; assign it and call .render() (or display it in
# a notebook) to actually view it.
graph = lightgbm.create_tree_digraph(clf, tree_index=0, precision=3,
                                     orientation='horizontal')
lightgbm.plot_tree(clf, tree_index=0, precision=3, orientation='horizontal')
# Training curves. plot_metric expects the recorded eval history (or an
# sklearn-API model), not a raw Booster, so pass the evals_result dict
# captured by the record_evaluation callback.
lightgbm.plot_metric(evals_result,
                     metric='multi_logloss',
                     title='Metric during training',
                     xlabel='Iterations',
                     grid=True)
# Feature importance ranking (split counts) for the last fold's booster.
lightgbm.plot_importance(clf,
                         height=0.2,
                         title='Feature importance',
                         xlabel='Feature importance',
                         ylabel='Features',
                         importance_type='split',
                         grid=True,
                         precision=3)
# Split-value histogram: the second argument must be a single feature
# (index or name), not the whole feature list.
lightgbm.plot_split_value_histogram(clf,
                                    features[0],
                                    width_coef=0.8,
                                    xlabel='Feature split value',
                                    ylabel='Count',
                                    grid=True)
# Feature importance ranking as a horizontal bar chart.
fea_ = clf.feature_importance()
fea_name = features
plt.figure(figsize=(10, 10))
plt.barh(fea_name, fea_, height=0.5)
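# Averaging the per-fold importances collected in feature_importance_df is
# usually more stable than the last fold alone (sketch):
mean_imp = (feature_importance_df.groupby('Feature')['importance']
            .mean().sort_values(ascending=False))
print(mean_imp)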
# Model evaluation on the hold-out set; sklearn metrics take y_true first.
f1 = f1_score(y_validation, y_pred, average='macro')
print("f1 =", f1)
acc = accuracy_score(y_validation, y_pred)
print("acc =", acc)
# Confusion matrix plot. Note: reads the module-level 'classes' list defined
# below; the global is only resolved when the function is called.
def plot_confusion_matrix(cm, savename, title='Confusion Matrix'):
    plt.figure(figsize=(12, 8), dpi=100)
    np.set_printoptions(precision=2)
    # annotate each cell of the confusion matrix with its count
    ind_array = np.arange(len(classes))
    x, y = np.meshgrid(ind_array, ind_array)
    for x_val, y_val in zip(x.flatten(), y.flatten()):
        c = cm[y_val][x_val]
        if c > 0.001:
            plt.text(x_val, y_val, "%0.0f" % (c,), color='red',
                     fontsize=15, va='center', ha='center')
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.binary)
    plt.title(title)
    plt.colorbar()
    xlocations = np.array(range(len(classes)))
    plt.xticks(xlocations, classes, rotation=90)
    plt.yticks(xlocations, classes)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # offset the minor ticks so grid lines fall between cells
    tick_marks = np.array(range(len(classes))) + 0.5
    plt.gca().set_xticks(tick_marks, minor=True)
    plt.gca().set_yticks(tick_marks, minor=True)
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)
    # save and show the confusion matrix
    plt.savefig(savename, format='png')
    plt.show()
classes = ['I', 'II', 'III', 'IV', 'V', '劣V']   # '劣V': worse than class V
y_true = y_validation
# Compute the confusion matrix on the hold-out set and plot it.
cm = confusion_matrix(y_true, y_pred)
plot_confusion_matrix(cm, 'confusion_matrix.png', title='confusion matrix')
# Assemble sample index, true label, and predicted label side by side
# (note this rebinds 'result', previously the argmax array).
y_true_index = pd.DataFrame(list(y_true.index))
y_true_list = pd.DataFrame(list(y_true))
y_pred_list = pd.DataFrame(list(y_pred))
result = pd.concat([y_true_index, y_true_list], axis=1)
result = pd.concat([result, y_pred_list], axis=1)
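# Naming the columns and persisting the aligned predictions; a minimal sketch,
# where both the column names and 'predictions.csv' are assumed, not from the
# original script:
result.columns = ['sample_index', 'y_true', 'y_pred']
result.to_csv('predictions.csv', index=False)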