简单记录利用Keras搭建神经网络做多分类任务(我用搭建的是结构化数据,相对于图像和NLP简单)
- 首先导入各种包,用于后期搭建网络和训练模型
- 读取数据(结构化比赛的数据,比图像和NLP的多分类的网络框架简单点)
- 选取 features 作为即将训练的特征
- 对特征做 reduce_mem_usage,降低数据内存占用
- 标准化数据
- 划分训练集和测试集
- 搭建多分类神经网络模型
- 画训练和验证集的auc和loss对比图
注意:
- 训练数据中不能有缺失值,不然训练可能导致loss为NAN
- 记得归一化
# --- Imports and global configuration ---
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

import tensorflow as tf
import tensorflow as F  # NOTE: kept because the callback classes below reference tensorflow via ``F``
# Public import path; the original used the private tensorflow.python.* path,
# which breaks across TF versions.
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop

from numpy.random import seed

# Run tf.function eagerly: easier debugging at the cost of training speed.
tf.config.run_functions_eagerly(True)

# Fix random seeds for reproducibility.
seed(1)
F.random.set_seed(2)

warnings.filterwarnings('ignore')

# Matplotlib: render CJK characters and minus signs correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# --- Load raw competition data ---
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./pre_contest_test1.csv')
print(f'训练数据情况:{train_df.shape}、测试数据情况:{test_df.shape}')

# Fill missing values with per-column means — NaNs in the training data would
# make the loss NaN.  numeric_only=True keeps this working on frames that also
# contain non-numeric columns (pandas >= 2 raises without it).
train_df.fillna(train_df.mean(numeric_only=True), inplace=True)
test_df.fillna(test_df.mean(numeric_only=True), inplace=True)

# Concatenate train and test so all preprocessing is applied consistently.
df = pd.concat([train_df, test_df]).reset_index(drop=True)

# Feature columns: everything except the id, the label, and a hand-picked drop list.
features = [i for i in df.columns if i not in ['sample_id',
'feature1','feature5','feature7','feature10','feature11','feature13','feature17','feature19','feature22','feature28',
'feature34','feature35','feature45','feature46','feature47','feature58','feature59','feature62','feature67','feature69','feature70',
'feature77','feature78','feature80','feature82','feature83','feature85','feature86','feature88','feature90','feature102','feature103',
'label']]
def reduce_mem_usage(df, cols=None):
    """Downcast numeric columns of *df* in place to the smallest dtype holding their range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    cols : iterable of str, optional
        Columns to process.  Defaults to the module-level ``features`` list,
        preserving the original script's behavior.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, returned for chaining.
    """
    if cols is None:
        # Backward-compatible default: the original implementation iterated the
        # module-global ``features`` list directly.
        cols = features
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in cols:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Pick the narrowest integer type whose range strictly contains the data.
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Floats: NOTE(review) float16 keeps only ~3 significant digits,
                # which may cost model accuracy — confirm this is acceptable.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # Non-numeric columns become pandas categoricals.
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
# Downcast numeric columns to shrink memory before training.
df = reduce_mem_usage(df)

# Min-max scale each feature to [0, 1]; unscaled inputs can make training diverge.
def _min_max(x):
    """Scale a Series to [0, 1]; constant columns map to 0 instead of NaN.

    The original x/(max-min) form yields NaN whenever max == min, which would
    make the training loss NaN — exactly what the notes above warn against.
    """
    rng = np.max(x) - np.min(x)
    return (x - np.min(x)) / (rng if rng else 1)

df[features] = df[features].apply(_min_max)

# Rows whose label is NaN came from test_df (it has no label values after concat).
test = df[df['label'].isna()].reset_index(drop=True)[features]
train_x = df[~df['label'].isna()].reset_index(drop=True)[features]
train_y = train_df['label']

# Stratified 80/20 train/validation split.
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2,
                                                    random_state=7, stratify=train_y)
print("shape of Xtrain, Xtest:",X_train.shape, X_test.shape)

# One-hot encode the 6 classes for categorical_crossentropy.
y_train = to_categorical(y_train, 6)
y_test = to_categorical(y_test, 6)
# 模型
model = Sequential()
model.add(Dense(128, activation = "relu", input_shape =(73,)))
model.add(Dense(64, activation = "relu"))
model.add(Dense(6, activation ="softmax"))
model.compile(loss='categorical_crossentropy',
optimizer='Adam',
metrics = ['accuracy'])
# 训练
history = model.fit(X_train,y_train, batch_size = 32, epochs=60,validation_data=(X_test, y_test))
# 画训练和验证集的auc和loss对比图
epochs=range(len(history.history['accuracy']))
plt.figure()
plt.plot(epochs,history.history['accuracy'],'b',label='Training acc')
plt.plot(epochs,history.history['val_accuracy'],'r',label='Validation acc')
plt.title('Traing and Validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs,history.history['loss'],'b',label='Training loss')
plt.plot(epochs,history.history['val_loss'],'r',label='Validation val_loss')
plt.title('Traing and Validation loss')
plt.legend()
如果想在训练多分类模型时使用F1验证每轮的效果,做以下两步:
第一步:加入Macro-F1: 根据每一个类别的准召率计算F1值,然后求均值. 忽略了样本间分布出现的不平衡问题.
# 这是macro评价代码
from sklearn.metrics import f1_score,recall_score, precision_score
class Metrics(F.keras.callbacks.Callback):
    """Keras callback: log macro-averaged F1/recall/precision on held-out data each epoch."""

    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        # (inputs, targets) pair evaluated at the end of every epoch.
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        inputs, targets = self.validation_data[0], self.validation_data[1]
        predictions = np.argmax(self.model.predict(inputs), -1)
        # Collapse one-hot targets to class indices when needed.
        if len(targets.shape) == 2 and targets.shape[1] != 1:
            targets = np.argmax(targets, -1)
        logs['val_f1'] = f1_score(targets, predictions, average='macro')
        logs['val_recall'] = recall_score(targets, predictions, average='macro')
        logs['val_precision'] = precision_score(targets, predictions, average='macro')
        print(" — val_f1: %f " % (logs['val_f1']))
        return
或者:Micro-F1: 不区分类别,直接用总体样本的准召率计算F1-score.
# 这是micro评价代码
from sklearn.metrics import f1_score,recall_score, precision_score
class Metrics(F.keras.callbacks.Callback):
    """Keras callback: log micro-averaged F1/recall/precision on held-out data each epoch."""

    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        # (inputs, targets) pair evaluated at the end of every epoch.
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_x = self.validation_data[0]
        val_y = self.validation_data[1]
        y_pred = np.argmax(self.model.predict(val_x), -1)
        # One-hot labels -> class indices.
        if len(val_y.shape) == 2 and val_y.shape[1] != 1:
            val_y = np.argmax(val_y, -1)
        # Same three metrics, computed via a small dispatch table.
        for key, metric in (('val_f1', f1_score),
                            ('val_recall', recall_score),
                            ('val_precision', precision_score)):
            logs[key] = metric(val_y, y_pred, average='micro')
        print(" — val_f1: %f " % (logs['val_f1']))
        return
第二步:修改上面的训练代码
由:
history = model.fit(X_train,y_train, batch_size = 32, epochs=60,validation_data=(X_test, y_test))
修改成:
```python
history = model.fit(X_train, y_train, batch_size=32, epochs=60, validation_data=(X_test, y_test), callbacks=[Metrics(valid_data=(X_test, y_test))])
```
参考:
https://blog.csdn.net/Reberkah/article/details/106620131