Building a Neural Network for Multi-Class Classification with Keras

A quick write-up of building a neural network for a multi-class task with Keras (I'm working with structured/tabular data, which is simpler than image or NLP tasks).

  1. Import the packages used below to build the network and train the model
  2. Read the data (a structured-data competition, so the network is simpler than a multi-class image or NLP model)
  3. `features` holds the columns the model will be trained on
  4. `reduce_mem_usage` downcasts columns to reduce the DataFrame's memory usage
  5. Normalize the data
  6. Split into training and validation sets
  7. Build the multi-class neural network model
  8. Plot the training vs. validation accuracy and loss curves

Note:

  1. The training data must not contain any missing values, otherwise the loss may become NaN during training (see the sanity check after this list)
  2. Remember to normalize the features
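
To guard against the NaN-loss issue in note 1, a quick sanity check before calling `model.fit` (a minimal sketch; `df` and `features` are defined in the code below):

```python
# A single NaN anywhere in the features can drive the training loss to NaN,
# so assert that the imputation below actually removed them all.
assert not df[features].isna().any().any(), "NaNs remain in the feature columns"
```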
```python
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

tf.config.run_functions_eagerly(True)

# Fix the random seeds for reproducibility
from numpy.random import seed
seed(1)
tf.random.set_seed(2)

warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']   # render CJK labels in matplotlib
plt.rcParams['axes.unicode_minus'] = False
```

```python
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./pre_contest_test1.csv')
print(f'train shape: {train_df.shape}, test shape: {test_df.shape}')
# Fill missing values with the column means (numeric columns only)
train_df.fillna(train_df.mean(), inplace=True)
test_df.fillna(test_df.mean(), inplace=True)
df = pd.concat([train_df, test_df]).reset_index(drop=True)
# Training features: all columns except the id, the label, and a hand-picked drop list
features = [i for i in df.columns if i not in ['sample_id',
                                               'feature1','feature5','feature7','feature10','feature11','feature13','feature17','feature19','feature22','feature28',
                                               'feature34','feature35','feature45','feature46','feature47','feature58','feature59','feature62','feature67','feature69','feature70',
                                               'feature77','feature78','feature80','feature82','feature83','feature85','feature86','feature88','feature90','feature102','feature103',
                                               'label']]
```
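
The model further down hard-codes `input_shape=(73,)`, so it's worth confirming the drop list leaves exactly that many columns (a minimal sketch):

```python
# The Dense network below declares input_shape=(73,);
# the feature list must match that width exactly.
print(len(features))  # expected: 73
```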

```python
def reduce_mem_usage(df):
    """Downcast each feature column to the smallest dtype that holds its
    value range, to shrink the DataFrame's memory footprint."""
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Note: this iterates over the global `features` list, not all of df's columns
    for col in tqdm(features):
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

df = reduce_mem_usage(df)
```
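
One caveat with this downcasting: the min/max check guards against overflow, but float16 keeps only about three significant decimal digits, so high-precision values are silently rounded. A quick illustration:

```python
import numpy as np

# float16 stores ~3 significant decimal digits, so downcasting
# rounds high-precision values:
x = np.float64(0.123456789)
print(np.float16(x))             # ~0.1235 after rounding
# The range check in reduce_mem_usage does prevent overflow, though:
print(np.finfo(np.float16).max)  # 65504.0
```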

```python
# Min-max normalize each feature over the combined train+test frame
df[features] = df[features].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
# Rows with a missing label are the test set; the rest are training data
test = df[df['label'].isna()].reset_index(drop=True)[features]
train_x = df[~df['label'].isna()].reset_index(drop=True)[features]
train_y = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2,
                                                    random_state=7, stratify=train_y)

print("shape of Xtrain, Xtest:", X_train.shape, X_test.shape)
# One-hot encode the labels (6 classes)
y_train = to_categorical(y_train, 6)
y_test = to_categorical(y_test, 6)
```
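
Normalizing over the concatenated train+test frame leaks test-set statistics into training. For a stricter setup, a sketch using sklearn's `MinMaxScaler` fitted on the training split only (applied to the raw, unnormalized features):

```python
from sklearn.preprocessing import MinMaxScaler

# Fit the scaling parameters on the training rows only,
# then apply the identical transform to held-out data.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
test_scaled = scaler.transform(test)   # competition test set
```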

```python
# Model: a small fully-connected network for 6-class classification
model = Sequential()
model.add(Dense(128, activation="relu", input_shape=(73,)))  # 73 = len(features)
model.add(Dense(64, activation="relu"))
model.add(Dense(6, activation="softmax"))  # one probability per class
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Train
history = model.fit(X_train, y_train, batch_size=32, epochs=60,
                    validation_data=(X_test, y_test))
```
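
If 60 epochs overfit, a sketch adding early stopping via the built-in `tf.keras.callbacks.EarlyStopping` (the patience value here is an assumption, tune as needed):

```python
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',           # stop when validation loss stops improving
    patience=5,                   # epochs to wait before stopping (assumed value)
    restore_best_weights=True)    # roll back to the best epoch's weights

history = model.fit(X_train, y_train, batch_size=32, epochs=60,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stop])
```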

```python
# Plot training vs. validation accuracy and loss
epochs = range(len(history.history['accuracy']))
plt.figure()
plt.plot(epochs, history.history['accuracy'], 'b', label='Training acc')
plt.plot(epochs, history.history['val_accuracy'], 'r', label='Validation acc')
plt.title('Training and Validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, history.history['loss'], 'b', label='Training loss')
plt.plot(epochs, history.history['val_loss'], 'r', label='Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()
```

If you want to track F1 on the validation set after every epoch while training the multi-class model, take the following two steps.

Step 1: add a Macro-F1 callback. Macro-F1 computes an F1 score from each class's precision and recall, then averages across classes; it ignores any imbalance in the class distribution (see the toy comparison after the two callback variants below).

```python
# Callback that logs macro-averaged validation metrics each epoch
from sklearn.metrics import f1_score, recall_score, precision_score

class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_predict = np.argmax(self.model.predict(self.validation_data[0]), -1)
        val_targ = self.validation_data[1]
        # Convert one-hot targets back to class indices if needed
        if len(val_targ.shape) == 2 and val_targ.shape[1] != 1:
            val_targ = np.argmax(val_targ, -1)

        _val_f1 = f1_score(val_targ, val_predict, average='macro')
        _val_recall = recall_score(val_targ, val_predict, average='macro')
        _val_precision = precision_score(val_targ, val_predict, average='macro')

        logs['val_f1'] = _val_f1
        logs['val_recall'] = _val_recall
        logs['val_precision'] = _val_precision
        print(" — val_f1: %f " % (_val_f1))
```

Alternatively, use Micro-F1, which pools all samples regardless of class and computes the F1 score from the overall precision and recall:

```python
# Micro-averaged variant: identical to the callback above
# except for the average='micro' argument
from sklearn.metrics import f1_score, recall_score, precision_score

class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_predict = np.argmax(self.model.predict(self.validation_data[0]), -1)
        val_targ = self.validation_data[1]
        # Convert one-hot targets back to class indices if needed
        if len(val_targ.shape) == 2 and val_targ.shape[1] != 1:
            val_targ = np.argmax(val_targ, -1)

        _val_f1 = f1_score(val_targ, val_predict, average='micro')
        _val_recall = recall_score(val_targ, val_predict, average='micro')
        _val_precision = precision_score(val_targ, val_predict, average='micro')

        logs['val_f1'] = _val_f1
        logs['val_recall'] = _val_recall
        logs['val_precision'] = _val_precision
        print(" — val_f1: %f " % (_val_f1))
```
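
To see how the two averages diverge under class imbalance, a toy example with sklearn (made-up labels, purely illustrative):

```python
from sklearn.metrics import f1_score

# Imbalanced toy labels: class 0 dominates, class 1 is rare
y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

# Micro pools all samples, so the dominant class drives the score;
# macro averages per-class F1, so the rare class weighs equally.
print(f1_score(y_true, y_pred, average='micro'))  # 0.9
print(f1_score(y_true, y_pred, average='macro'))  # ~0.80
```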

Step 2: change the training call above.

From:

```python
history = model.fit(X_train, y_train, batch_size=32, epochs=60, validation_data=(X_test, y_test))
```

to:

```python
history = model.fit(X_train, y_train, batch_size=32, epochs=60,
                    validation_data=(X_test, y_test),
                    callbacks=[Metrics(valid_data=(X_test, y_test))])
```
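
Since the callback writes `val_f1` into `logs`, and tf.keras's built-in History callback records the epoch logs after user callbacks run (true in recent TF2 releases; treat this as an assumption for your version), the per-epoch F1 can then be plotted from `history`:

```python
# Plot the per-epoch validation F1 logged by the Metrics callback
plt.figure()
plt.plot(history.history['val_f1'], label='Validation F1')
plt.title('Validation F1 per epoch')
plt.legend()
plt.show()
```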

Reference:
https://blog.csdn.net/Reberkah/article/details/106620131
