Feature importance using permutation importance:
LSTM Feature Importance | Kaggle
第一步:定义特征:
示例:
def add_features(df):
    """Engineer per-breath features for the ventilator-pressure data.

    Expects columns: ``breath_id``, ``time_step``, ``u_in``, ``u_out``,
    ``R``, ``C``.  Adds cumulative, lag/lead (1..4 steps), per-breath
    aggregate, interaction, and one-hot encoded (R, C, R__C) features.
    NaNs introduced by shifting are filled with 0.

    Note: mutates ``df`` in place for most columns, then returns a new
    frame produced by ``fillna``/``get_dummies`` — use the return value.
    """
    # Running integral of u_in over time within each breath.
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()

    # Lag (shift +k) and lead (shift -k) features for k = 1..4,
    # in the same column order as the hand-unrolled original.
    grouped = df.groupby('breath_id')
    for k in range(1, 5):
        df[f'u_in_lag{k}'] = grouped['u_in'].shift(k)
        df[f'u_out_lag{k}'] = grouped['u_out'].shift(k)
        df[f'u_in_lag_back{k}'] = grouped['u_in'].shift(-k)
        df[f'u_out_lag_back{k}'] = grouped['u_out'].shift(-k)
    df = df.fillna(0)

    # Per-breath aggregates and differences from them.
    grouped = df.groupby('breath_id')
    df['breath_id__u_in__max'] = grouped['u_in'].transform('max')
    df['breath_id__u_out__max'] = grouped['u_out'].transform('max')
    df['u_in_diff1'] = df['u_in'] - df['u_in_lag1']
    df['u_out_diff1'] = df['u_out'] - df['u_out_lag1']
    df['u_in_diff2'] = df['u_in'] - df['u_in_lag2']
    df['u_out_diff2'] = df['u_out'] - df['u_out_lag2']
    # The original computed these two columns twice; once is enough.
    df['breath_id__u_in__diffmax'] = grouped['u_in'].transform('max') - df['u_in']
    df['breath_id__u_in__diffmean'] = grouped['u_in'].transform('mean') - df['u_in']
    df['u_in_diff3'] = df['u_in'] - df['u_in_lag3']
    df['u_out_diff3'] = df['u_out'] - df['u_out_lag3']
    df['u_in_diff4'] = df['u_in'] - df['u_in_lag4']
    df['u_out_diff4'] = df['u_out'] - df['u_out_lag4']

    # Interaction terms (u_out is 0/1, so these gate on the expiratory phase).
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # One-hot encode the resistance/compliance settings and their combination.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']  # already str; re-casting was redundant
    df = pd.get_dummies(df)
    return df
# Apply the same feature engineering to both splits so train and test
# share an identical column layout.
train, test = add_features(train), add_features(test)
print('Train dataframe shape', train.shape)
train.head()  # notebook cell: displays the first rows
第二步:导入模型并计算特征重要性:
# Training configuration.
EPOCH = 300
BATCH_SIZE = 1024
NUM_FOLDS = 10
# Train from scratch (True) or load pre-trained fold models (False).
TRAIN_MODEL = False
# detect and init the TPU
#tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
# instantiate a distribution strategy
#tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
# GET GPU STRATEGY — get_strategy() returns the current (default) strategy;
# uncomment the TPU lines above and swap this out to run on TPU instead.
gpu_strategy = tf.distribute.get_strategy()
with gpu_strategy.scope():
    # K-fold cross-validation; per fold we optionally train, optionally
    # predict the test set, and optionally compute permutation importance.
    # NOTE(review): `train`, `targets`, `test`, `COLS`, and the flags
    # INFER_TEST / COMPUTE_LSTM_IMPORTANCE / ONE_FOLD_ONLY are assumed to
    # be defined earlier in the notebook — confirm before running.  `train`
    # is indexed as a 3-D array here (samples, timesteps, features), so it
    # must have been reshaped from the dataframe beforehand.
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2021)
    test_preds = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
        K.clear_session()
        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
        X_train, X_valid = train[train_idx], train[test_idx]
        y_train, y_valid = targets[train_idx], targets[test_idx]
        checkpoint_filepath = f"folds{fold}.hdf5"

        if TRAIN_MODEL:
            # Stacked bidirectional LSTM regressor over (timesteps, features).
            model = keras.models.Sequential([
                keras.layers.Input(shape=train.shape[-2:]),
                keras.layers.Bidirectional(keras.layers.LSTM(1024, return_sequences=True)),
                keras.layers.Bidirectional(keras.layers.LSTM(512, return_sequences=True)),
                keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True)),
                keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True)),
                keras.layers.Dense(128, activation='selu'),
                keras.layers.Dense(1),
            ])
            model.compile(optimizer="adam", loss="mae")
            # Halve the LR after 10 stagnant epochs; stop after 60 and
            # restore the best weights; checkpoint the best model per fold.
            lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=10, verbose=1)
            es = EarlyStopping(monitor="val_loss", patience=60, verbose=1, mode="min",
                               restore_best_weights=True)
            sv = keras.callbacks.ModelCheckpoint(
                checkpoint_filepath, monitor='val_loss', verbose=1, save_best_only=True,
                save_weights_only=False, mode='auto', save_freq='epoch',
                options=None
            )
            model.fit(X_train, y_train, validation_data=(X_valid, y_valid),
                      epochs=EPOCH, batch_size=BATCH_SIZE, callbacks=[lr, es, sv])
        else:
            # Load the pre-trained fold model published with the reference notebook.
            model = keras.models.load_model(
                '../input/finetune-of-tensorflow-bidirectional-lstm/' + checkpoint_filepath)

        if INFER_TEST:
            print(' Predicting test data...')
            test_preds.append(model.predict(test, verbose=0).squeeze().reshape(-1, 1).squeeze())

        if COMPUTE_LSTM_IMPORTANCE:
            results = []
            print(' Computing LSTM feature importance...')
            # COMPUTE BASELINE (NO SHUFFLE)
            oof_preds = model.predict(X_valid, verbose=0).squeeze()
            baseline_mae = np.mean(np.abs(oof_preds - y_valid))
            results.append({'feature': 'BASELINE', 'mae': baseline_mae})

            for k in tqdm(range(len(COLS))):
                # SHUFFLE FEATURE K (save a copy so it can be restored after)
                save_col = X_valid[:, :, k].copy()
                np.random.shuffle(X_valid[:, :, k])
                # COMPUTE OOF MAE WITH FEATURE K SHUFFLED — the MAE increase
                # over the baseline is that feature's importance.
                oof_preds = model.predict(X_valid, verbose=0).squeeze()
                mae = np.mean(np.abs(oof_preds - y_valid))
                results.append({'feature': COLS[k], 'mae': mae})
                X_valid[:, :, k] = save_col

            # DISPLAY LSTM FEATURE IMPORTANCE
            # (renamed from `df` to avoid shadowing the feature dataframe name)
            print()
            imp = pd.DataFrame(results)
            imp = imp.sort_values('mae')
            plt.figure(figsize=(10, 20))
            plt.barh(np.arange(len(COLS) + 1), imp.mae)
            plt.yticks(np.arange(len(COLS) + 1), imp.feature.values)
            plt.title('LSTM Feature Importance', size=16)
            plt.ylim((-1, len(COLS) + 1))
            plt.plot([baseline_mae, baseline_mae], [-1, len(COLS) + 1], '--', color='orange',
                     label=f'Baseline OOF\nMAE={baseline_mae:.3f}')
            plt.xlabel(f'Fold {fold+1} OOF MAE with feature permuted', size=14)
            plt.ylabel('Feature', size=14)
            plt.legend()
            plt.show()

            # SAVE LSTM FEATURE IMPORTANCE (most-damaging features first)
            imp = imp.sort_values('mae', ascending=False)
            imp.to_csv(f'lstm_feature_importance_fold_{fold+1}.csv', index=False)

        # ONLY DO ONE FOLD
        if ONE_FOLD_ONLY:
            break