Kaggle project: TPU debugging notes

Code

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold
from tensorflow.keras.applications import DenseNet201

def get_model():
    # Build the model inside the TPU strategy scope so that its
    # variables are created on the TPU replicas.
    with strategy.scope():
        rnet = DenseNet201(
            input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3),
            weights='imagenet',
            include_top=False
        )
        # Fine-tune the whole DenseNet backbone instead of freezing it.
        rnet.trainable = True
        model = tf.keras.Sequential([
            rnet,
            tf.keras.layers.GlobalAveragePooling2D(),
            # Keep the output layer in float32 so the softmax stays
            # numerically stable when mixed precision is enabled.
            tf.keras.layers.Dense(len(CLASSES), activation='softmax', dtype='float32')
        ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy']
    )
    return model
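
get_model() relies on a global strategy object created earlier in the notebook. For reference, a minimal sketch of the usual Kaggle TPU setup under TF 2.x; the original notebook's setup cell is not shown here, so treat this as an assumption:

# Assumed TPU setup (standard Kaggle/Colab boilerplate, not from the original cell).
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # auto-detects the TPU on Kaggle
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)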

def train_cross_validate(folds):
    histories = []
    models = []
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    # random_state only takes effect with shuffle=True; passing it with
    # shuffle=False is what raised the sklearn FutureWarning in the log below.
    kfold = KFold(folds, shuffle=True, random_state=SEED)

    # Debug version: train on the first fold only. kfold.split() yields
    # (train_indices, val_indices) pairs of index arrays.
    f = 0
    trn_ind, val_ind = next(kfold.split(TRAINING_FILENAMES))
    print(); print('#' * 25)
    print('### FOLD', f + 1)
    print('#' * 25)
    filenames = pd.DataFrame({'TRAINING_FILENAMES': TRAINING_FILENAMES})
    train_dataset = load_dataset(list(filenames.loc[trn_ind]['TRAINING_FILENAMES']), labeled=True)
    val_dataset = load_dataset(list(filenames.loc[val_ind]['TRAINING_FILENAMES']), labeled=True, ordered=True)
    model = get_model()
    history = model.fit(
        get_training_dataset(train_dataset),
        steps_per_epoch=STEPS_PER_EPOCH,
        epochs=EPOCHS,
        callbacks=[lr_callback],  # early_stopping disabled while debugging
        validation_data=get_validation_dataset(val_dataset),
        verbose=2
    )
    models.append(model)
    histories.append(history)
    return histories, models



# Full cross-validation loop, kept here disabled while the single-fold
# version above is being debugged. Note that kfold.split() returns a
# generator, so it cannot be indexed with [0]; iterate over it directly.
'''
    for f, (trn_ind, val_ind) in enumerate(kfold.split(TRAINING_FILENAMES)):
        print(); print('#'*25)
        print('### FOLD', f+1)
        print('#'*25)
        filenames = pd.DataFrame({'TRAINING_FILENAMES': TRAINING_FILENAMES})
        train_dataset = load_dataset(list(filenames.loc[trn_ind]['TRAINING_FILENAMES']), labeled=True)
        val_dataset = load_dataset(list(filenames.loc[val_ind]['TRAINING_FILENAMES']), labeled=True, ordered=True)
        model = get_model()
        history = model.fit(
            get_training_dataset(train_dataset),
            steps_per_epoch=STEPS_PER_EPOCH,
            epochs=EPOCHS,
            callbacks=[lr_callback],  # early_stopping disabled while debugging
            validation_data=get_validation_dataset(val_dataset),
            verbose=2
        )
        models.append(model)
        histories.append(history)
    return histories, models
'''
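
For reference, kfold.split() yields arrays of row indices, which is why .loc[trn_ind] must receive an index array rather than a scalar. A tiny standalone check (the filenames here are made up):

from sklearn.model_selection import KFold
import numpy as np

files = np.array(['00.tfrec', '01.tfrec', '02.tfrec', '03.tfrec', '04.tfrec'])
kf = KFold(5, shuffle=True, random_state=42)
trn_ind, val_ind = next(kf.split(files))
print(files[trn_ind])  # four training shards
print(files[val_ind])  # the one held-out shard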

def train_and_predict(folds = 5):
    # Order matters: we split the test dataset and iterate separately
    # over images and ids, so both passes must see the same order.
    test_ds = get_test_dataset(ordered=True)
    test_images_ds = test_ds.map(lambda image, idnum: image)
    print('Start training %i folds' % folds)
    histories, models = train_cross_validate(folds = folds)
    print('Computing predictions...')
    # Average the predicted probabilities over all fold models; the
    # original version only read models[0], defeating the ensemble.
    probabilities = np.average([m.predict(test_images_ds) for m in models], axis = 0)
    predictions = np.argmax(probabilities, axis=-1)
    print('Generating submission.csv file...')
    test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
    test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U')  # all ids in one batch
    np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]),
               fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')
    return histories, models
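
Why ordered=True matters: images and ids are extracted in two separate traversals of test_ds and are only zipped back together by position in the submission file. A minimal illustration of the pattern with toy data (not the competition pipeline):

import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(
    (tf.zeros([4, 2]), tf.constant(['a', 'b', 'c', 'd'])))
images = ds.map(lambda image, idnum: image)  # first pass: images only
ids = ds.map(lambda image, idnum: idnum)     # second pass: ids only
# Both passes traverse ds in the same deterministic order, so element i
# of images lines up with element i of ids.
print(next(iter(ids)).numpy())  # b'a'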
    
# run train and predict
histories, models = train_and_predict(folds = FOLDS)

Terminal output

Start training 10 folds

#########################
### FOLD 1
#########################
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_split.py:297: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
  FutureWarning
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5
74842112/74836368 [==============================] - 2s 0us/step

Epoch 00001: LearningRateScheduler reducing learning rate to 1e-05.
Epoch 1/19
---------------------------------------------------------------------------
UnimplementedError                        Traceback (most recent call last)
<ipython-input-14-869733d1f23c> in <module>
     90 
     91 # run train and predict
---> 92 histories, models = train_and_predict(folds = FOLDS)

<ipython-input-14-869733d1f23c> in train_and_predict(folds)
     78     test_images_ds = test_ds.map(lambda image, idnum: image)
     79     print('Start training %i folds'%folds)
---> 80     histories, models = train_cross_validate(folds = folds)
     81     print('Computing predictions...')
     82     # get the mean probability of the folds models

<ipython-input-14-869733d1f23c> in train_cross_validate(folds)
     45         callbacks = [lr_callback],#, early_stopping],
     46         validation_data = get_validation_dataset(val_dataset),
---> 47         verbose=2
     48     )
     49     models.append(model)

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
     64   def _method_wrapper(self, *args, **kwargs):
     65     if not self._in_multi_worker_mode():  # pylint: disable=protected-access
---> 66       return method(self, *args, **kwargs)
     67 
     68     # Running inside `run_distribute_coordinator` already.

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
    853                 context.async_wait()
    854               logs = tmp_logs  # No error, now safe to assign to logs.
--> 855               callbacks.on_train_batch_end(step, logs)
    856         epoch_logs = copy.copy(logs)
    857 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/callbacks.py in on_train_batch_end(self, batch, logs)
    387     """
    388     if self._should_call_train_batch_hooks:
--> 389       logs = self._process_logs(logs)
    390       self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
    391 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/callbacks.py in _process_logs(self, logs)
    263     """Turns tensors into numpy arrays or Python scalars."""
    264     if logs:
--> 265       return tf_utils.to_numpy_or_python_type(logs)
    266     return {}
    267 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/utils/tf_utils.py in to_numpy_or_python_type(tensors)
    521     return t  # Don't turn ragged or sparse tensors to NumPy.
    522 
--> 523   return nest.map_structure(_to_single_numpy_or_python_type, tensors)
    524 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
    615 
    616   return pack_sequence_as(
--> 617       structure[0], [func(*x) for x in entries],
    618       expand_composites=expand_composites)
    619 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
    615 
    616   return pack_sequence_as(
--> 617       structure[0], [func(*x) for x in entries],
    618       expand_composites=expand_composites)
    619 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/utils/tf_utils.py in _to_single_numpy_or_python_type(t)
    517   def _to_single_numpy_or_python_type(t):
    518     if isinstance(t, ops.Tensor):
--> 519       x = t.numpy()
    520       return x.item() if np.ndim(x) == 0 else x
    521     return t  # Don't turn ragged or sparse tensors to NumPy.

/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in numpy(self)
    959     """
    960     # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
--> 961     maybe_arr = self._numpy()  # pylint: disable=protected-access
    962     return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
    963 

/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in _numpy(self)
    927       return self._numpy_internal()
    928     except core._NotOkStatusException as e:
--> 929       six.raise_from(core._status_to_exception(e.code, e.message), None)
    930 
    931   @property

/opt/conda/lib/python3.7/site-packages/six.py in raise_from(value, from_value)

UnimplementedError: 7 root error(s) found.
  (0) Unimplemented: {{function_node __inference_train_function_259513}} File system scheme '[local]' not implemented (file: 'd')
	 [[{{node MultiDeviceIteratorGetNextFromShard}}]]
	 [[RemoteCall]]
	 [[IteratorGetNextAsOptional_6]]
  (1) Cancelled: {{function_node __inference_train_function_259513}} Function was cancelled before it was started
  (2) Cancelled: {{function_node __inference_train_function_259513}} Function was cancelled before it was started
  (3) Cancelled: {{function_node __inference_train_function_259513}} Function was cancelled before it was started
  (4) Cancelled: {{function_node __inference_train_function_259513}} Function was cancelled before it was started
  (5) Cancelled: {{function_node __inference_train_function_259513}} Function was cancelled before it was started
  (6) Cancelled: {{function_node __inference_train_function_259513}} Function was cancelled before it was started
0 successful operations.
2 derived errors ignored.
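
Diagnosis: two problems combine to produce this error. First, "(file: 'd')" shows tf.data trying to open a file named just 'd', which suggests the indexing hack is at fault: with trn_ind = 0, filenames.loc[0]['TRAINING_FILENAMES'] returns a single filename string, and list() over a string splits it into individual characters, so load_dataset received something like ['d', 'a', 't', ...]. The corrected indexing in train_cross_validate above fixes this. Second, "File system scheme '[local]' not implemented" means the TPU workers tried to read from the VM's local filesystem, which they cannot do: a TPU can only stream input from Google Cloud Storage (gs://...) paths, so TRAINING_FILENAMES must be listed from a GCS bucket. A sketch of the usual Kaggle pattern follows; the dataset name and directory layout are placeholders, not taken from the original notebook:

# Hedged sketch: make TRAINING_FILENAMES point at GCS so the TPU can read it.
# 'flower-classification-with-tpus' and the tfrecords path are placeholders.
from kaggle_datasets import KaggleDatasets

GCS_DS_PATH = KaggleDatasets().get_gcs_path('flower-classification-with-tpus')
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords-jpeg-512x512/train/*.tfrec')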