import tensorflow as tf
import glob
import os
from google.protobuf.json_format import MessageToJson
# Command-line flag definitions (TF1-style tf.flags, backed by absl).
flags = tf.flags
FLAGS = flags.FLAGS
# Directory containing the TFRecord files to validate. Required (no default).
flags.DEFINE_string(
"file_path", None,
"file_path for check")
# Glob pattern (relative to file_path) selecting which files to validate.
flags.DEFINE_string(
"pattern", "*.tfrecord",
"file pattern for check")
def validate_dataset(filenames, reader_opts=None):
    """
    Attempt to iterate over every record in the supplied iterable of TFRecord
    filenames, printing each record as JSON along the way.

    A corrupt file does not abort the run: the error is reported (with the
    index of the record inside that file) and validation continues with the
    next file.

    :param filenames: iterable of filenames to read
    :param reader_opts: (optional) tf.python_io.TFRecordOptions to use when
        constructing the record iterator
    :return: total number of records successfully read across all files
    """
    total = 0
    for fname in filenames:
        print('validating ', fname)
        record_iterator = tf.python_io.tf_record_iterator(path=fname, options=reader_opts)
        # Per-file record index so error reports point at the right record
        # within the failing file (a cumulative counter would be misleading).
        record_idx = 0
        try:
            for record in record_iterator:
                jsonMessage = MessageToJson(tf.train.Example.FromString(record))
                print(jsonMessage)  # ["features"]["feature"]["input_ids"]
                record_idx += 1
                total += 1
        except Exception as e:
            # Broad catch is deliberate: any parse/IO failure in one file
            # should be reported but must not stop validation of the rest.
            print('error in {} at record {}'.format(fname, record_idx))
            print(e)
    # NOTE(review): the summary labels the count with the glob pattern, not
    # the actual file list — kept for backward compatibility of the output.
    print("%s has examples: %d" % (FLAGS.pattern, total))
    return total
def main(_):
    """Glob the TFRecord files selected by the flags and validate them."""
    search_glob = os.path.join(FLAGS.file_path, FLAGS.pattern)
    matched_files = glob.glob(search_glob)
    validate_dataset(matched_files)
if __name__ == '__main__':
    # tf.app.run() parses the command-line flags defined above, then calls main().
    tf.app.run()