class SimProcessor(DataProcessor):
    """Processor for a two-label sentence-pair data set stored as TSV files.

    The train, dev and test splits are read from ``train.tsv``, ``dev.tsv``
    and ``test.tsv`` inside the data directory. In every file the first row
    is skipped (presumably a header row -- confirm against the data files);
    each remaining row supplies ``text_a`` in column 0 and ``text_b`` in
    column 1. Train and dev rows must also carry the label in column 2.
    """

    def __init__(self):
        # Language tag for this data set; no method in this class reads it.
        self.language = "zh"

    def get_train_examples(self, data_dir):
        """Returns the examples built from ``train.tsv`` under `data_dir`."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """Returns the examples built from ``dev.tsv`` under `data_dir`."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """Returns the examples built from ``test.tsv`` under `data_dir`."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """Returns the two label strings used by this task."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates one sentence-pair example per data row.

        Args:
          lines: Sequence of rows, each an indexable sequence of column
            strings. Row 0 is skipped.
          set_type: Name of the split ("train", "dev" or "test"). It prefixes
            every guid, and "test" switches to a placeholder label.

        Returns:
          A list of `InputExample` objects, one per row after the first.

        Raises:
          IndexError: If a row has fewer than two columns, or a non-test row
            has fewer than three.
        """
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = tokenization.convert_to_unicode(line[0])
            text_b = tokenization.convert_to_unicode(line[1])
            if set_type == "test":
                # Test rows are not required to have a label column, so the
                # column is never read here; a fixed placeholder is used.
                label = "1"
            else:
                label = tokenization.convert_to_unicode(line[2])
            # Exactly one example per row, always carrying both sentences.
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b,
                             label=label))
        return examples
# Launch run_classifier.py on the "sim" task.
# BERT_BASE_DIR must contain vocab.txt, bert_config.json and bert_model.ckpt;
# DATA_DIR is passed as --data_dir; results go to $OUTPUT_DIR/$EXP_NAME.
export BERT_BASE_DIR=/deep_docs/deeps/chinese_L-12_H-768_A-12
export DATA_DIR=/deep_docs/deeps/data/sim
export OUTPUT_DIR=/deep_docs/deeps/output
export EXP_NAME=sim

# Training, evaluation and prediction are all enabled in this one run, and
# only CUDA device 3 is made visible to the process.
# Every continuation backslash needs a space in front of it; otherwise the
# next flag can be glued onto the previous argument.
CUDA_VISIBLE_DEVICES=3 python run_classifier.py \
  --task_name=sim \
  --do_train=true \
  --do_eval=true \
  --do_predict=true \
  --data_dir="$DATA_DIR" \
  --vocab_file="$BERT_BASE_DIR/vocab.txt" \
  --bert_config_file="$BERT_BASE_DIR/bert_config.json" \
  --init_checkpoint="$BERT_BASE_DIR/bert_model.ckpt" \
  --max_seq_length=16 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=5.0 \
  --output_dir="$OUTPUT_DIR/$EXP_NAME"