Running a model on a physical GPU machine (Linux) + modifying the dataset
Preface:
We run the BERT model both on a physical machine and inside a Docker image.
Reference
30分钟带你彻底掌握Bert源码(Pytorch),超详细!!不看后悔!! - 知乎
The original author explains everything very well, so I won't repeat that material here; this post covers the problems I ran into and how I solved them.
Code source
https://github.com/ChineseGLUE/ChineseGLUE/tree/master/baselines/models_pytorch/classifier_pytorch
Download from here:
ChineseGLUE/baselines/models_pytorch/classifier_pytorch/
Running the model on a physical GPU machine (Linux)
1. Convert the downloaded Chinese TF checkpoint into a PyTorch model:
export BERT_BASE_DIR=/chinese_L-12_H-768_A-12
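The conversion itself is done by the script bundled with the transformers / pytorch_transformers package; the following is a minimal Python sketch of the same step, assuming transformers is installed (TensorFlow must also be installed to read the checkpoint, and the exact import path may differ by version):

import torch
from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert

BERT_BASE_DIR = "/chinese_L-12_H-768_A-12"  # path from the export above

# Build an empty PyTorch BERT from the TF config, load the TF weights into it,
# then save the weights in PyTorch format.
config = BertConfig.from_json_file(BERT_BASE_DIR + "/bert_config.json")
model = BertForPreTraining(config)
load_tf_weights_in_bert(model, config, BERT_BASE_DIR + "/bert_model.ckpt")
torch.save(model.state_dict(), BERT_BASE_DIR + "/pytorch_model.bin")

After it finishes, the folder should contain a pytorch_model.bin next to the original bert_config.json and vocab.txt.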
2. Put the converted model into the code folder.
3. Download the tnews dataset
https://github.com/CLUEbenchmark/CLUE
4. Modify the DataProcessor
To use a new dataset, it is best to write your own DataProcessor. For example, I opened glue.py in /home/bert-step/classifier_pytorch/processors/ and added my own processor (ymProcessor).
To keep the excerpt below short, I removed the other Processor classes (they remain in the real file).
""" GLUE processors and helpers """
import logging
import os
import torch
from .utils import DataProcessor, InputExample, InputFeatures
import tensorflow as tf
logger = logging.getLogger(__name__)
import json
import six
def collate_fn(batch):
    """
    batch should be a list of (sequence, target, length) tuples...
    Returns a padded tensor of sequences sorted from longest to shortest,
    """
    all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels = map(torch.stack, zip(*batch))
    max_len = max(all_lens).item()
    # Sequences are right-padded, so keep only the first max_len positions.
    all_input_ids = all_input_ids[:, :max_len]
    all_attention_mask = all_attention_mask[:, :max_len]
    all_token_type_ids = all_token_type_ids[:, :max_len]
    return all_input_ids, all_attention_mask, all_token_type_ids, all_labels

def xlnet_collate_fn(batch):
    """
    batch should be a list of (sequence, target, length) tuples...
    Returns a padded tensor of sequences sorted from longest to shortest,
    """
    all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels = map(torch.stack, zip(*batch))
    max_len = max(all_lens).item()
    # XLNet pads on the left, so keep the last max_len positions instead.
    all_input_ids = all_input_ids[:, -max_len:]
    all_attention_mask = all_attention_mask[:, -max_len:]
    all_token_type_ids = all_token_type_ids[:, -max_len:]
    return all_input_ids, all_attention_mask, all_token_type_ids, all_labels
def glue_convert_examples_to_features(examples, tokenizer,
                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        input_len = len(input_ids)
        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))
            logger.info("input length: %d" % (input_len))
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label,
                          input_len=input_len))
    return features
def convert_to_unicode(text):
    """Converts text to unicode (if it isn't already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):  # noqa: F821 (Python 2 only)
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python3?")
class ymProcessor(DataProcessor):
    """Processor for a custom two-class dataset stored as one JSON object per line."""

    def get_train_examples(self, data_dir):
        """See base class."""
        # _create_examples opens the file itself, so pass the path directly.
        return self._create_examples(os.path.join(data_dir, "train.txt"), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(os.path.join(data_dir, "dev.txt"), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(os.path.join(data_dir, "test.txt"), "test")

    def get_labels(self):
        """See base class."""
        # Labels must be strings: the "labels" field read from the JSON lines is
        # a string, and glue_convert_examples_to_features looks it up in label_map.
        return ["0", "1"]

    def _create_examples(self, file_name, set_type):
        """Creates examples for the training, dev and test sets."""
        examples = []
        lines = tf.gfile.Open(file_name, "r")  # under TF 2.x this is tf.io.gfile.GFile
        index = 0
        for line in lines:
            line_obj = json.loads(line)
            index = index + 1
            guid = "%s-%s" % (set_type, index)
            text_a = convert_to_unicode(line_obj["text"])
            label = convert_to_unicode(line_obj["labels"])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
glue_tasks_num_labels = {
    "mnli": 3,
    "mrpc": 2,
    "sst-2": 2,
    "sts-b": 1,
    "qqp": 2,
    "qnli": 2,
    "rte": 2,
    "xnli": 3,
    "tnews": 15,
    "lcqmc": 2,
    "inews": 3,
    "ym": 2,
}

# The Processor classes below (other than ymProcessor) are defined in the full
# glue.py of the repo; their definitions were cut from this excerpt for brevity.
glue_processors = {
    "cola": ColaProcessor,
    "mnli": MnliProcessor,
    "mnli-mm": MnliMismatchedProcessor,
    "mrpc": MrpcProcessor,
    "sst-2": Sst2Processor,
    "sts-b": StsbProcessor,
    "qqp": QqpProcessor,
    "qnli": QnliProcessor,
    "rte": RteProcessor,
    "wnli": WnliProcessor,
    "tnews": TnewsProcessor,
    "xnli": XnliProcessor,
    "lcqmc": LcqmcProcessor,
    "inews": InewsProcessor,
    "ym": ymProcessor,
}

glue_output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
    "mrpc": "classification",
    "sst-2": "classification",
    "sts-b": "regression",
    "qqp": "classification",
    "qnli": "classification",
    "rte": "classification",
    "wnli": "classification",
    "tnews": "classification",
    "xnli": "classification",
    "lcqmc": "classification",
    "inews": "classification",
    "ym": "classification",
}
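For reference, _create_examples expects each line of train.txt / dev.txt / test.txt to be a standalone JSON object with "text" and "labels" keys; a hypothetical example line (the content is made up):

{"text": "这家店的菜很好吃", "labels": "1"}

The "labels" value must be one of the strings returned by get_labels().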
5. An error I ran into:
Cause: the txt data files ended with extra blank lines; deleting them resolved it.
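Because _create_examples calls json.loads on every line, any blank line crashes the load. A minimal cleanup sketch (it assumes the files sit in the current directory and match the names used by ymProcessor):

for name in ("train.txt", "dev.txt", "test.txt"):
    with open(name, encoding="utf-8") as f:
        lines = [line for line in f if line.strip()]  # drop blank lines
    with open(name, "w", encoding="utf-8") as f:
        f.writelines(lines)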
6. Run it on the GPU:
Time: under an hour on the GPU (with each of the three data files containing only about a thousand examples).
Also, if you are using a new dataset with your own Processor, the command to run is:
python run_classifier.py --model_type=bert --model_name_or_path=/usr/local/bert-test --task_name="ym" --do_train --do_eval --do_lower_case --data_dir=./chineseGLUEdatasets/ym --max_seq_length=128 --per_gpu_train_batch_size=16 --per_gpu_eval_batch_size=16 --learning_rate=2e-5 --num_train_epochs=4.0 --logging_steps=100 --save_steps=100 --output_dir=./outputs/tnews_output/ --overwrite_output_dir
Running the model in a Docker image
1. Build the algorithm image
Put the pretrained model and the code into a single folder.
Dockerfile:
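The original Dockerfile is not reproduced here; as a rough sketch, an image for this setup could look like the following (the base image tag, folder layout, and dependency list are all assumptions, not the original file):

# Assumed base image with PyTorch preinstalled; pick a tag matching your CUDA setup.
FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
# Copy the folder holding both the converted BERT model and the classifier code.
COPY . /workspace/bert-step
WORKDIR /workspace/bert-step/classifier_pytorch
# Hypothetical dependency list; adjust to what run_classifier.py actually imports.
RUN pip install scikit-learn boto3 regex requests tqdm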
Build the image from the Dockerfile:
sudo docker build -t test/bert-step:v1.2.0-dev ./
The image name after -t can be anything you like.
2. Run the Docker image
sudo docker run -it --rm test/bert-step:v1.2.0-dev
This drops you into the container, where you can use ls to inspect its files.
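Note that a plain docker run gives the container no GPU access, which is why the in-container run below falls back to CPU. Assuming Docker 19.03+ with the NVIDIA container toolkit installed on the host (an assumption about the environment, not something the original setup confirms), the GPU can be passed through like this:

sudo docker run -it --rm --gpus all test/bert-step:v1.2.0-dev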
3. Run the model inside the image
Command:
python run_classifier.py --model_type=bert --model_name_or_path=/home/mao.yang/bert-base-chinese --task_name="tnews" --do_train --do_eval --do_lower_case --data_dir=./chineseGLUEdatasets/tnews --max_seq_length=128 --per_gpu_train_batch_size=16 --per_gpu_eval_batch_size=16 --learning_rate=2e-5 --num_train_epochs=4.0 --logging_steps=100 --save_steps=100 --output_dir=./outputs/tnews_output/ --overwrite_output_dir
// note: change --data_dir to the correct location
Run output:
The run succeeded. Without the GPU (the command ran on the CPU), it took about 4 hours.
With the GPU, it finishes in under an hour.