Problem description:
While training a text classification task in MindSpore with a custom RNN model, I wrap my custom dataset with mindspore.dataset.GeneratorDataset; calling mindspore.Model.train() then fails with:
ValueError: The data pipeline is not a tree (i.e., one node has 2 consumers)
I searched the related answers on the forum, but my code never assigns the result of the same dataset.method() call to two instances, so I am not sure what is going on.
I also tried to follow the traceback in the shell and got as far as parse_tree in the abstract Dataset class, but there I was stuck: I could not figure out how children gets populated (see the probe below).
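For reference, this is the small probe I tried while looking at where children comes from (a sketch; DataSetGenerator, train_x and train_y are from my code below):

import mindspore.dataset as mds

ds = mds.GeneratorDataset(DataSetGenerator(train_x, train_y),
                          column_names=['text', 'label'])
batched = ds.batch(32)

# every dataset op builds a brand-new node; the input node is recorded
# in the new node's children list rather than being modified in place
print(type(batched).__name__)      # BatchDataset
print(batched.children[0] is ds)   # True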
MindSpore runs in PyNative (dynamic graph) mode:
context.set_context(mode=context.PYNATIVE_MODE,
                    device_target="GPU")
Environment:
mindspore = 1.6
cuda = 11.1
os = ubuntu-20.04
Simplified code flow
data_loader = DataLoader(args)
tok = lambda x: jieba.lcut(x, cut_all=False)
train_x, train_y, dev_x, dev_y, test_x = data_loader.get_dateSet(tok)
train_gen = DataSetGenerator(train_x, train_y)
train_dataset = mds.GeneratorDataset(train_gen,
                                     shuffle=True,
                                     column_names=['text', 'label'])
train_dataset.batch(batch_size=args.batch_size)
rnn = TextRNN(data_loader.data_size, 200, 10)
critirition = nn.SoftmaxCrossEntropyWithLogits()
opt = nn.Adam(rnn.trainable_params(), learning_rate=args.lr)
model = Model(rnn, critirition, opt)
model.train(train_dataset=train_dataset, epoch=args.epochs)
The custom dataset is just a simple iterable object (it only implements __getitem__ and __len__; the full code is below).
Full details
Custom model
import numpy as np
from mindspore import Tensor, nn, ops


class TextRNN(nn.Cell):
    """
    x => rnn_block => dropout => fc
    """
    def __init__(self,
                 n_class,
                 n_hidden,
                 num_out,
                 num_direction=1,
                 layer_num=1,
                 dropout=0.5):
        super(TextRNN, self).__init__()
        self.hide_size = n_hidden
        self.n_dirct = num_direction
        self.n_layer = layer_num
        self.dropout = dropout
        bi_direct = True if self.n_dirct == 2 else False
        self.rnn = nn.RNN(input_size=n_class,
                          hidden_size=n_hidden,
                          batch_first=True,
                          bidirectional=bi_direct)
        # self.prev_h = Tensor(np.zeros((1, n_hidden)).astype(np.float32))
        self.fc = nn.Dense(n_hidden * self.n_dirct,
                           num_out,
                           activation=nn.Sigmoid())
        self.dp = nn.Dropout(keep_prob=self.dropout)
        self.slice = ops.Slice()
        # Squeeze takes its axis at construction time, not at call time
        self.squeeze = ops.Squeeze(axis=1)

    def construct(self, x, seq_len):
        """
        Input:
            x: (batch, time_step, input_size)
            seq_len: (batch)
        """
        batch_size = x.shape[0]
        hx = Tensor(
            np.ones((self.n_dirct * self.n_layer, batch_size,
                     self.hide_size)).astype(np.float32))
        rnn_out, _ = self.rnn(
            x, hx)  # (batch, time_step, num_directions*hidden_size)
        # take the last time step; size 1 on axis 1 matches the comment below
        last_hdState = self.slice(
            rnn_out, (0, seq_len - 1, 0),
            (-1, 1, -1))  # (batch, 1, num_directions*hidden_size)
        last_hdState = self.squeeze(
            last_hdState)  # (batch, num_directions*hidden_size)
        dropout = self.dp(last_hdState)
        out = self.fc(dropout)
        return out
Training script
import os

import jieba
import numpy as np
import mindspore.dataset as mds
from mindspore import Model, context, nn
from tqdm import tqdm

# TextRNN comes from the model file above; set_color, COLOR and
# args_init are project-local helpers whose definitions are omitted here.


class DataLoader:
    """
    DataLoader for rnn
    """
    def __init__(self, args, max_seqLen=20):
        self.data_path = args.data_path
        self.max_seqLen = max_seqLen
        self.train_file = os.path.join(args.data_path, 'train.txt')
        self.dev_file = os.path.join(args.data_path, 'dev.txt')
        self.text_file = os.path.join(args.data_path, 'test.txt')
        self.vocab_file = os.path.join(args.data_path, 'vocab.txt')
        self.embed_file = os.path.join(args.data_path, 'embed.npy')
        # prepare word2idx, and idx2embed
        vocab = []
        with open(self.vocab_file, 'r') as fr:
            for line in fr:
                vocab.extend(line.strip().split())
        vocab = list(set(vocab))
        self._vocab_size = len(vocab)
        self.word2idx = {w: i for i, w in enumerate(vocab)}
        del vocab
        self.word2idx['<UNK>'] = self._vocab_size
        self.word2idx['<PAD>'] = self._vocab_size + 1
        self.idx2embed = np.load(self.embed_file)

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def data_size(self):
        return self.max_seqLen * self.idx2embed.shape[1]
    def get_dateSet(self, tok):
        """
        return train_set, dev_set, test_set
        """
        def load_data(file, tok, word2idx, pad_size=None):
            is_test = "test" in file
            print(set_color(f"Loading {file}...", COLOR.GREEN))
            data = []
            with open(file, 'r') as fr:
                for line in tqdm(fr):
                    tmp = []
                    if is_test:
                        text = line.strip()
                        label = -1
                    else:
                        text, label = line.strip().split('\t')
                    assert text is not None and label is not None
                    text = tok(text)
                    for word in text:
                        tmp.append(word2idx.get(word, word2idx['<UNK>']))
                    if pad_size:
                        if len(tmp) > pad_size:
                            tmp = tmp[:pad_size]
                        else:
                            tmp.extend([word2idx['<PAD>']] *
                                       (pad_size - len(tmp)))
                    data.append([tmp, int(label)])
            return data

        def load_embed(idx_list, idx2embed, have_label=True):
            get_embed = lambda x: [idx2embed[i] for i in x]
            embeds = []
            labels = []
            for indics in idx_list:
                if have_label:
                    labels.append(indics[-1])
                embeds.append(get_embed(indics[:-1])[0].flatten())
            return embeds, labels

        train_set = load_data(self.train_file, tok, self.word2idx,
                              self.max_seqLen)
        dev_set = load_data(self.dev_file, tok, self.word2idx, self.max_seqLen)
        test_set = load_data(self.text_file, tok, self.word2idx,
                             self.max_seqLen)
        train_x, train_y = load_embed(train_set,
                                      self.idx2embed,
                                      have_label=True)
        dev_x, dev_y = load_embed(dev_set, self.idx2embed, have_label=True)
        test_x, _ = load_embed(test_set, self.idx2embed, have_label=False)
        return train_x, train_y, dev_x, dev_y, test_x
class DataSetGenerator:
    def __init__(self, text, labels, isTest=False):
        self.text = text
        self.labels = labels
        self.isTest = isTest

    def __getitem__(self, idx):
        label = self.labels[idx]
        data = self.text[idx]
        return data, label

    def __len__(self):
        return len(self.text)
if __name__ == "__main__":
    args = args_init()
    context.set_context(mode=context.PYNATIVE_MODE,
                        device_target=args.device_target)
    print(set_color(f"Loading data...", COLOR.GREEN))
    data_loader = DataLoader(args)
    tok = lambda x: jieba.lcut(x, cut_all=False)
    train_x, train_y, dev_x, dev_y, test_x = data_loader.get_dateSet(tok)
    train_gen = DataSetGenerator(train_x, train_y)
    dev_gen = DataSetGenerator(dev_x, dev_y)
    test_gen = DataSetGenerator(test_x, None, isTest=True)
    train_dataset = mds.GeneratorDataset(train_gen,
                                         shuffle=True,
                                         column_names=['text', 'label'])
    train_dataset.batch(batch_size=args.batch_size)
    rnn = TextRNN(data_loader.data_size, 200, 10)
    critirition = nn.SoftmaxCrossEntropyWithLogits()
    opt = nn.Adam(rnn.trainable_params(), learning_rate=args.lr)
    print(set_color(f"Start training...", COLOR.GREEN))
    model = Model(rnn, critirition, opt)
    model.train(train_dataset=train_dataset, epoch=args.epochs)
    print(set_color(f"Good Job...", COLOR.GREEN))
Shell output
(mindspore) ➜ nlp-hw2-DL git:(master) ✗ python train/train_rnn.py
Loading data...
Loading data/train.txt...
0it [00:00, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.516 seconds.
Prefix dict has been built successfully.
180000it [00:11, 15736.50it/s]
Loading data/dev.txt...
10000it [00:00, 17419.34it/s]
Loading data/test.txt...
10000it [00:00, 17244.95it/s]
<class 'numpy.ndarray'> <class 'int'> (4000,)
Start training...
Traceback (most recent call last):
File "/home/leelin/Project/nlp-hw2-DL/train/train_rnn.py", line 159, in <module>
model.train(train_dataset=train_dataset, epoch=args.epochs)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 770, in train
self._train(epoch,
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 87, in wrapper
func(self, *args, **kwargs)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 540, in _train
self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params, sink_size)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 590, in _train_dataset_sink_process
dataset_helper, train_network = self._exec_preprocess(is_train=True,
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 391, in _exec_preprocess
dataset_helper = DatasetHelper(dataset, dataset_sink_mode, sink_size, epoch_num)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/dataset_helper.py", line 294, in __init__
self.iter = iterclass(dataset, sink_size, epoch_num)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/dataset_helper.py", line 465, in __init__
super().__init__(dataset, sink_size, epoch_num)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/dataset_helper.py", line 387, in __init__
_send_data(dataset, epoch_num)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/dataset_helper.py", line 32, in _send_data
exec_dataset.send(epoch_num)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/validators.py", line 1873, in new_method
return method(self, *args, **kwargs)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 3499, in send
self._to_device = _ToDevice(self, num_epochs)
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 3389, in __init__
ir_tree, self.api_tree = dataset.create_ir_tree()
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 310, in create_ir_tree
ir_tree = dataset.parse_tree()
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 398, in parse_tree
ir_children = [d.parse_tree() for d in self.children]
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 398, in <listcomp>
ir_children = [d.parse_tree() for d in self.children]
File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 397, in parse_tree
raise ValueError("The data pipeline is not a tree (i.e., one node has 2 consumers)")
ValueError: The data pipeline is not a tree (i.e., one node has 2 consumers)
Answer:
Hi, the return value is never assigned back here. batch(), like every dataset transformation in MindSpore, does not modify the dataset in place: it creates and returns a new BatchDataset node whose child is the original GeneratorDataset. After train_dataset.batch(batch_size=args.batch_size) runs, the GeneratorDataset already has one consumer (the orphaned BatchDataset), and when model.train() then sinks that same GeneratorDataset object to the device, the sink pipeline becomes its second consumer, which is exactly the "one node has 2 consumers" that parse_tree() rejects.
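Below is a minimal corrected sketch of the pipeline part of the script (same names as in the posted code; only the batch() line changes):

train_dataset = mds.GeneratorDataset(train_gen,
                                     shuffle=True,
                                     column_names=['text', 'label'])
# batch() does not work in place: it returns a new BatchDataset node,
# so the returned object must be assigned back (or the calls chained)
train_dataset = train_dataset.batch(batch_size=args.batch_size)

model = Model(rnn, critirition, opt)
model.train(train_dataset=train_dataset, epoch=args.epochs)

The same rule applies to every dataset transformation (map, shuffle, repeat, and so on): always continue with the dataset object that the call returns.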