MindSpore custom dataset: ValueError: The data pipeline is not a tree

Problem description:

While training a text classification task with a custom RNN model in MindSpore, I wrap my custom dataset with mindspore.dataset.GeneratorDataset and then train it with mindspore.Model.train(), which raises the error

ValueError: The data pipeline is not a tree (i.e., one node has 2 consumers)

I looked through the related answers on the forum, but my code never assigns the result of the same dataset.method() call to two different instances, so I am not sure what is going on.

I also tried following the traceback in the shell and got as far as parse_tree in the Dataset abstract class, but then got stuck: I could not work out how the children list is populated.

MindSpore is running in dynamic graph (PyNative) mode:

context.set_context(mode=context.PYNATIVE_MODE,
                    device_target="GPU")

Environment:

mindspore = 1.6
cuda = 11.1
os = ubuntu-20.04

Simplified code flow

    data_loader = DataLoader(args)
    tok = lambda x: jieba.lcut(x, cut_all=False)
    train_x, train_y, dev_x, dev_y, test_x = data_loader.get_dateSet(tok)
    train_gen = DataSetGenerator(train_x, train_y)
    train_dataset = mds.GeneratorDataset(train_gen,
                                         shuffle=True,
                                         column_names=['text', 'label'])
    train_dataset.batch(batch_size=args.batch_size)
    rnn = TextRNN(data_loader.data_size, 200, 10)
    critirition = nn.SoftmaxCrossEntropyWithLogits()
    opt = nn.Adam(rnn.trainable_params(), learning_rate=args.lr)
    model = Model(rnn, critirition, opt)
    model.train(train_dataset=train_dataset, epoch=args.epochs)

The custom dataset itself is just a simple random-access iterable object.
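For context, GeneratorDataset only needs a random-access source that implements __getitem__ and __len__. Below is a minimal, self-contained sketch of such a source (the array shapes and names here are made up purely for illustration; the actual DataSetGenerator used in this post is listed in full under "Full details"):

import numpy as np
import mindspore.dataset as mds

class RandomAccessSource:
    """Minimal random-access source: __getitem__ and __len__ are all GeneratorDataset requires."""
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __getitem__(self, idx):
        # each item becomes one row, split into the declared columns
        return self.features[idx], self.labels[idx]

    def __len__(self):
        return len(self.features)

source = RandomAccessSource(np.random.rand(8, 4000).astype(np.float32),
                            np.arange(8).astype(np.int32))
dataset = mds.GeneratorDataset(source, column_names=['text', 'label'], shuffle=True)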

Full details

Custom model

class TextRNN(nn.Cell):
    """
        x => rnn_block => dropout => fc
    """
    def __init__(self,
                 n_class,
                 n_hidden,
                 num_out,
                 num_direction=1,
                 layer_num=1,
                 dropout=0.5):
        super(TextRNN, self).__init__()
        self.hide_size = n_hidden
        self.n_dirct = num_direction
        self.n_layer = layer_num
        self.dropout = dropout
        bi_direct = True if self.n_dirct == 2 else False
        self.rnn = nn.RNN(input_size=n_class,
                          hidden_size=n_hidden,
                          batch_first=True,
                          bidirectional=bi_direct)
        # self.prev_h = Tensor(np.zeros((1, n_hidden)).astype(np.float32))
        self.fc = nn.Dense(n_hidden * self.n_dirct,
                           num_out,
                           activation=nn.Sigmoid())
        self.dp = nn.Dropout(keep_prob=self.dropout)
        self.slice = ops.Slice()
        self.squeeze = ops.Squeeze()

    def construct(self, x, seq_len):
        """
            Input:
                x: (batch, time_step, input_size)
                seq_len: (batch)
        """
        batch_size = x.shape[0]
        hx = Tensor(
            np.ones((self.n_dirct * self.n_layer, batch_size,
                      self.hide_size)).astype(np.float32))
        rnn_out, _ = self.rnn(
            x, hx)  # (batch, time_step, num_directions*hidden_size)
        last_hdState = self.slice(
            rnn_out, (0, seq_len - 1, 0),
            (-1, seq_len, -1))  # (batch, 1, num_directions*hidden_size)
        last_hdState = self.squeeze(
            last_hdState, axis=1)  # (batch, num_directions*hidden_size)
        dropout = self.dp(last_hdState)
        out = self.fc(dropout)
        return out

Training script

class DataLoader:
    """
        DataLoader for rnn
    """
    def __init__(self, args, max_seqLen=20):
        self.data_path = args.data_path
        self.max_seqLen = max_seqLen
        self.train_file = os.path.join(args.data_path, 'train.txt')
        self.dev_file = os.path.join(args.data_path, 'dev.txt')
        self.text_file = os.path.join(args.data_path, 'test.txt')
        self.vocab_file = os.path.join(args.data_path, 'vocab.txt')
        self.embed_file = os.path.join(args.data_path, 'embed.npy')
        # prepare word2idx, and idx2embed
        vocab = []
        with open(self.vocab_file, 'r') as fr:
            for line in fr:
                vocab.extend(line.strip().split())
        vocab = list(set(vocab))
        self._vocab_size = len(vocab)
        self.word2idx = {w: i for i, w in enumerate(vocab)}
        del vocab
        self.word2idx['<UNK>'] = self._vocab_size
        self.word2idx['<PAD>'] = self._vocab_size + 1
        self.idx2embed = np.load(self.embed_file)

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def data_size(self):
        return self.max_seqLen * self.idx2embed.shape[1]

    def get_dateSet(self, tok):
        """
            return train_set, dev_set, test_set
        """
        def load_data(file, tok, word2idx, pad_size=None):
            is_test = "test" in file
            print(set_color(f"Loading {file}...", COLOR.GREEN))
            data = []
            with open(file, 'r') as fr:
                for line in tqdm(fr):
                    tmp = []
                    if is_test:
                        text = line.strip()
                        label = -1
                    else:
                        text, label = line.strip().split('\t')
                    assert text != None and label != None
                    text = tok(text)
                    for word in text:
                        tmp.append(word2idx.get(word, word2idx['<UNK>']))
                    if pad_size:
                        if len(tmp) > pad_size:
                            tmp = tmp[:pad_size]
                        else:
                            tmp.extend([word2idx['<PAD>']] *
                                       (pad_size - len(tmp)))
                    data.append([tmp, int(label)])
            return data

        def load_embed(idx_list, idx2embed, have_label=True):
            get_embed = lambda x: [idx2embed[i] for i in x]
            embeds = []
            labels = []
            for indics in idx_list:
                if have_label:
                    labels.append(indics[-1])
                embeds.append(get_embed(indics[:-1])[0].flatten())
            return embeds, labels

        train_set = load_data(self.train_file, tok, self.word2idx,
                              self.max_seqLen)
        dev_set = load_data(self.dev_file, tok, self.word2idx, self.max_seqLen)
        test_set = load_data(self.text_file, tok, self.word2idx,
                             self.max_seqLen)
        train_x, train_y = load_embed(train_set,
                                      self.idx2embed,
                                      have_label=True)
        dev_x, dev_y = load_embed(dev_set, self.idx2embed, have_label=True)
        test_x, _ = load_embed(test_set, self.idx2embed, have_label=False)
        return train_x, train_y, dev_x, dev_y, test_x

class DataSetGenerator:
    def __init__(self, text, labels, isTest=False):
        self.text = text
        self.labels = labels
        self.isTest = isTest

    def __getitem__(self, idx):
        label = self.labels[idx]
        data = self.text[idx]
        return data, label

    def __len__(self):
        return len(self.text)

if __name__ == "__main__":
    args = args_init()
    context.set_context(mode=context.PYNATIVE_MODE,
                        device_target=args.device_target)
    print(set_color(f"Loading data...", COLOR.GREEN))
    data_loader = DataLoader(args)
    tok = lambda x: jieba.lcut(x, cut_all=False)
    train_x, train_y, dev_x, dev_y, test_x = data_loader.get_dateSet(tok)
    train_gen = DataSetGenerator(train_x, train_y)
    dev_gen = DataSetGenerator(dev_x, dev_y)
    test_gen = DataSetGenerator(test_x, None, isTest=True)
    train_dataset = mds.GeneratorDataset(train_gen,
                                         shuffle=True,
                                         column_names=['text', 'label'])
    train_dataset.batch(batch_size=args.batch_size)
    rnn = TextRNN(data_loader.data_size, 200, 10)
    critirition = nn.SoftmaxCrossEntropyWithLogits()
    opt = nn.Adam(rnn.trainable_params(), learning_rate=args.lr)
    print(set_color(f"Start training...", COLOR.GREEN))
    model = Model(rnn, critirition, opt)
    model.train(train_dataset=train_dataset, epoch=args.epochs)
    print(set_color(f"Good Job...", COLOR.GREEN))

Shell output

(mindspore) ➜  nlp-hw2-DL git:(master) ✗ python train/train_rnn.py
Loading data...
Loading data/train.txt...
0it [00:00, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.516 seconds.
Prefix dict has been built successfully.
180000it [00:11, 15736.50it/s]
Loading data/dev.txt...
10000it [00:00, 17419.34it/s]
Loading data/test.txt...
10000it [00:00, 17244.95it/s]
<class 'numpy.ndarray'> <class 'int'> (4000,)
Start training...
Traceback (most recent call last):
  File "/home/leelin/Project/nlp-hw2-DL/train/train_rnn.py", line 159, in <module>
    model.train(train_dataset=train_dataset, epoch=args.epochs)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 770, in train
    self._train(epoch,
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 87, in wrapper
    func(self, *args, **kwargs)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 540, in _train
    self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params, sink_size)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 590, in _train_dataset_sink_process
    dataset_helper, train_network = self._exec_preprocess(is_train=True,
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/model.py", line 391, in _exec_preprocess
    dataset_helper = DatasetHelper(dataset, dataset_sink_mode, sink_size, epoch_num)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/dataset_helper.py", line 294, in __init__
    self.iter = iterclass(dataset, sink_size, epoch_num)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/dataset_helper.py", line 465, in __init__
    super().__init__(dataset, sink_size, epoch_num)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/dataset_helper.py", line 387, in __init__
    _send_data(dataset, epoch_num)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/train/dataset_helper.py", line 32, in _send_data
    exec_dataset.send(epoch_num)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/validators.py", line 1873, in new_method
    return method(self, *args, **kwargs)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 3499, in send
    self._to_device = _ToDevice(self, num_epochs)
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 3389, in __init__
    ir_tree, self.api_tree = dataset.create_ir_tree()
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 310, in create_ir_tree
    ir_tree = dataset.parse_tree()
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 398, in parse_tree
    ir_children = [d.parse_tree() for d in self.children]
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 398, in <listcomp>
    ir_children = [d.parse_tree() for d in self.children]
  File "/home/leelin/anaconda3/envs/mindspore/lib/python3.9/site-packages/mindspore/dataset/engine/datasets.py", line 397, in parse_tree
    raise ValueError("The data pipeline is not a tree (i.e., one node has 2 consumers)")
ValueError: The data pipeline is not a tree (i.e., one node has 2 consumers)

Answer:

Hi, you forgot to assign the result back here. It should be changed to: train_dataset = mds.GeneratorDataset(...) followed by train_dataset = train_dataset.batch(...).
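A minimal sketch of the corrected pipeline, reusing the variable names from the question. The underlying point (as I understand it) is that mindspore.dataset operations such as batch() return a new dataset node rather than modifying the original in place; an unassigned batch() call stays attached to the GeneratorDataset as one consumer, and the pipeline Model.train builds on the same object becomes a second consumer, which is what parse_tree rejects:

train_dataset = mds.GeneratorDataset(train_gen,
                                     shuffle=True,
                                     column_names=['text', 'label'])
# batch() returns a new BatchDataset node; assign it back, otherwise the
# GeneratorDataset keeps a dangling BatchDataset as an extra consumer and
# the "not a tree" check fails when Model.train builds its own pipeline.
train_dataset = train_dataset.batch(batch_size=args.batch_size)

model = Model(rnn, critirition, opt)
model.train(train_dataset=train_dataset, epoch=args.epochs)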
