Training a DKT knowledge tracing model (LSTM) with MindSpore: no error is reported, but the run terminates right after Epoch 0

Problem Description

I am training a DKT (Deep Knowledge Tracing) model with MindSpore; the model is an LSTM. No error is reported, but after only one epoch (Epoch: 0) the process terminates immediately, showing "Process finished with exit code -1073741819 (0xC0000005)". The dataset is small, about 3,800 records. A screenshot of the run is below. I am running on CPU (AMD R7-5800H).

(screenshot: run output ending with "Process finished with exit code -1073741819 (0xC0000005)")

import os
import time
from argparse import Namespace

import numpy as np
from mindspore import load_param_into_net, load_checkpoint, save_checkpoint
from mindspore.dataset import GeneratorDataset
from mindspore.nn import Adam, BCELoss, PolynomialDecayLR
from tqdm import tqdm
from mindspore import ops
from KTScripts.BackModels import nll_loss
from KTScripts.DataLoader import KTDataset, RecDataset, RetrievalDataset
from KTScripts.PredictModel import ModelWithLoss, ModelWithLossMask, ModelWithOptimizer
from KTScripts.utils import set_random_seed, load_model, evaluate_utils
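# Editor's note (diagnostic aid, not part of the original script): with
# faulthandler enabled, Python prints a traceback when the process dies
# with an access violation (exit code 0xC0000005), which helps narrow
# down which call is crashing.
import faulthandler

faulthandler.enable()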


def main(args: Namespace):

    set_random_seed(args.rand_seed)
    dataset = RecDataset if args.forRec else (RetrievalDataset if args.retrieval else KTDataset)
    dataset = dataset(os.path.join(args.data_dir, args.dataset))
    args.feat_nums, args.user_nums = dataset.feats_num, dataset.users_num
    print(args)
    if args.retrieval:
        dataset = GeneratorDataset(source=dataset,
                                   column_names=['intra_x', 'inter_his', 'inter_r', 'y', 'mask', 'inter_len'],
                                   shuffle=False, num_parallel_workers=8, python_multiprocessing=False)
        dataset = dataset.batch(args.batch_size, num_parallel_workers=1)
        train_data, test_data = dataset.split([0.8, 0.2], randomize=False)
    else:
        dataset = GeneratorDataset(source=dataset, column_names=['x', 'y', 'mask'], shuffle=True)
        dataset = dataset.batch(args.batch_size, num_parallel_workers=1)
        train_data, test_data = dataset.split([0.8, 0.2], randomize=True)
    # for data_ in tqdm(dataset.create_tuple_iterator(), total=dataset.get_dataset_size()):
    #     ops.print_(data_)
    if args.forRec:
        args.output_size = args.feat_nums
    # Model
    model = load_model(args)
    model_path = os.path.join(args.save_dir, args.exp_name)
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    if args.load_model:
        load_param_into_net(model, load_checkpoint(f'{model_path}.ckpt'))
        print(f"Load Model From {model_path}")
    # Optimizer
    polynomial_decay_lr = PolynomialDecayLR(learning_rate=args.lr,
                                            end_learning_rate=1e-5,
                                            decay_steps=train_data.get_dataset_size() // 10 + 1,
                                            power=0.5,
                                            update_decay_steps=True)
    optimizer = Adam(model.trainable_params(), learning_rate=polynomial_decay_lr, weight_decay=args.l2_reg)
    if args.forRec:
        model_with_loss = ModelWithLossMask(model, nll_loss)
    else:
        model_with_loss = ModelWithLoss(model, BCELoss(reduction='mean'))
    model_train = ModelWithOptimizer(model_with_loss, optimizer, args.forRec)
    best_val_auc = 0
    train_total, test_total = train_data.get_dataset_size(), test_data.get_dataset_size()
    print('-' * 20 + "Training Start" + '-' * 20)
    for epoch in range(args.num_epochs):
        avg_time = 0
        model_train.set_train()
        for i, data in tqdm(enumerate(train_data.create_tuple_iterator()), total=train_total):
            t0 = time.perf_counter()
            loss, output_data = model_train(*data)
            # print(data)
            loss = loss.asnumpy()
            acc, auc = evaluate_utils(*output_data)
            avg_time += time.perf_counter() - t0
            print('Epoch:{}\tbatch:{}\tavg_time:{:.4f}\tloss:{:.4f}\tacc:{:.4f}\tauc:{:.4f}'
                  .format(epoch, i, avg_time / (i + 1), loss, acc, auc))
        print('-' * 20 + "Validating Start" + '-' * 20)
        val_eval = [[], []]
        loss_total, data_total = 0, 0
        model_with_loss.set_train(False)
        for data in tqdm(test_data.create_tuple_iterator(), total=test_total):
            loss, output_data = model_with_loss.output(*data)
            val_eval[0].append(output_data[0].asnumpy())
            val_eval[1].append(output_data[1].asnumpy())
            loss_total += loss.asnumpy() * len(data[0])
            data_total += len(data[0])
        val_eval = [np.concatenate(_) for _ in val_eval]
        acc, auc = evaluate_utils(*val_eval)
        print(f"Validating loss:{loss_total / data_total:.4f} acc:{acc:.4f} auc:{auc:.4f}")
        if auc >= best_val_auc:
            best_val_auc = auc
            save_checkpoint(model, model_path)
            print("New best result Saved!")
        print(f"Best Auc Now:{best_val_auc:.4f}")

    print('-' * 20 + "Testing Start" + '-' * 20)
    val_eval = [[], []]
    loss_total, data_total = 0, 0
    model_with_loss.set_train(False)
    for data in tqdm(test_data.create_tuple_iterator(), total=test_total):
        loss, output_data = model_with_loss.output(*data)
        val_eval[0].append(output_data[0].asnumpy())
        val_eval[1].append(output_data[1].asnumpy())
        loss_total += loss.asnumpy() * len(data[0])
        data_total += len(data[0])
    val_eval = [np.concatenate(_) for _ in val_eval]
    print(val_eval[0], val_eval[0].mean())
    print(val_eval[1], val_eval[1].mean())
    acc, auc = evaluate_utils(*val_eval)
    print(f"Testing loss:{loss_total / data_total:.4f} acc:{acc:.4f} auc:{auc:.4f}")


if __name__ == '__main__':
    from argparse import ArgumentParser
    from KTScripts.options import get_options

    parser = ArgumentParser("LearningPath-Planing")
    args_ = get_options(parser)
    # print(args_)
    main(args_)

Solution

This is not an error at the MindSpore framework level. Exit code -1073741819 (0xC0000005) is the Windows STATUS_ACCESS_VIOLATION status: the process crashed inside native code rather than raising a Python exception. For general troubleshooting of this exit code, see https://blog.csdn.net/qiao1025566574/article/details/81037908
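As a first troubleshooting step, the sketch below (a suggestion under an assumption, not a confirmed fix: it supposes the crash originates in the data pipeline or in graph execution, two common sources of access violations with MindSpore on Windows) forces PyNative mode and single-threaded data loading so that a failing operator surfaces as a Python traceback instead of a hard crash. It mirrors the non-retrieval branch of the script above:

from mindspore import context
from mindspore.dataset import GeneratorDataset

# Run operators eagerly on CPU so a failing call raises a Python error
# instead of crashing inside a compiled graph.
context.set_context(mode=context.PYNATIVE_MODE, device_target='CPU')

# Single-threaded loading: multi-worker GeneratorDataset pipelines are a
# frequent source of native crashes on Windows.
dataset = GeneratorDataset(source=dataset, column_names=['x', 'y', 'mask'],
                           shuffle=True, num_parallel_workers=1,
                           python_multiprocessing=False)

If the crash disappears with num_parallel_workers=1, the data pipeline is the culprit; if it persists, the PyNative traceback should point at the failing operator.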
