"batch_size": 4,  # key: the largest value that is safe under the VRAM limit
"input_shape": [256, 256],  # lowers the compute load
I could not find these two lines in my opt file. Based on the full contents of my opt file below, please point out in detail every place that needs to be modified (see the sketch after the file):
import torch.optim as optim
from run_utils.callbacks.base import (
    AccumulateRawOutput,
    PeriodicSaver,
    ProcessAccumulatedRawOutput,
    ScalarMovingAverage,
    ScheduleLr,
    TrackLr,
    VisualizeOutput,
    TriggerEngine,
)
from run_utils.callbacks.logging import LoggingEpochOutput, LoggingGradient
from run_utils.engine import Events
from .targets import gen_targets, prep_sample
from .net_desc import create_model
from .run_desc import proc_valid_step_output, train_step, valid_step, viz_step_output

# TODO: training config only?
# TODO: switch all options to function-name strings
def get_config(nr_type, mode):
    return {
        # ------------------------------------------------------------------
        # ! All phases have the same number of run engines
        # phases are run sequentially from index 0 to N
        "phase_list": [
            {
                "run_info": {
                    # may need to be more dynamic for each network
                    "net": {
                        "desc": lambda: create_model(
                            input_ch=3, nr_types=nr_type,
                            freeze=True, mode=mode
                        ),
                        "optimizer": [
                            optim.Adam,
                            {  # should match the keyword parameters of the optimizer
                                "lr": 1.0e-4,  # initial learning rate
                                "betas": (0.9, 0.999),
                            },
                        ],
                        # learning rate scheduler
                        "lr_scheduler": lambda x: optim.lr_scheduler.StepLR(x, 25),
                        "extra_info": {
                            "loss": {
                                "np": {"bce": 1, "dice": 1},
                                "hv": {"mse": 1, "msge": 1},
                                "tp": {"bce": 1, "dice": 1},
                            },
                        },
                        # path to load; -1 to auto-load the checkpoint from the
                        # previous phase, None to start from scratch
                        "pretrained": "../pretrained/ImageNet-ResNet50-Preact_pytorch.tar",
                        # 'pretrained': None,
                    },
                },
                "target_info": {"gen": (gen_targets, {}), "viz": (prep_sample, {})},
                "batch_size": {"train": 16, "valid": 16},  # engine name : value
                "nr_epochs": 50,
            },
            {
                "run_info": {
                    # may need to be more dynamic for each network
                    "net": {
                        "desc": lambda: create_model(
                            input_ch=3, nr_types=nr_type,
                            freeze=False, mode=mode
                        ),
                        "optimizer": [
                            optim.Adam,
                            {  # should match the keyword parameters of the optimizer
                                "lr": 1.0e-4,  # initial learning rate
                                "betas": (0.9, 0.999),
                            },
                        ],
                        # learning rate scheduler
                        "lr_scheduler": lambda x: optim.lr_scheduler.StepLR(x, 25),
                        "extra_info": {
                            "loss": {
                                "np": {"bce": 1, "dice": 1},
                                "hv": {"mse": 1, "msge": 1},
                                "tp": {"bce": 1, "dice": 1},
                            },
                        },
                        # path to load; -1 to auto-load the checkpoint from the
                        # previous phase, None to start from scratch
                        "pretrained": -1,
                    },
                },
                "target_info": {"gen": (gen_targets, {}), "viz": (prep_sample, {})},
                "batch_size": {"train": 4, "valid": 8},  # batch size per GPU
                "nr_epochs": 50,
            },
        ],
        # ------------------------------------------------------------------
        # TODO: dynamically handle dataset plugin selection and processing too?
        # all enclosed engines share the same neural network
        # as the one at the outer level calling them
        "run_engine": {
            "train": {
                # TODO: align here, file path or what? what about CV?
                "dataset": "",  # what about a compound dataset?
                "nr_procs": 16,  # number of threads for the dataloader
                "run_step": train_step,  # TODO: function name or function variable?
                "reset_per_run": False,
                # callbacks are run according to the list order of the event
                "callbacks": {
                    Events.STEP_COMPLETED: [
                        # LoggingGradient(),  # TODO: very slow, may be due to tensor/numpy round-trips?
                        ScalarMovingAverage(),
                    ],
                    Events.EPOCH_COMPLETED: [
                        TrackLr(),
                        PeriodicSaver(),
                        VisualizeOutput(viz_step_output),
                        LoggingEpochOutput(),
                        TriggerEngine("valid"),
                        ScheduleLr(),
                    ],
                },
            },
            "valid": {
                "dataset": "",  # what about a compound dataset?
                "nr_procs": 8,  # number of threads for the dataloader
                "run_step": valid_step,
                "reset_per_run": True,  # * to stop aggregating output etc. from the last run
                # callbacks are run according to the list order of the event
                "callbacks": {
                    Events.STEP_COMPLETED: [AccumulateRawOutput()],
                    Events.EPOCH_COMPLETED: [
                        # TODO: is there a way to preload these?
                        ProcessAccumulatedRawOutput(
                            lambda a: proc_valid_step_output(a, nr_types=nr_type)
                        ),
                        LoggingEpochOutput(),
                    ],
                },
            },
        },
    }