关键词:python、epoch、loss、log
🤖: 记录模型的训练过程的步骤如下:
- 导入logging模块。
- 配置日志记录器,设置日志文件名、日志级别、日志格式等。
- 在每个epoch结束时,使用logging模块记录性能指标、损失值、准确率等信息。
- 在训练过程中,记录其他重要信息,比如学习率的变化、每个batch的损失值等。
- 日志记录的信息可以帮助你更好地理解模型的训练过程,以及在后续分析和调试中提供有用的信息。
一、定义logger类
1. util里定义Logger
class Logger(object):
    """
    Write console output to an external text file as well.

    Code imported from
    https://github.com/Cysu/open-reid/blob/master/reid/utils/logging.py.

    Parameters
    ----------
    fpath : str or None
        Path of the log file. When None, messages go to the console only.
    """

    def __init__(self, fpath=None):
        # Keep a reference to the real stdout so console output still works.
        self.console = sys.stdout
        self.file = None
        if fpath is not None:
            # Append mode so repeated runs do not clobber earlier logs.
            self.file = open(fpath, 'a')

    def __del__(self):
        # Best-effort cleanup; never raise from a destructor
        # (attributes may already be gone during interpreter teardown).
        try:
            self.close()
        except Exception:
            pass

    def __enter__(self):
        # BUG FIX: the original returned None (bare `pass`), so
        # `with Logger(path) as logger:` bound logger to None.
        return self

    def __exit__(self, *args):
        self.close()

    def write(self, msg):
        # Mirror every message to the console and (if open) the file.
        # A newline is appended, so callers pass bare messages.
        self.console.write(msg + '\n')
        if self.file is not None:
            self.file.write(msg + '\n')

    def flush(self):
        self.console.flush()
        if self.file is not None:
            self.file.flush()
            # fsync pushes buffered bytes to disk so the log survives a
            # crash mid-training.
            os.fsync(self.file.fileno())

    def close(self):
        # BUG FIX: the original called self.console.close(), which closes
        # sys.stdout and breaks all subsequent printing in the process.
        # Only the log file belongs to this object; close just that.
        if self.file is not None:
            self.file.close()
            self.file = None
2. train里调用
# Build the log directory path from the run configuration
# (presumably args.city / args.tinterval identify the experiment — confirm).
log_path = pjoin('./result', 'train', args.city, f'{args.tinterval}')
# util.Logger mirrors every write to the console and to the given file.
logger = util.Logger(pjoin(log_path, 'test.log'))
logger.write(f'\nTesting configs: {args}')
# use tensorboard to draw the curves.
train_writer = SummaryWriter(pjoin('./result', 'train', args.city, f'{args.tinterval}'))
val_writer = SummaryWriter(pjoin('./result', 'val', args.city, f'{args.tinterval}'))
# NOTE(review): the next line is tutorial pseudo-code — the full-width
# quotes around the placeholder text are not valid Python string quotes.
logger.write(“文本提示”)
logger.write("start training...")
- best id
- loss
{变量: 格式d/f}
- 占位符:03d是一个格式化字符串,其中的0表示用0来填充空位,3表示总共占据3位,d表示这是一个十进制整数。因此,当i的值小于100时,会用0来填充,确保输出的字符串总共占据3位。
# Log metrics every `print_every` epochs; `i` is the epoch index.
if i%args.print_every == 0:
    # :03d zero-pads the epoch index to 3 digits; :.2f keeps two decimals.
    logger.write(f'Epoch: {i:03d}, MAE: {mtrain_mae:.2f}, RMSE: {mtrain_rmse:.2f}, MAPE: {mtrain_mape:.2f}, Valid MAE: {mvalid_mae:.2f}, RMSE: {mvalid_rmse:.2f}, MAPE: {mvalid_mape:.2f}')
# Checkpoint named by epoch and validation MAE, e.g. "..._epoch_5_1.23.pth".
torch.save(engine.model.state_dict(), save_path+"_epoch_"+str(i)+"_"+str(round(mvalid_mae,2))+".pth")
logger.write("Average Training Time: {:.4f} secs/epoch".format(np.mean(train_time)))
# Reload the checkpoint with the lowest validation loss.
# NOTE(review): file name uses bestid+1, so his_loss is presumably
# indexed from epoch 1 — confirm against the saving loop above.
bestid = np.argmin(his_loss)
engine.model.load_state_dict(torch.load(save_path+"_epoch_"+str(bestid+1)+"_"+str(round(his_loss[bestid],2))+".pth"))
logger.write("Training finished")
logger.write(f"The valid loss on best model is {str(round(his_loss[bestid],4))}")
二、自定义print log
def print_log(*values, log=None, end="\n"):
    """Print *values* to stdout and optionally record them in a log.

    Parameters
    ----------
    *values
        Objects to print, exactly as accepted by built-in ``print``.
    log : str, file-like, or None
        When a string, it is treated as a file path opened in append
        mode for this one write and then closed (the original opened
        the file and leaked the handle on every call). When file-like,
        it is written to and flushed but left open for the caller.
        When None/falsy, only the console is used.
    end : str
        Line terminator passed through to ``print``.
    """
    print(*values, end=end)
    if log:
        if isinstance(log, str):
            # BUG FIX: use a context manager so the handle is closed
            # instead of leaking one file descriptor per call.
            with open(log, "a") as f:
                print(*values, file=f, end=end)
        else:
            print(*values, file=log, end=end)
            log.flush()
1. 初始化日志文件
- 记录时间
- 保存路径
- 文件名称
# ------------------------------- make log file ------------------------------ #
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
log_path = f"../logs/"
if not os.path.exists(log_path):
os.makedirs(log_path)
log = os.path.join(log_path, f"{model_name}-{dataset}-{now}.log")
log = open(log, "a")
log.seek(0)
log.truncate()
2. 模型记录epoch
# --------------------------- train and test model --------------------------- #
print_log(f"Loss: {criterion._get_name()}", log=log)
print_log(log=log)
model = train(
model,
trainset_loader,
valset_loader,
optimizer,
scheduler,
criterion,
clip_grad=cfg.get("clip_grad"),
max_epochs=cfg.get("max_epochs", 200),
early_stop=cfg.get("early_stop", 10),
verbose=1,
log=log,
save=save,
)
print_log(f"Saved Model: {save}", log=log)
test_model(model, testset_loader, log=log)
log.close()
3. 在每次调用的模型函数(train\test)里面保存需要的内容
train(arg, log = log)
将定义的log传入模型训练函数
# Summarize the best epoch when early stopping fires; epoch indices are
# 0-based internally, hence the +1 for human-readable output.
out_str = f"Early stopping at epoch: {epoch+1}\n"
out_str += f"Best at epoch {best_epoch+1}:\n"
# %.5f keeps five decimal places for each metric.
out_str += "Train Loss = %.5f\n" % train_loss_list[best_epoch]
out_str += "Train RMSE = %.5f, MAE = %.5f, MAPE = %.5f\n" % (
    train_rmse,
    train_mae,
    train_mape,
)
out_str += "Val Loss = %.5f\n" % val_loss_list[best_epoch]
out_str += "Val RMSE = %.5f, MAE = %.5f, MAPE = %.5f" % (
    val_rmse,
    val_mae,
    val_mape,
)
# Emit the whole summary as one log entry.
print_log(out_str, log=log)
print_log(需要保存的值, log = 定义的log)
4. log内容编辑
- f-strings 是指以f或F 开头的字符串,其中以 {}包含的表达式会进行值替换
- 在字符串前加r可防止字符串转义
- f-string 写法:在字符串中用 `{属性值}` 做占位、用 `\n` 换行,例如 `f"文本提示字符串: {属性值}\n"`。
- 旧式 % 格式化使用 `%d`(十进制整数)、`%f`(浮点数)等占位符。
# Per-horizon metrics: one line per prediction step.
# Assumes y_true/y_pred are (batch, step, node)-shaped arrays — TODO confirm.
for i in range(out_steps):
    rmse, mae, mape = RMSE_MAE_MAPE(y_true[:, i, :], y_pred[:, i, :])
    # %d → integer step index (1-based); %.5f → five-decimal float.
    out_str += "Step %d RMSE = %.5f, MAE = %.5f, MAPE = %.5f\n" % (
        i + 1,
        rmse,
        mae,
        mape,
    )
5. 效果展示
三、 自定义logger
1. 函数定义
def get_logger(config, name=None):
    """Create a logger that writes to both a log file and stdout.

    Parameters
    ----------
    config : mapping
        Must provide 'exp_id', 'model' and 'dataset' (used in the log
        file name); may provide 'log_level' (case-insensitive level
        name, default 'INFO'; unknown names fall back to INFO).
    name : str or None
        Name passed to ``logging.getLogger``; None returns the root logger.

    Returns
    -------
    logging.Logger
        Configured logger with one file handler and one stdout handler.
    """
    log_dir = './libcity/log'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    log_filename = '{}-{}-{}-{}.log'.format(config['exp_id'],
                                            config['model'], config['dataset'], get_local_time())
    logfilepath = os.path.join(log_dir, log_filename)

    logger = logging.getLogger(name)
    # Same name->level mapping as the original if/elif chain, with the
    # same INFO fallback for unrecognized values.
    level_map = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL,
    }
    log_level = config.get('log_level', 'INFO')
    logger.setLevel(level_map.get(log_level.lower(), logging.INFO))

    # One formatter is enough — the original built two identical ones.
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # BUG FIX: the original attached new handlers on every call, so
    # calling get_logger twice duplicated every log line. Attach only
    # when this logger has no handlers yet.
    if not logger.handlers:
        file_handler = logging.FileHandler(logfilepath)
        file_handler.setFormatter(formatter)
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        logger.addHandler(console_handler)

    logger.info('Log directory: %s', log_dir)
    return logger
2. 函数调用
logger = get_logger(config)
logger.info('Begin pipeline, task={}, model_name={}, dataset_name={}, exp_id={}'.
format(str(task), str(model_name), str(dataset_name), str(exp_id)))
logger.info(config.config)
best_trial = result.get_best_trial("loss", "min", "last")
logger.info("Best trial config: {}".format(best_trial.config))
logger.info("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
四、自定义log_string()
1. utils.py
# log string
def log_string(log, string):
    """Echo *string* to the console and append it to the open log file.

    Parameters
    ----------
    log : file-like
        An open, writable log file object.
    string : str
        Message to record; a trailing newline is added in the file.
    """
    print(string)
    log.write(string + '\n')
    log.flush()
2. main.py
import上面的utils
- 定义路径
- 打开log
- 写入log
# Register the log file path as a CLI option, then open it for writing.
parser.add_argument('--log_file', default='./data/log',
                    help='log file')
args = parser.parse_args()
# BUG FIX: the original wrote open(log_file, 'w'), but `log_file` is
# undefined at this point — the parsed value lives on the args namespace.
log = open(args.log_file, 'w')
# load data
log_string(log, 'loading data...')
- 记入时间:`%.1fmin` 表示保留一位小数的浮点数,后接分钟单位 min;通过 % 运算符把变量值代入字符串。
- 或者在字符串里使用 `{:}` 占位符,再调用 `.format(变量)` 填充。
# Script entry point: train, plot loss curves, test, then log total time.
if __name__ == '__main__':
    start = time.time()
    loss_train, loss_val = train(model, args, log, loss_criterion, optimizer, scheduler)
    plot_train_val_loss(loss_train, loss_val, 'figure/train_val_loss.png')
    trainPred, valPred, testPred = test(args, log)
    end = time.time()
    # %.1f → one decimal place; elapsed seconds converted to minutes.
    log_string(log, 'total time: %.1fmin' % ((end - start) / 60))