# Install nvitop
sudo apt install nvitop # install nvitop through apt
# Once the installation succeeds, launch the monitor directly
nvitop
上图为使用 `--colorful` 选项时的展示界面
nvitop -h # show the description of the command-line options
optional arguments:
--help, -h Show this help message and exit.
--version, -V Show nvitop's version number and exit.
--once, -1 Report query data only once.
--monitor [{auto,full,compact}], -m [{auto,full,compact}]
Run as a resource monitor. Continuously report query data and handle user inputs.
If the argument is omitted, the value from `NVITOP_MONITOR_MODE` will be used.
(default fallback mode: auto)
--interval SEC Process status update interval in seconds. (default: 2)
--ascii, --no-unicode, -U
Use ASCII characters only, which is useful for terminals without Unicode support.
coloring:
--colorful Use gradient colors to get spectrum-like bar charts. This option is only available
when the terminal supports 256 colors. You may need to set environment variable
`TERM="xterm-256color"`. Note that the terminal multiplexer, such as `tmux`, may
override the `TERM` variable.
--force-color Force colorize even when `stdout` is not a TTY terminal.
--light Tweak visual results for light theme terminals in monitor mode.
Set variable `NVITOP_MONITOR_MODE="light"` on light terminals for convenience.
--gpu-util-thresh th1 th2
Thresholds of GPU utilization to determine the load intensity.
Coloring rules: light < th1 % <= moderate < th2 % <= heavy.
( 1 <= th1 < th2 <= 99, defaults: 10 75 )
--mem-util-thresh th1 th2
Thresholds of GPU memory percent to determine the load intensity.
Coloring rules: light < th1 % <= moderate < th2 % <= heavy.
( 1 <= th1 < th2 <= 99, defaults: 10 80 )
device filtering:
--only idx [idx ...], -o idx [idx ...]
Only show the specified devices, suppress option `--only-visible`.
--only-visible, -ov Only show devices in the `CUDA_VISIBLE_DEVICES` environment variable.
process filtering:
--compute, -c Only show GPU processes with the compute context. (type: 'C' or 'C+G')
--only-compute, -C Only show GPU processes exactly with the compute context. (type: 'C' only)
--graphics, -g Only show GPU processes with the graphics context. (type: 'G' or 'C+G')
--only-graphics, -G Only show GPU processes exactly with the graphics context. (type: 'G' only)
--user [USERNAME [USERNAME ...]], -u [USERNAME [USERNAME ...]]
Only show processes of the given users (or `$USER` for no argument).
--pid PID [PID ...], -p PID [PID ...]
Only show processes of the given PIDs.
# If an ImportError reports that the nvidia-ml-py3 library is missing, try the following fix
pip3 freeze # list the currently installed packages
pip3 install --force-reinstall nvidia-ml-py==11.450.51
# If the error persists, try the following instead
pip3 uninstall nvidia-ml-py3 pynvml
pip3 install --force-reinstall nvidia-ml-py==11.450.51
# 将监控信息写入Tensorboard
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from nvitop import CudaDevice, ResourceMetricCollector
from nvitop.callbacks.tensorboard import add_scalar_dict
# Build networks and prepare datasets.
# The placeholder below stands for setup code that is expected to define the
# names used later in this script: `net`, `train`, `validate`,
# `train_dataset`, `validation_dataset` and `num_epoch`.
...

# Logger and status collector
writer = SummaryWriter()
collector = ResourceMetricCollector(
    devices=CudaDevice.all(),  # log all visible CUDA devices and use the CUDA ordinal
    root_pids={os.getpid()},   # only log the descendant processes of the current process
    interval=1.0,              # snapshot interval for background daemon thread
)

# Start training
global_step = 0
for epoch in range(num_epoch):
    # Everything inside this context is collected under the 'train' tag.
    with collector(tag='train'):
        for batch in train_dataset:
            # Nested context: collected under the 'batch' tag within 'train'.
            with collector(tag='batch'):
                metrics = train(net, batch)
                global_step += 1
                add_scalar_dict(writer, 'train', metrics, global_step=global_step)
                add_scalar_dict(writer, 'resources',  # tag='resources/train/batch/...'
                                collector.collect(),
                                global_step=global_step)

        # Logged once per epoch, after all batches, still inside the 'train' context.
        add_scalar_dict(writer, 'resources',  # tag='resources/train/...'
                        collector.collect(),
                        global_step=epoch)

    # Validation runs once per epoch, in its own 'validate' context.
    with collector(tag='validate'):
        metrics = validate(net, validation_dataset)
        add_scalar_dict(writer, 'validate', metrics, global_step=epoch)
        add_scalar_dict(writer, 'resources',  # tag='resources/validate/...'
                        collector.collect(),
                        global_step=epoch)