通过XML可以直观地解析GPU信息与进程信息

1 篇文章 0 订阅

在Linux中运行 nvidia-smi -q -x 命令可以直接输出XML格式的文本,通过XML可以直观地解析GPU信息与进程信息。下面是一份示例输出:

<?xml version="1.0" ?>
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v10.dtd">
<nvidia_smi_log>
        <timestamp>Wed Jun  9 16:52:52 2021</timestamp>
        <driver_version>410.48</driver_version>
        <attached_gpus>1</attached_gpus>
        <gpu id="00000000:02:00.0">
                <product_name>GeForce GTX 1080 Ti</product_name>
                <product_brand>GeForce</product_brand>
                <display_mode>Disabled</display_mode>
                <display_active>Disabled</display_active>
                <persistence_mode>Disabled</persistence_mode>
                <accounting_mode>Disabled</accounting_mode>
                <accounting_mode_buffer_size>4000</accounting_mode_buffer_size>
                <driver_model>
                        <current_dm>N/A</current_dm>
                        <pending_dm>N/A</pending_dm>
                </driver_model>
                <serial>0330617032375</serial>
                <uuid>GPU-32c5cd8b-a5f4-538a-13a4-91cbd9410f3e</uuid>
                <minor_number>0</minor_number>
                <vbios_version>86.02.39.00.01</vbios_version>
                <multigpu_board>No</multigpu_board>
                <board_id>0x200</board_id>
                <gpu_part_number>900-1G611-2550-000</gpu_part_number>
                <inforom_version>
                        <img_version>G001.0000.01.04</img_version>
                        <oem_object>1.1</oem_object>
                        <ecc_object>N/A</ecc_object>
                        <pwr_object>N/A</pwr_object>
                </inforom_version>
                <gpu_operation_mode>
                        <current_gom>N/A</current_gom>
                        <pending_gom>N/A</pending_gom>
                </gpu_operation_mode>
                <gpu_virtualization_mode>
                        <virtualization_mode>None</virtualization_mode>
                </gpu_virtualization_mode>
                <ibmnpu>
                        <relaxed_ordering_mode>N/A</relaxed_ordering_mode>
                </ibmnpu>
                <pci>
                        <pci_bus>02</pci_bus>
                        <pci_device>00</pci_device>
                        <pci_domain>0000</pci_domain>
                        <pci_device_id>1B0610DE</pci_device_id>
                        <pci_bus_id>00000000:02:00.0</pci_bus_id>
                        <pci_sub_system_id>120F10DE</pci_sub_system_id>
                        <pci_gpu_link_info>
                                <pcie_gen>
                                        <max_link_gen>3</max_link_gen>
                                        <current_link_gen>1</current_link_gen>
                                </pcie_gen>
                                <link_widths>
                                        <max_link_width>16x</max_link_width>
                                        <current_link_width>16x</current_link_width>
                                </link_widths>
                        </pci_gpu_link_info>
                        <pci_bridge_chip>
                                <bridge_chip_type>N/A</bridge_chip_type>
                                <bridge_chip_fw>N/A</bridge_chip_fw>
                        </pci_bridge_chip>
                        <replay_counter>0</replay_counter>
                        <tx_util>0 KB/s</tx_util>
                        <rx_util>0 KB/s</rx_util>
                </pci>
                <fan_speed>26 %</fan_speed>
                <performance_state>P8</performance_state>
                <clocks_throttle_reasons>
                        <clocks_throttle_reason_gpu_idle>Active</clocks_throttle_reason_gpu_idle>
                        <clocks_throttle_reason_applications_clocks_setting>Not Active</clocks_throttle_reason_applications_clocks_setting>
                        <clocks_throttle_reason_sw_power_cap>Not Active</clocks_throttle_reason_sw_power_cap>
                        <clocks_throttle_reason_hw_slowdown>Not Active</clocks_throttle_reason_hw_slowdown>
                        <clocks_throttle_reason_hw_thermal_slowdown>Not Active</clocks_throttle_reason_hw_thermal_slowdown>
                        <clocks_throttle_reason_hw_power_brake_slowdown>Not Active</clocks_throttle_reason_hw_power_brake_slowdown>
                        <clocks_throttle_reason_sync_boost>Not Active</clocks_throttle_reason_sync_boost>
                        <clocks_throttle_reason_sw_thermal_slowdown>Not Active</clocks_throttle_reason_sw_thermal_slowdown>
                        <clocks_throttle_reason_display_clocks_setting>Not Active</clocks_throttle_reason_display_clocks_setting>
                </clocks_throttle_reasons>
                <fb_memory_usage>
                        <total>11177 MiB</total>
                        <used>1945 MiB</used>
                        <free>9232 MiB</free>
                </fb_memory_usage>
                <bar1_memory_usage>
                        <total>256 MiB</total>
                        <used>2 MiB</used>
                        <free>254 MiB</free>
                </bar1_memory_usage>
                <compute_mode>Default</compute_mode>
                <utilization>
                        <gpu_util>0 %</gpu_util>
                        <memory_util>0 %</memory_util>
                        <encoder_util>0 %</encoder_util>
                        <decoder_util>0 %</decoder_util>
                </utilization>
                <encoder_stats>
                        <session_count>0</session_count>
                        <average_fps>0</average_fps>
                        <average_latency>0</average_latency>
                </encoder_stats>
                <fbc_stats>
                        <session_count>0</session_count>
                        <average_fps>0</average_fps>
                        <average_latency>0</average_latency>
                </fbc_stats>
                <ecc_mode>
                        <current_ecc>N/A</current_ecc>
                        <pending_ecc>N/A</pending_ecc>
                </ecc_mode>
                <ecc_errors>
                        <volatile>
                                <single_bit>
                                        <device_memory>N/A</device_memory>
                                        <register_file>N/A</register_file>
                                        <l1_cache>N/A</l1_cache>
                                        <l2_cache>N/A</l2_cache>
                                        <texture_memory>N/A</texture_memory>
                                        <texture_shm>N/A</texture_shm>
                                        <cbu>N/A</cbu>
                                        <total>N/A</total>
                                </single_bit>
                                <double_bit>
                                        <device_memory>N/A</device_memory>
                                        <register_file>N/A</register_file>
                                        <l1_cache>N/A</l1_cache>
                                        <l2_cache>N/A</l2_cache>
                                        <texture_memory>N/A</texture_memory>
                                        <texture_shm>N/A</texture_shm>
                                        <cbu>N/A</cbu>
                                        <total>N/A</total>
                                </double_bit>
                        </volatile>
                        <aggregate>
                                <single_bit>
                                        <device_memory>N/A</device_memory>
                                        <register_file>N/A</register_file>
                                        <l1_cache>N/A</l1_cache>
                                        <l2_cache>N/A</l2_cache>
                                        <texture_memory>N/A</texture_memory>
                                        <texture_shm>N/A</texture_shm>
                                        <cbu>N/A</cbu>
                                        <total>N/A</total>
                                </single_bit>
                                <double_bit>
                                        <device_memory>N/A</device_memory>
                                        <register_file>N/A</register_file>
                                        <l1_cache>N/A</l1_cache>
                                        <l2_cache>N/A</l2_cache>
                                        <texture_memory>N/A</texture_memory>
                                        <texture_shm>N/A</texture_shm>
                                        <cbu>N/A</cbu>
                                        <total>N/A</total>
                                </double_bit>
                        </aggregate>
                </ecc_errors>
                <retired_pages>
                        <multiple_single_bit_retirement>
                                <retired_count>N/A</retired_count>
                                <retired_pagelist>N/A</retired_pagelist>
                        </multiple_single_bit_retirement>
                        <double_bit_retirement>
                                <retired_count>N/A</retired_count>
                                <retired_pagelist>N/A</retired_pagelist>
                        </double_bit_retirement>
                        <pending_retirement>N/A</pending_retirement>
                </retired_pages>
                <temperature>
                        <gpu_temp>37 C</gpu_temp>
                        <gpu_temp_max_threshold>96 C</gpu_temp_max_threshold>
                        <gpu_temp_slow_threshold>93 C</gpu_temp_slow_threshold>
                        <gpu_temp_max_gpu_threshold>N/A</gpu_temp_max_gpu_threshold>
                        <memory_temp>N/A</memory_temp>
                        <gpu_temp_max_mem_threshold>N/A</gpu_temp_max_mem_threshold>
                </temperature>
                <power_readings>
                        <power_state>P8</power_state>
                        <power_management>Supported</power_management>
                        <power_draw>10.56 W</power_draw>
                        <power_limit>250.00 W</power_limit>
                        <default_power_limit>250.00 W</default_power_limit>
                        <enforced_power_limit>250.00 W</enforced_power_limit>
                        <min_power_limit>125.00 W</min_power_limit>
                        <max_power_limit>300.00 W</max_power_limit>
                </power_readings>
                <clocks>
                        <graphics_clock>139 MHz</graphics_clock>
                        <sm_clock>139 MHz</sm_clock>
                        <mem_clock>405 MHz</mem_clock>
                        <video_clock>544 MHz</video_clock>
                </clocks>
                <applications_clocks>
                        <graphics_clock>N/A</graphics_clock>
                        <mem_clock>N/A</mem_clock>
                </applications_clocks>
                <default_applications_clocks>
                        <graphics_clock>N/A</graphics_clock>
                        <mem_clock>N/A</mem_clock>
                </default_applications_clocks>
                <max_clocks>
                        <graphics_clock>1911 MHz</graphics_clock>
                        <sm_clock>1911 MHz</sm_clock>
                        <mem_clock>5505 MHz</mem_clock>
                        <video_clock>1620 MHz</video_clock>
                </max_clocks>
                <max_customer_boost_clocks>
                        <graphics_clock>N/A</graphics_clock>
                </max_customer_boost_clocks>
                <clock_policy>
                        <auto_boost>N/A</auto_boost>
                        <auto_boost_default>N/A</auto_boost_default>
                </clock_policy>
                <supported_clocks>N/A</supported_clocks>
                <processes>
                        <process_info>
                                <pid>10500</pid>
                                <type>C</type>
                                <process_name>/home/anaconda/anaconda3/bin/python</process_name>
                                <used_memory>1935 MiB</used_memory>
                        </process_info>
                </processes>
                <accounted_processes>
                </accounted_processes>
        </gpu>

</nvidia_smi_log>

使用Java解析XML

    import com.jcraft.jsch.Session;
    import org.dom4j.Document;
    import org.dom4j.DocumentException;
    import org.dom4j.DocumentHelper;
    import org.dom4j.Element;
    
   
    /**
     * Collects GPU information from the remote host (currently only {@code nvidia-smi} is supported).
     *
     * <p>Opens an SSH session, runs {@code nvidia-smi -q -x}, removes the DOCTYPE declaration
     * from the output and maps every {@code <gpu>} element to a {@link GPUInfo}. The SSH session
     * is always disconnected before parsing starts.
     *
     * @return one {@link GPUInfo} per {@code <gpu>} element in the report
     * @throws DocumentException if the command output cannot be parsed as XML
     */
    private List<GPUInfo> convertXmlToGpuObject() throws DocumentException {
        // Connect to the server and run "nvidia-smi -q -x" to obtain the XML report.
        Session connect = RemoteServerUtil.connect(ip, username, password);
        String xmlGpu;
        try {
            xmlGpu = RemoteServerUtil.execCommand(connect, "nvidia-smi -q -x");
        } finally {
            // The session is only needed for this one command; release it even on failure.
            connect.disconnect();
        }

        // Ignore the DTD: drop the whole DOCTYPE declaration regardless of the DTD file name.
        xmlGpu = xmlGpu.replaceAll("<!DOCTYPE[^>]*>", "");
        Document document = DocumentHelper.parseText(xmlGpu);

        List<Element> gpuElements = document.getRootElement().elements("gpu");
        List<GPUInfo> gpuInfoList = new ArrayList<>();
        for (Element gpuElement : gpuElements) {
            // The GPU's UUID is used as its name.
            String uuid = gpuElement.element("uuid").getText();

            // Frame-buffer memory figures, kept as reported (value plus unit, e.g. "11177 MiB").
            Element fbMemoryUsage = gpuElement.element("fb_memory_usage");
            String total = fbMemoryUsage.element("total").getText();
            String used = fbMemoryUsage.element("used").getText();
            String free = fbMemoryUsage.element("free").getText();

            // Usage rate: the numeric part precedes the unit, separated by whitespace.
            int intTotal = Integer.parseInt(total.trim().split("\\s+")[0]);
            int intUsed = Integer.parseInt(used.trim().split("\\s+")[0]);
            // Guard against a zero total, which would otherwise produce NaN/Infinity.
            int usageRate = intTotal > 0 ? (int) ((float) intUsed / intTotal * 100) : 0;

            GPUInfo gpuInfo = new GPUInfo();
            gpuInfo.setName(uuid);
            gpuInfo.setTotalMemory(total);
            gpuInfo.setUsedMemory(used);
            gpuInfo.setFreeMemory(free);
            gpuInfo.setUsageRate(usageRate);
            gpuInfo.setProcessInfos(readProcessInfos(gpuElement));
            gpuInfoList.add(gpuInfo);
        }
        return gpuInfoList;
    }

    /**
     * Maps the {@code <process_info>} children of a GPU's {@code <processes>} element.
     *
     * @param gpuElement a {@code <gpu>} element of the report
     * @return the processes listed for that GPU; empty when there is no {@code <processes>} element
     */
    private List<ProcessInfo> readProcessInfos(Element gpuElement) {
        List<ProcessInfo> processInfos = new ArrayList<>();
        Element processes = gpuElement.element("processes");
        if (processes == null) {
            return processInfos;
        }
        List<Element> infos = processes.elements("process_info");
        for (Element info : infos) {
            ProcessInfo processInfo = new ProcessInfo();
            processInfo.setPid(info.element("pid").getText());
            processInfo.setName(info.element("process_name").getText());
            processInfo.setUsedMemory(info.element("used_memory").getText());
            processInfos.add(processInfo);
        }
        return processInfos;
    }

GPU基本信息实体类

import lombok.Data;
import java.util.List;

/**
 * Basic information about a single GPU.
 *
 * <p>Plain data holder; accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class GPUInfo {
    // Name
    private String name;
    // Total memory
    private String totalMemory;
    // Used memory
    private String usedMemory;
    // Free memory
    private String freeMemory;
    // Usage rate (integer, at most 100)
    private int usageRate;
    // Process information
    private List<ProcessInfo> processInfos;
}

进程信息实体类

import lombok.Data;

/**
 * Information about a single process attached to a GPU.
 *
 * <p>Plain data holder; accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class ProcessInfo {
    // Process id, kept as text
    private String pid;
    // Process name
    private String name;
    // Memory used by the process, kept as text
    private String usedMemory;
}

连接服务器工具类

import com.jcraft.jsch.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/**
 * SSH helper built on JSch: opens sessions, runs shell commands on the remote host and
 * summarizes {@code df}-style disk usage output.
 */
public class RemoteServerUtil {

    private static final Logger logger = LoggerFactory.getLogger(RemoteServerUtil.class);

//    public static final String CPU_MEM_SHELL = "top -b -n 1";
    public static final String CPU_MEM_SHELL = "cat /proc/cpuinfo";
//    public static final String FILES_SHELL = "df -hl";
//    public static final String[] COMMANDS = {CPU_MEM_SHELL, FILES_SHELL};
    public static final String[] COMMANDS = {CPU_MEM_SHELL};
    public static final String LINE_SEPARATOR = System.getProperty("line.separator");

    /** Default SSH port used when the caller does not supply one. */
    private static final int DEFAULT_SSH_PORT = 22;

    /** How long the timed {@code execCommand} sleeps between two polls of the channel. */
    private static final long POLL_INTERVAL_MILLIS = 200L;

    /**
     * Opens an SSH session to the given host on port 22 using password authentication.
     *
     * @param ip       host name or IP address of the server
     * @param username login user
     * @param password login password
     * @return the connected session; the caller is responsible for disconnecting it
     * @throws RuntimeException if the connection cannot be established (the JSch error is the cause)
     */
    public static Session connect(String ip, String username, String password) {
        return openSession(ip, DEFAULT_SSH_PORT, username, password, "连接失败...");
    }

    /**
     * Opens an SSH session to the given host and port using the built-in account.
     *
     * @param ip   host name or IP address of the server
     * @param port SSH port
     * @return the connected session; the caller is responsible for disconnecting it
     * @throws RuntimeException if the connection cannot be established (the JSch error is the cause)
     */
    public static Session connect(String ip, Integer port) {
        // NOTE(review): the account and password are hard-coded here; they should come from
        // configuration or a secret store. Kept unchanged so existing callers keep working.
        return openSession(ip, port, "root", "123456", "请稍后,后台编译环境正在启动...");
    }

    /**
     * Shared implementation of both {@code connect} overloads.
     *
     * @param failureMessage message of the RuntimeException thrown when connecting fails
     */
    private static Session openSession(String ip, int port, String username, String password,
                                       String failureMessage) {
        JSch jsch = new JSch();
        try {
            Session session = jsch.getSession(username, ip, port);
            session.setPassword(password);
            java.util.Properties config = new java.util.Properties();
            // NOTE(review): host key verification is disabled, which allows man-in-the-middle
            // attacks; consider a known_hosts file for anything beyond a trusted network.
            config.put("StrictHostKeyChecking", "no");
            session.setConfig(config);
            session.connect();
            return session;
        } catch (JSchException e) {
            // Never log the password; host, port and user are enough to diagnose the failure.
            logger.error("SSH connection to {}:{} as {} failed", ip, port, username, e);
            throw new RuntimeException(failureMessage, e);
        }
    }

    /**
     * Runs a shell command on the remote host, giving up after a time limit.
     *
     * <p>Output is read while the command is still running, so a command that produces a lot
     * of output cannot stall on a full pipe while this method waits for it to finish.
     *
     * @param session connected SSH session
     * @param command shell command to run
     * @param timeout time limit in minutes
     * @return stdout followed by stderr, each line terminated by {@code "\n"};
     *         {@code "Command process timeout"} when the command did not finish in time;
     *         an empty string when the command could not be run
     */
    public static String execCommand(Session session, String command, int timeout) {
        // Minutes to milliseconds; computed in long so large timeouts cannot overflow int.
        long timeMillis = timeout * 60000L;
        ChannelExec channel = null;
        try {
            channel = (ChannelExec) session.openChannel("exec");
            channel.setCommand(command);
            channel.setInputStream(null);
            // JSch requires the streams to be requested before connect().
            InputStream stdout = channel.getInputStream();
            InputStream stderr = channel.getErrStream();
            channel.connect();

            ByteArrayOutputStream outBytes = new ByteArrayOutputStream();
            ByteArrayOutputStream errBytes = new ByteArrayOutputStream();
            byte[] chunk = new byte[4096];
            long deadline = System.currentTimeMillis() + timeMillis;
            boolean processFinished = false;
            while (true) {
                drain(stdout, outBytes, chunk);
                drain(stderr, errBytes, chunk);
                if (channel.isClosed()) {
                    // Pick up anything that arrived between the drain above and the close.
                    drain(stdout, outBytes, chunk);
                    drain(stderr, errBytes, chunk);
                    processFinished = true;
                    break;
                }
                if (System.currentTimeMillis() >= deadline) {
                    break;
                }
                Thread.sleep(POLL_INTERVAL_MILLIS);
            }

            if (!processFinished) {
                return "Command process timeout";
            }
            return toLines(outBytes.toByteArray()) + toLines(errBytes.toByteArray());
        } catch (IOException | JSchException e) {
            logger.error("Failed to execute remote command: {}", command, e);
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            logger.warn("Interrupted while waiting for remote command: {}", command, e);
        } finally {
            if (channel != null) channel.disconnect();
        }
        return "";
    }

    /**
     * Runs a shell command on the remote host and waits until its output ends.
     *
     * <p>The remote stderr is forwarded to {@code System.err}.
     *
     * @param session connected SSH session
     * @param command shell command to run
     * @return the command's stdout, each line terminated by {@code "\n"}; whatever was read
     *         so far (possibly empty) when the command fails part-way
     */
    public static String execCommand(Session session, String command) {
        StringBuilder sb = new StringBuilder();
        ChannelExec channel = null;
        try {
            channel = (ChannelExec) session.openChannel("exec");
            channel.setCommand(command);
            channel.setInputStream(null);
            channel.setErrStream(System.err);
            // JSch requires the stream to be requested before connect().
            InputStream in = channel.getInputStream();
            channel.connect();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append('\n');
                }
            }
        } catch (IOException | JSchException e) {
            logger.error("Failed to execute remote command: {}", command, e);
        } finally {
            if (channel != null) channel.disconnect();
        }
        return sb.toString();
    }

    /** Copies the bytes that are currently available on {@code in} into {@code sink} without blocking. */
    private static void drain(InputStream in, ByteArrayOutputStream sink, byte[] chunk) throws IOException {
        while (in.available() > 0) {
            int read = in.read(chunk, 0, chunk.length);
            if (read < 0) {
                break;
            }
            sink.write(chunk, 0, read);
        }
    }

    /** Decodes captured output with the platform charset and terminates every line with {@code "\n"}. */
    private static String toLines(byte[] raw) throws IOException {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new ByteArrayInputStream(raw)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append('\n');
            }
        }
        return text.toString();
    }

    /**
     * Summarizes disk usage from {@code df -h} style output such as:
     * <pre>
     * Filesystem            Size  Used Avail Use% Mounted on
     * /dev/sda3             442G  327G   93G  78% /
     * tmpfs                  32G     0   32G   0% /dev/shm
     * /dev/sda1             788M   60M  689M   8% /boot
     * /dev/md0              1.9T  483G  1.4T  26% /ezsonar
     * </pre>
     *
     * <p>The first line is treated as the header. For every other line the second column is
     * added to the total size and the third column to the used space, both converted to G.
     *
     * @param commandResult output of the disk status shell command
     * @return a summary string with total size, used space and free space in G
     */
    private static String disposeFilesSystem(String commandResult) {
        int size = 0;
        int used = 0;
        if (commandResult != null) {
            // execCommand joins lines with "\n"; accept "\r\n" as well.
            String[] rows = commandResult.split("\\r?\\n");
            for (int i = 1; i < rows.length; i++) {
                String[] columns = rows[i].trim().split("\\s+");
                if (columns.length < 3) {
                    // Blank or incomplete line: nothing to add.
                    continue;
                }
                size += disposeUnit(columns[1]);
                used += disposeUnit(columns[2]);
            }
        }
        return new StringBuilder().append("大小 ").append(size).append("G , 已使用").append(used).append("G ,空闲")
                .append(size - used).append("G").toString();
    }

    /**
     * Converts a size with a unit suffix to whole G.
     *
     * <p>Accepted suffixes are K, M, G and T, optionally followed by B (for example "KB"),
     * in any letter case. The numeric part may contain a decimal fraction (for example "1.9T").
     *
     * @param s size string with a unit suffix
     * @return the size in G, truncated to an integer; 0 when the value has no recognized unit
     *         or is not a number
     */
    private static int disposeUnit(String s) {
        if (s == null) {
            return 0;
        }
        String value = s.trim().toUpperCase();
        // Treat "KB", "MB", ... like "K", "M", ...
        if (value.length() > 1 && value.endsWith("B")) {
            value = value.substring(0, value.length() - 1);
        }
        if (value.length() < 2) {
            // Too short to hold both a number and a unit (e.g. "0" or "-").
            return 0;
        }
        char unit = value.charAt(value.length() - 1);
        double number;
        try {
            number = Double.parseDouble(value.substring(0, value.length() - 1));
        } catch (NumberFormatException e) {
            logger.debug("Cannot parse size value: {}", s);
            return 0;
        }
        switch (unit) {
            case 'G':
                return (int) number;
            case 'T':
                return (int) (number * 1024);
            case 'M':
                return (int) (number / 1024);
            case 'K':
                return (int) (number / (1024 * 1024));
            default:
                return 0;
        }
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值