监控ib网卡流量的脚本

原脚本的地址是:https://gist.githubusercontent.com/Dounm/881c9ddc364884fec41d0b539cb25153/raw/08b6938b58709c5900077f8cdc335caaa7c4ebe3/monitor_ib_traffic.py

这个脚本在执行的时候需要sudo权限。

这个脚本并不复杂,主要逻辑:

(1)解析ibstat命令的输出结果,得到lid和port的列表。如果某个ib端口不是active状态,则忽略该端口。

(2)然后周期性使用/usr/sbin/perfquery -x -r $lid $port获得这个周期的流量,再除以这个周期的时长,得到速率

但呈现的样式不够直观

对此脚本的输出格式略作修改,更改如下:

import logging
import re
import sys
import json
import time
import subprocess


# Names of the perfquery counters that are extracted for every port.
METRIC_NAMES = [
    "PortXmitData",
    "PortRcvData",
]
# Shared state: the "last_update" timestamp plus the latest readings,
# stored as metrics[lid][port][counter_name].
metrics = dict()

def decode_str_list(line_list):
    """Decode every item of ``line_list`` as UTF-8 and return the strings as a list."""
    decoded = []
    for raw in line_list:
        decoded.append(raw.decode("utf-8"))
    return decoded

def get_cmd_out(cmd):
    """Run ``cmd`` and return its standard output as a list of decoded lines.

    Args:
        cmd: Program name or argument list; handed to ``subprocess.Popen``
            unchanged.

    Returns:
        The lines the command wrote to stdout, decoded by
        ``decode_str_list``. Line endings are kept.
    """
    # The context manager closes the pipe and waits for the child on exit,
    # so repeated polling does not accumulate zombie processes or leaked
    # file descriptors.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as process:
        raw_lines = process.stdout.readlines()
    return decode_str_list(raw_lines)

def ibstat_ports():
    """Return ``(lid, port)`` string pairs for every active port listed by ``ibstat``.

    The output of ``ibstat`` is scanned for ``Port <n>:`` header lines. The
    port state is read from the line directly after the header and the lid
    from the fourth line after it. Ports whose state does not start with
    "Active" are ignored, as are ports whose state or lid line is missing
    (for example when the output is truncated).

    Returns:
        A list of ``(lid, port_number)`` tuples; both items are strings.
    """
    ibstat = get_cmd_out("ibstat")

    def field_value(position):
        # Text after the first colon of the line at ``position``; an empty
        # string when the line does not exist or carries no colon.
        if position >= len(ibstat):
            return ""
        parts = ibstat[position].split(':')
        return parts[1].strip() if len(parts) > 1 else ""

    lid7port = []
    for index, line in enumerate(ibstat):
        # Raw string avoids the invalid "\:" escape; "+" accepts port
        # numbers with more than one digit.
        header = re.match(r"Port ([0-9]+):", line.strip())
        if header is None:
            continue
        if not field_value(index + 1).startswith("Active"):
            continue
        lid = field_value(index + 4)
        if lid:
            lid7port.append((lid, header.group(1)))
    return lid7port

# Matches "<Name>:<two or more dots><digits>", e.g. "PortXmitData:......1234".
# At least one digit is required so that int() is never fed an empty string.
_COUNTER_LINE_RE = re.compile(r"^([A-Za-z0-9_]*):\.\.+([0-9]+)$")


def parse_counter_line(line, keys):
    """Extract one counter from a single line of perfquery output.

    Args:
        line: One output line, with or without its trailing newline.
        keys: Container of counter names that should be reported.

    Returns:
        ``(name, value)`` with ``value`` as an int when the line has the
        form ``Name:....<digits>`` and ``name`` is in ``keys``; otherwise
        the placeholder ``("", 0)``.
    """
    match = _COUNTER_LINE_RE.match(line)
    if match is None:
        return ("", 0)
    key, digits = match.groups()
    if key not in keys:
        return ("", 0)
    return (key, int(digits))

def parse_counters(counters, keys):
    """Collect the requested counters from the full perfquery output.

    Every line of ``counters`` is passed to ``parse_counter_line``; lines
    that yield an empty name are dropped. The remaining name/value pairs
    are returned as a dictionary.
    """
    counts = {}
    parsed_lines = (parse_counter_line(raw, keys) for raw in counters)
    for key, value in parsed_lines:
        if not key:
            # Empty name: the line held no counter of interest.
            continue
        logging.debug("[parse_counters] Found counter: %s=%s", key, value)
        counts[key] = value
    return counts

def traffic_counter(lid, port = 1):
    """Query the extended traffic counters of one port and reset them.

    Runs ``/usr/sbin/perfquery -x -r <lid> <port>`` and parses its output.

    Args:
        lid: Lid of the port to query (string or integer).
        port: Port number (string or integer); defaults to 1.

    Returns:
        Dictionary mapping each counter named in ``METRIC_NAMES`` that was
        found in the output to its integer value.
    """
    # Both str.join and subprocess only accept strings, so the integer
    # default port (and any numeric lid) has to be converted first.
    command = ["/usr/sbin/perfquery", "-x", "-r", str(lid), str(port)]
    logging.debug("[traffic_counters] Execute command: %s", " ".join(command))
    counters = get_cmd_out(command)
    return parse_counters(counters, METRIC_NAMES)

def init_metric():
    """Record the current time under "last_update" in the shared ``metrics`` dict."""
    now = time.time()
    metrics.update(last_update=now)

def update_metric(min_interval=10):
    """Refresh the shared ``metrics`` dict with traffic figures of all active ports.

    For every ``(lid, port)`` returned by ``ibstat_ports`` the counters are
    read (and reset) through ``traffic_counter``. The result is stored as
    ``metrics[lid][port]`` and holds, for each name in ``METRIC_NAMES``, the
    raw counter plus a "...Bytes" total and a "...GB/s" rate.

    Args:
        min_interval: Minimum number of seconds that must have passed since
            the last successful update; earlier calls return without
            touching ``metrics``. Defaults to 10.
    """
    now = time.time()
    # Tolerate a missing init_metric() call: start the clock now.
    last_update = metrics.setdefault("last_update", now)

    # NOTE: time_since_last_update is not calculated precisely
    time_since_last_update = now - last_update
    if time_since_last_update < min_interval or time_since_last_update <= 0:
        # Too early. "last_update" is deliberately left alone: resetting it
        # here would mean that callers polling more often than min_interval
        # never get an update at all.
        return
    logging.debug("[update_metrics] Update metrics after %ss", time_since_last_update)

    refreshed = {}
    for lid, port in ibstat_ports():
        metric2counts = traffic_counter(lid, port)
        missing = [name for name in METRIC_NAMES if name not in metric2counts]
        if missing:
            # perfquery printed no usable counters for this port (e.g. the
            # command failed); skip the port instead of raising KeyError.
            logging.warning("[update_metrics] No %s for lid %s port %s",
                            ", ".join(missing), lid, port)
            continue
        for metric in METRIC_NAMES:
            # Data port counters indicate octets divided by 4 rather than
            # just octets (IBA 1.2 vol 1 p.948). For simplification the
            # values are multiplied by 4 to represent octets/bytes.
            num_bytes = metric2counts[metric] * 4
            metric2counts[metric.replace("Data", "Bytes")] = num_bytes
            metric2counts[metric.replace("Data", "GB/s")] = (
                num_bytes / (time_since_last_update * 1024 * 1024 * 1024)
            )
        # Group by lid so that several ports sharing one lid do not
        # overwrite each other.
        refreshed.setdefault(lid, {})[port] = metric2counts

    metrics.update(refreshed)
    metrics["last_update"] = time.time()

if __name__ == '__main__':
    # Script entry point: print the collected metrics as JSON followed by a
    # table, then sleep and repeat until interrupted.
    # Usage: <script> [update_interval_seconds]

    logging.root.setLevel(logging.INFO)

    update_interval = 10  # default is 10s
    if len(sys.argv) > 1:
        # Command-line arguments are strings, but time.sleep() needs a number.
        try:
            update_interval = float(sys.argv[1])
        except ValueError:
            sys.exit("usage: %s [update_interval_seconds]" % sys.argv[0])
        # Rejects zero, negative values, NaN and infinity.
        if not 0 < update_interval < float("inf"):
            sys.exit("update interval must be a positive number of seconds")

    table_header = "lid/port       PortRcvBytes            PortRcvData           PortRcvGB/s           PortXmitBytes          PortXmitData           PortXmitGB/s"
    table_row = "%s/%s       %15d            %15d          %.10f        %15d     %15d        %.10f"

    init_metric()

    try:
        while True:
            update_metric()
            # NOTE: this is a rough traffic monitor; the reported bandwidth
            # may be bigger than the real bandwidth.
            print(json.dumps(metrics, indent=2, sort_keys=True))
            print(table_header)
            for lid, lidInfo in metrics.items():
                # "last_update" is bookkeeping, not a lid entry.
                if lid == "last_update":
                    continue
                for port, portInfo in lidInfo.items():
                    print(table_row % (
                        lid,
                        port,
                        portInfo["PortRcvBytes"],
                        portInfo["PortRcvData"],
                        portInfo["PortRcvGB/s"],
                        portInfo["PortXmitBytes"],
                        portInfo["PortXmitData"],
                        portInfo["PortXmitGB/s"],
                    ))

            time.sleep(update_interval)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the monitor; exit without a traceback.
        pass

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值