原著的地址是:https://gist.githubusercontent.com/Dounm/881c9ddc364884fec41d0b539cb25153/raw/08b6938b58709c5900077f8cdc335caaa7c4ebe3/monitor_ib_traffic.py
这个脚本在执行的时候需要sudo权限。
这个脚本并不复杂,主要逻辑:
(1)主要逻辑是解析ibstat命令的输出结果,得到lid和port的列表。如果ib网卡不是active状态,则忽略这个ib卡。
(2)然后周期性使用/usr/sbin/perfquery -x -r $lid $pord获得这个周期的流量/这个周期的时长,得到速率
但呈现的样式不够直观
对此脚本的输出格式略作修改,更改如下:
import logging
import re
import sys
import json
import time
import subprocess
METRIC_NAMES = ["PortXmitData","PortRcvData"]
metrics = {}
def decode_str_list(line_list):
return [x.decode("utf-8") for x in line_list]
def get_cmd_out(cmd):
return decode_str_list(subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout.readlines())
def ibstat_ports():
lid7port = []
ibstat = get_cmd_out("ibstat")
for index,line in enumerate(ibstat):
line = line.strip()
match = re.match("Port [0-9]\:",line)
if match:
number = line.split(' ')[1].replace(':','')
state = ibstat[index+1].split(':')[1].strip()
an = re.match("Active",state)
if an:
lid = ibstat[index+4].split(':')[1].strip()
lid7port.append((lid, number))
return lid7port
# Return a key-value pair, eventually empty if the line didn't match
def parse_counter_line(line, keys):
if re.match("^[a-zA-z0-9]*\:\.\.\.*[0-9]*$",line):
line = line.split(':')
key = line[0]
if key in keys:
value = line[1].replace('.','').strip()
return (key, int(value))
return ("",0)
# Parse the complete input from perfquery for lines matching counters,
# and return all counters and their values as dictionary
def parse_counters(counters, keys):
counts = {}
for line in counters:
key, value = parse_counter_line(line, keys)
# Omit empty return values...
if key:
logging.debug("[parse_counters] Found counter: %s=%s", key, value)
counts[key] = value
return counts
# Call perfquery for extended traffic counters, and reset the counters
def traffic_counter(lid, port = 1):
command = ["/usr/sbin/perfquery", "-x", "-r", lid, port]
logging.debug("[traffic_counters] Execute command: %s", " ".join(command))
counters = get_cmd_out(command)
return parse_counters(counters, METRIC_NAMES)
def init_metric():
metrics["last_update"] = time.time()
def update_metric():
global metrics
# NOTE: time_since_last_update is not calculated precisely
time_since_last_update = time.time() - metrics["last_update"]
if time_since_last_update < 10:
metrics["last_update"] = time.time()
return
logging.debug("[update_metrics] Update metrics after %ss", time_since_last_update)
for lid, port in ibstat_ports():
metric2counts = traffic_counter(lid, port)
metrics[lid] = {port: metric2counts}
for metric in METRIC_NAMES:
# Data port counters indicate octets divided by 4 rather than just octets.
#
# It's consistent with what the IB spec says (IBA 1.2 vol 1 p.948) as to
# how these quantities are counted. They are defined to be octets divided
# by 4 so the choice is to display them the same as the actual quantity
# (which is why they are named Data rather than Octets) or to multiply by
# 4 for Octets. The former choice was made.
#
# For simplification the values are multiplied by 4 to represent octets/bytes
num_bytes = metric2counts[metric] * 4
metrics[lid][port][metric.replace("Data", "Bytes")] = num_bytes
metrics[lid][port][metric.replace("Data", "GB/s")] = num_bytes / (time_since_last_update * 1024*1024*1024)
metrics["last_update"] = time.time()
if __name__ == '__main__':
logging.root.setLevel(logging.INFO)
update_interval = 10 if len(sys.argv) == 1 else sys.argv[1] # default is 10s
init_metric()
while True:
update_metric()
#print("Note: This is a **Rough** traffic monitor for Infiniband, the bw below may be bigger than real bw")
print(json.dumps(metrics, indent=2, sort_keys=True))
print("lid/port PortRcvBytes PortRcvData PortRcvGB/s PortXmitBytes PortXmitData PortXmitGB/s")
for lid,lidInfo in metrics.items():
if lid=="last_update":
continue
for port,portInfo in lidInfo.items():
prb=portInfo["PortRcvBytes"]
prd=portInfo["PortRcvData"]
prg=portInfo["PortRcvGB/s"]
pxb=portInfo["PortXmitBytes"]
pxd=portInfo["PortXmitData"]
pxg=portInfo["PortXmitGB/s"]
print("%s/%s %15d %15d %.10f %15d %15d %.10f" %(lid, port, prb, prd, prg, pxb, pxd, pxg))
time.sleep(update_interval)