新开发的PCIE设备在做服务器适配时,有时需要服务器厂家更新BMC或BIOS固件。同时,我们也希望对PCIE设备做一些检测,最后收集一些信息存档。如果需要处理的服务器很多,通过BMC的界面进行人工操作就会比较麻烦。以下提供了一个脚本,供参考。
主要思路:
- 采用haneWIN NFS Server搭建一个NFS服务,目录为nfs,里面存放着boot.iso(设备检测镜像)
- 通过Redfish协议登录BMC,获取PCIe设备信息和服务器信息,升级固件,重启服务器,挂载ISO,设置启动方式
- 通过KVM截屏获取屏幕内容,用图片相似度(SSIM)的方法判断ISO里的检测程序是否运行完成。
版本信息
属性 | 值 |
---|---|
NFS服务器 | haneWIN NFS Server |
服务器型号 | NF5270M6 |
代码
# -*- coding: utf-8 -*-
from queue import Queue
from skimage.metrics import structural_similarity
import traceback
import cv2
import codecs
import csv
import argparse
import shutil
import json
import time
import redfish
import sys
import os
import uuid
import threading
import warnings
warnings.filterwarnings("ignore")
import logging
# Command-line interface. Parsed at import time because the code below reads
# the resulting module-level ``args``.
parser = argparse.ArgumentParser()
# CSV file of servers; the first row is skipped as a header by the driver.
parser.add_argument('-server_list', type=str,
                    required=True, help="server_list")
# Address of the NFS server that exports nfs/boot.iso.
parser.add_argument('-nfs_server', type=str, required=True, help="nfs server")
# Number of worker threads (the help text used to be a copy of "nfs server").
parser.add_argument('-threads', type=int, required=True,
                    help="number of worker threads")
# 1 = only log in and check the PCIe devices; any other value runs the action path.
parser.add_argument('-checkonly', type=int, required=True, help="check only")
args = parser.parse_args()
class TimeSpan:
    """Context manager that logs how long the enclosed code took.

    On exit one line is written through ``logger.info`` in the form
    ``<prefix padded to 64 columns>:<duration>(<unit>)``. Durations above
    60 seconds are reported in minutes, everything else in seconds.

    Args:
        logger: Object exposing ``info(message)``.
        prefix: Label printed in front of the duration.
    """

    def __init__(self, logger, prefix=""):
        self.prefix = prefix
        self.logger = logger
        self.start = None
        self.end = None

    def __enter__(self):
        self.end = None
        # Monotonic clock: the measured interval must not be affected by
        # wall-clock adjustments while a long step is running.
        self.start = time.monotonic()
        # Return self so ``with TimeSpan(...) as span`` gives access to the
        # measurement (the original returned None).
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time.monotonic()
        interval = self.end - self.start
        unit = "sec"
        if interval > 60:
            unit = "min"
            interval = interval / 60
        self.logger.info('%-64s:%.3f(%s)' % (self.prefix, interval, unit))
        # Falls through returning None, so exceptions raised inside the
        # ``with`` body still propagate to the caller.
def isSimilarity(filename, template_path='target_image.jpg', threshold=0.5,
                 row_start=280, row_stop=320, row_step=3):
    """Decide whether the self-check program has finished, from a screenshot.

    A template-sized window anchored at the left edge of the screenshot is
    slid vertically over the rows ``row_start, row_start + row_step, ...``
    (below ``row_stop``). The screenshot counts as "finished" as soon as one
    window's SSIM against the template exceeds ``threshold``.

    Args:
        filename: Path of the captured screenshot.
        template_path: Path of the reference image of the "finished" screen.
        threshold: SSIM value that must be exceeded for a match.
        row_start: First top row (in pixels) that is tried.
        row_stop: Exclusive upper bound for the top row.
        row_step: Vertical step between two tried windows.

    Returns:
        bool: True when a window matches the template, otherwise False
        (also when the screenshot cannot be decoded).

    Raises:
        FileNotFoundError: If the template image cannot be read. This is a
            setup error, so it is reported instead of looking like "not
            finished yet".
    """
    template = cv2.imread(template_path, cv2.IMREAD_GRAYSCALE)
    if template is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(
            "cannot read template image: {}".format(template_path))

    screenshot = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    if screenshot is None:
        # A truncated or corrupt download is treated as "not finished".
        return False

    cell_h, cell_w = template.shape
    for top in range(row_start, row_stop, row_step):
        window = screenshot[top:top + cell_h, 0:cell_w]
        if window.shape != template.shape:
            # The window ran past the screenshot border; SSIM needs equal
            # shapes and lower windows would be clipped too.
            break
        if structural_similarity(template, window) > threshold:
            return True
    return False
class RedFishProxy:
    """Thin wrapper around a Redfish client that retries on HTTP 500.

    Every verb forwards its arguments positionally to the same-named method
    of ``handle``. When the response's ``_status`` is 500 the call is repeated
    after ``retry_delay`` seconds, at most ``retry_count`` extra times; the
    last response is returned whatever its status.

    Args:
        handle: Client object exposing ``post``/``get``/``delete``/``patch``.
        retry_count: Maximum number of retries after the first attempt.
        retry_delay: Seconds to wait before each retry.
    """

    def __init__(self, handle, retry_count=3, retry_delay=2):
        self.handle = handle
        self.retry_count = retry_count
        self.retry_delay = retry_delay

    def _call(self, method, *call_args):
        """Invoke ``method(*call_args)``, retrying while it answers 500."""
        retries = 0
        while True:
            response = method(*call_args)
            if response._status != 500 or retries >= self.retry_count:
                return response
            time.sleep(self.retry_delay)
            retries += 1

    def post(self, path, args=None, body=None, headers=None):
        """POST with retry; see the class docstring."""
        return self._call(self.handle.post, path, args, body, headers)

    def get(self, path, args=None, headers=None):
        """GET with retry; see the class docstring."""
        return self._call(self.handle.get, path, args, headers)

    def delete(self, path, args=None, headers=None):
        """DELETE with retry; see the class docstring."""
        return self._call(self.handle.delete, path, args, headers)

    def patch(self, path, args=None, body=None, headers=None):
        """PATCH with retry; see the class docstring."""
        return self._call(self.handle.patch, path, args, body, headers)
class InspurVA1Query:
    """Run the device self-check sequence on one server through its BMC.

    All communication goes through Redfish requests authenticated with the
    ``X-Auth-Token`` obtained by ``Login``. Every public method returns a
    boolean and reports failures through ``logger`` instead of raising,
    except where noted.

    Args:
        logger: Logger used for progress and error messages.
        index: Identifier of the server row; used in screenshot file names.
        bmc_host: Base URL of the BMC.
        username: BMC account name.
        password: BMC account password.
        nfs_server: Address of the NFS server that exports ``nfs/boot.iso``.
        try_count: Number of test rounds ``run`` may attempt.
    """

    _SYSTEM_URL = '/redfish/v1/Systems/1'
    _BIOS_URL = '/redfish/v1/Systems/1/Bios'
    _VIRTUAL_CD_URL = '/redfish/v1/Managers/1/VirtualMedia/CD'
    _PCIE_DEVICES_URL = '/redfish/v1/Chassis/1/PCIeDevices'
    _BOOT_ORDER_ATTRIBUTES = ('UefiBootOrder1', 'UefiBootOrder2',
                              'UefiBootOrder3', 'UefiBootOrder4')

    def __init__(self, logger, index, bmc_host, username, password, nfs_server, try_count):
        self.logger = logger
        self.nfs_server = nfs_server
        self.username = username
        self.password = password
        self.try_count = try_count
        self.bmc_host = bmc_host
        self.seq = 0
        self.token = None
        # Session id returned by Login; needed by Logout.
        self.Id = None
        self.index = index
        # Destination of the final screenshot; run() refreshes it per round.
        self.result_image = "{}-{}-{}-done.jpg".format(index, bmc_host, self.seq)
        self.redfish_client = RedFishProxy(redfish.redfish_client(
            base_url=self.bmc_host, username=self.username, password=self.password))

    # ------------------------------------------------------------------
    # small internal helpers
    # ------------------------------------------------------------------
    def _auth_headers(self):
        """Return the request headers carrying the current session token."""
        return {"X-Auth-Token": self.token}

    @staticmethod
    def _json(response):
        """Decode the raw response body as JSON."""
        return json.loads(response._read.decode())

    @staticmethod
    def _find_cd_boot_entry(attributes):
        """Return the first UefiBootOrderN attribute that names a CD/DVD.

        Returns an empty string when none of the four entries contains
        ``CD/DVD``.
        """
        for name in InspurVA1Query._BOOT_ORDER_ATTRIBUTES:
            if name in attributes and 'CD/DVD' in str(attributes[name]):
                return name
        return ""

    def _wait_media_state(self, expected, timeout, interval=1):
        """Poll IsMounted until it equals ``expected`` or ``timeout`` elapses.

        The first check happens immediately; ``interval`` seconds are slept
        between checks so the BMC is not hammered.
        """
        deadline = time.time() + timeout
        while True:
            if bool(self.IsMounted()) == expected:
                return True
            if time.time() >= deadline:
                return False
            time.sleep(interval)

    # ------------------------------------------------------------------
    # session handling
    # ------------------------------------------------------------------
    def Login(self):
        """Create a Redfish session and remember its token and id.

        Returns:
            bool: True when the BMC answered 201, otherwise False.
        """
        response = self.redfish_client.post(
            '/redfish/v1/SessionService/Sessions',
            headers={"Content-Type": "application/json"},
            body={"UserName": self.username,
                  "Password": self.password, "SessionTimeOut": 300})
        if response._status != 201:
            # The label used to read "Thermal:", a copy-paste leftover.
            self.logger.error("Login:{}".format(response))
            return False
        session = self._json(response)
        self.token = session["Oem"]['Public']['X-Auth-Token']
        self.Id = session["Id"]
        return True

    def Logout(self):
        """Delete the current session, if any.

        Returns:
            bool: True when a session existed and the BMC answered 200.
            False when there was no session or the deletion failed. The
            local token is cleared in both cases once the request was sent.
        """
        if not self.token:
            return False
        url = '/redfish/v1/SessionService/Sessions/{}'.format(self.Id)
        headers = self._auth_headers()
        response = self.redfish_client.delete(url, headers=headers)
        self.token = None
        if response._status == 200:
            return True
        self.logger.error("Logout:{}".format(response))
        return False

    # ------------------------------------------------------------------
    # virtual media
    # ------------------------------------------------------------------
    def QueryMedia(self):
        """Log the virtual CD resource; always returns True."""
        response = self.redfish_client.get(
            self._VIRTUAL_CD_URL, headers=self._auth_headers())
        self.logger.info("QueryMedia:{}".format(response))
        return True

    def IsMounted(self):
        """Report whether the virtual CD currently has media inserted.

        Returns:
            The ``Inserted`` value of the virtual CD resource, or False when
            the key is absent or the request failed (failure is logged).
        """
        response = self.redfish_client.get(
            self._VIRTUAL_CD_URL, headers=self._auth_headers())
        if response._status != 200:
            self.logger.error("IsMounted:{}".format(response))
            return False
        return self._json(response).get('Inserted', False)

    def InsertMedia(self, timeout=60):
        """Mount ``<nfs_server>/nfs/boot.iso`` on the virtual CD over NFS.

        Args:
            timeout: Seconds to wait for the BMC to report the media as
                inserted. The original waited forever in a tight loop.

        Returns:
            bool: True once the media is reported inserted, otherwise False.
        """
        url = self._VIRTUAL_CD_URL + '/Actions/VirtualMedia.InsertMedia'
        body = {"TransferProtocolType": 'NFS',
                "Image": '{}/nfs/boot.iso'.format(self.nfs_server)}
        response = self.redfish_client.post(
            url, headers=self._auth_headers(), body=body)
        if response._status != 200:
            self.logger.error("InsertMedia:{}".format(response))
            return False
        if self._wait_media_state(True, timeout):
            return True
        self.logger.error("InsertMedia Timeout:{}".format(timeout))
        return False

    def EjectMedia(self, timeout=60):
        """Eject the virtual CD if something is mounted.

        Args:
            timeout: Seconds to wait for the BMC to report the media as
                removed. The original waited forever in a tight loop.

        Returns:
            bool: True when nothing is mounted (any more), otherwise False.
        """
        if not self.IsMounted():
            return True
        url = self._VIRTUAL_CD_URL + '/Actions/VirtualMedia.EjectMedia'
        body = {"TransferProtocolType": 'NFS', "ImageName": 'boot.iso'}
        response = self.redfish_client.post(
            url, headers=self._auth_headers(), body=body)
        # A 500 answer is tolerated as in the original; the mount state
        # decides whether the eject really happened.
        if response._status not in (200, 500):
            self.logger.error("EjectMedia:{}".format(response))
            return False
        if self._wait_media_state(False, timeout):
            return True
        self.logger.error("EjectMedia Timeout:{}".format(timeout))
        return False

    # ------------------------------------------------------------------
    # BIOS / boot configuration
    # ------------------------------------------------------------------
    def EnableVirtualCDBoot(self, timeout=400):
        """Make sure the BIOS allows booting from the virtual CD.

        If ``FixedBootOrderEn`` is already ``Disabled`` and one of the
        UefiBootOrder entries names a CD/DVD, nothing is changed. Otherwise
        the setting is patched, the system is force-restarted and the BIOS
        resource is polled every 2 seconds until the change is visible.

        Args:
            timeout: Maximum number of seconds to wait after the restart.

        Returns:
            bool: True when CD boot is (already or now) enabled.
        """
        response = self.redfish_client.get(
            self._BIOS_URL, headers=self._auth_headers())
        if response._status != 200:
            self.logger.error("QueryBiosSetting1:{}".format(response))
            return False
        etag = response.getheader("ETag")
        attributes = self._json(response)['Attributes']
        cd_entry = self._find_cd_boot_entry(attributes)
        if attributes.get('FixedBootOrderEn') == 'Disabled' and cd_entry != "":
            self.logger.info("CDBoot Already Enable:{} {} {}".format(
                attributes['FixedBootOrderEn'], cd_entry, attributes[cd_entry]))
            return True

        new_attributes = {'FixedBootOrderEn': 'Disabled'}
        if cd_entry == "":
            # Use the last boot entry for the CD so normal boot is unaffected.
            new_attributes['UefiBootOrder4'] = "CD/DVD:UEFI: AMI Virtual CDROM0 1.00"
        response = self.redfish_client.patch(
            self._BIOS_URL + '/Settings',
            headers={"X-Auth-Token": self.token, "If-Match": etag},
            body={'Attributes': new_attributes})
        if response._status != 200:
            self.logger.error("BiosSetting:{}".format(response))
            return False

        # The pending BIOS settings are only applied on the next boot.
        if not self.ComputerSystemReset("ForceRestart"):
            return False

        beg = time.time()
        while True:
            time.sleep(2)
            elapsed = time.time() - beg
            if elapsed > timeout:
                self.logger.error("EnableVirtualCDBoot Timeout:{}".format(elapsed))
                return False
            response = self.redfish_client.get(
                self._BIOS_URL, headers=self._auth_headers())
            if response._status != 200:
                self.logger.error("QueryBiosSetting2:{}".format(response))
                continue
            attributes = self._json(response).get('Attributes', {})
            cd_entry = self._find_cd_boot_entry(attributes)
            if attributes.get('FixedBootOrderEn') == 'Disabled' and cd_entry != "":
                self.logger.info("EnableVirtualCDBoot Finished:{} {} {}".format(
                    elapsed, cd_entry, attributes[cd_entry]))
                return True

    def _override_boot(self, label, boot, confirm_key, confirm_value, attempts=10):
        """Patch ``Boot`` on the system resource and confirm it took effect.

        Args:
            label: Prefix for log messages.
            boot: Dictionary sent as the ``Boot`` property.
            confirm_key: Key inside ``Boot`` that is read back.
            confirm_value: Value ``confirm_key`` must have for success.
            attempts: Number of read-backs, one second apart.

        Returns:
            bool: True when the read-back matched, otherwise False.
        """
        response = self.redfish_client.get(
            self._SYSTEM_URL, headers=self._auth_headers())
        if response._status != 200:
            self.logger.error("{}:{}".format(label, response))
            return False
        etag = response.getheader("ETag")
        response = self.redfish_client.patch(
            self._SYSTEM_URL,
            headers={"X-Auth-Token": self.token, "If-Match": etag},
            body={'Boot': boot})
        if response._status != 200:
            self.logger.error("{}:{}".format(label, response))
            return False
        for attempt in range(attempts):
            if attempt:
                # Give the BMC a moment instead of re-reading back-to-back.
                time.sleep(1)
            response = self.redfish_client.get(
                self._SYSTEM_URL, headers=self._auth_headers())
            if response._status != 200:
                continue
            current = self._json(response)['Boot'][confirm_key]
            if current == confirm_value:
                self.logger.info("{}:{}={}".format(label, confirm_key, current))
                return True
        self.logger.error("{}:{}".format(label, response))
        return False

    def SetBootOrder(self):
        """Boot once from the virtual CD (UEFI) on the next start.

        Returns:
            bool: True when ``BootSourceOverrideTarget`` reads back as ``Cd``.
        """
        boot = {'BootSourceOverrideTarget': 'Cd',
                'BootSourceOverrideEnabled': 'Once',
                'BootSourceOverrideMode': 'UEFI'}
        return self._override_boot(
            "SetBootOrder", boot, 'BootSourceOverrideTarget', 'Cd')

    def SetBootOrderLegacy(self):
        """Boot continuously from the hard disk in UEFI mode.

        Despite the name, the original sends ``UEFI`` as the override mode;
        that value is preserved here.

        Returns:
            bool: True when ``BootSourceOverrideMode`` reads back as ``UEFI``.
        """
        boot = {'BootSourceOverrideTarget': 'Hdd',
                'BootSourceOverrideEnabled': 'Continuous',
                'BootSourceOverrideMode': 'UEFI'}
        return self._override_boot(
            "SetBootOrderLegacy", boot, 'BootSourceOverrideMode', 'UEFI')

    # ------------------------------------------------------------------
    # power control
    # ------------------------------------------------------------------
    def ComputerSystemReset(self, ResetType):
        """Send ``ComputerSystem.Reset`` with the given reset type.

        Returns:
            bool: True when the BMC answered 200, otherwise False.
        """
        response = self.redfish_client.post(
            self._SYSTEM_URL + '/Actions/ComputerSystem.Reset',
            headers=self._auth_headers(), body={"ResetType": ResetType})
        if response._status == 200:
            return True
        self.logger.error("ComputerSystemReset:{}".format(response))
        return False

    def ChassisReset(self, ResetType, retry_count=3):
        """Send ``Chassis.Reset``, retrying up to ``retry_count`` times.

        Two seconds are waited after every failed attempt.

        Returns:
            bool: True as soon as one attempt is answered with 200.
        """
        for attempt in range(retry_count):
            response = self.redfish_client.post(
                '/redfish/v1/Chassis/1/Actions/Chassis.Reset',
                headers=self._auth_headers(), body={"ResetType": ResetType})
            if response._status == 200:
                return True
            self.logger.error("ChassisReset[{}-{}]:{}".format(attempt, ResetType, response))
            time.sleep(2)
        return False

    # ------------------------------------------------------------------
    # completion detection
    # ------------------------------------------------------------------
    def WaitFinished(self, timeout=5*60):
        """Poll KVM screenshots until the self-check looks finished.

        Every round triggers a screenshot, downloads it and hands it to
        ``isSimilarity``. On a match the image is moved to
        ``self.result_image``.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Returns:
            bool: True when a matching screenshot was found, False on timeout.
        """
        screenshot_url = '/redfish/v1/Managers/1/Actions/Oem/Public/KVM/Screenshot'
        download_url = '/redfish/v1/Managers/1/Actions/Oem/Public/KVM/ScreenshotDownload'
        beg = time.time()
        while True:
            # Do not poll faster; per the original note this can upset the BMC.
            time.sleep(3)
            if time.time() - beg > timeout:
                self.logger.error("WaitFinished,Timeout")
                return False
            response = self.redfish_client.post(
                screenshot_url, headers=self._auth_headers())
            if response._status != 200:
                self.logger.error("Screenshot:{}".format(response))
                continue
            # While the picture is being generated, wait for it instead of
            # triggering another capture (the original notes that doing so
            # leads to a black screen).
            for _ in range(10):
                if time.time() - beg > timeout:
                    self.logger.error("WaitFinished,Timeout")
                    return False
                response = self.redfish_client.post(
                    download_url, headers=self._auth_headers(),
                    body={"PictureAttributes": 'manual'})
                if response._status == 404:
                    # The file is still being generated.
                    time.sleep(2)
                    continue
                if response._status == 200:
                    image_path = "{}-{}-{}-ing.jpg".format(
                        self.index, self.bmc_host, self.seq)
                    with open(image_path, "wb") as f:
                        f.write(response.read)
                    if isSimilarity(image_path):
                        shutil.move(image_path, self.result_image)
                        return True
                elif response._status not in (500, 401):
                    # 500: no manual picture available, 401: invalid
                    # authentication; both are retried silently as before.
                    self.logger.error("ScreenshotDownload:{}".format(response))
                break

    # ------------------------------------------------------------------
    # PCIe inventory
    # ------------------------------------------------------------------
    def _log_pcie_device(self, i):
        """Query PCIe device ``i`` and log its link data if it is an NVIDIA card.

        Returns:
            bool: True when the device is done (logged, or skipped because
            it is from another manufacturer); False when a request failed
            and the caller should retry.
        """
        url = '{}/{}'.format(self._PCIE_DEVICES_URL, i)
        response = self.redfish_client.get(url, headers=self._auth_headers())
        if response._status != 200:
            return False
        body = self._json(response)
        State = body['Status']['State']
        Health = body['Status']['Health']
        SlotNumber = body['Oem']['Public']['SlotNumber']
        Manufacturer = body['Manufacturer']
        if Manufacturer != "NVIDIA":
            # Skip the device. The original `continue` re-queried the same
            # device for all 30 retries without any delay.
            return True
        response = self.redfish_client.get(
            url + '/PCIeFunctions/1', headers=self._auth_headers())
        if response._status != 200:
            return False
        link = self._json(response)['Oem']['Public']
        self.logger.info("{} {} {} {} {} {} {}".format(
            i, Manufacturer, State, Health, SlotNumber,
            link['LinkWidth'], link['LinkSpeed']))
        return True

    def PCIEDeviceSummary(self, target_dev_count=3):
        """Check the PCIe device count and log the link data of NVIDIA cards.

        Args:
            target_dev_count: Minimum number of PCIe devices expected.

        Returns:
            bool: False when a resource cannot be read or fewer than
            ``target_dev_count`` devices are reported; True otherwise
            (including a count of 0, which is treated as powered off).
        """
        # The system and BIOS resources are only read to verify they answer.
        for label, url in (("Systems", self._SYSTEM_URL), ("Bios", self._BIOS_URL)):
            response = self.redfish_client.get(url, headers=self._auth_headers())
            if response._status != 200:
                self.logger.error("{}:{}".format(label, response))
                return False

        response = self.redfish_client.get(
            self._PCIE_DEVICES_URL, headers=self._auth_headers())
        if response._status != 200:
            self.logger.error("PCIeDevices1:{}".format(response))
            return False
        count = self._json(response)['Members@odata.count']
        if count == 0:
            # No devices reported: the host is considered powered off.
            self.logger.info("{} PowerStatus=Off".format(self.bmc_host))
            return True
        if count < target_dev_count:
            # A card dropped off the bus.
            self.logger.error("VA1 Lost,Current:{}".format(count))
            return False

        # NOTE(review): assumes device ids are 0..count-1 — confirm against
        # the Members list returned by the BMC.
        for i in range(count):
            for _ in range(30):
                if self._log_pcie_device(i):
                    break
                time.sleep(1)
        return True

    # ------------------------------------------------------------------
    # test sequence
    # ------------------------------------------------------------------
    def _timed_step(self, number, name, action):
        """Run ``action()`` inside a TimeSpan labelled like the original steps."""
        label = "*{}.{}-{}-{}".format(number, self.bmc_host, self.seq, name)
        with TimeSpan(self.logger, label):
            return action()

    def _relogin(self):
        """Drop a possibly stale session, then create a fresh one."""
        self.Logout()
        return self.Login()

    def _check_devices(self):
        """Run PCIEDeviceSummary and log the host on failure."""
        if self.PCIEDeviceSummary():
            return True
        self.logger.error("ERROR,{}".format(self.bmc_host))
        return False

    def _run_once(self, i):
        """Execute one round of the sequence.

        Returns:
            True when the round completed, False when the restart into the
            ISO failed (the whole run is aborted), None when the round
            should simply be followed by the next one.
        """
        prepare = (
            ("Login", self._relogin),
            # A missing card fails the round right away.
            ("PCIEDeviceSummary", self._check_devices),
            # Power off so the virtual CD is not busy when it is remounted.
            ("PowerDown", lambda: self.ChassisReset("ForceOff")),
            ("EjectMedia", self.EjectMedia),
            ("InsertMedia", self.InsertMedia),
            ("PowerOn", lambda: self.ChassisReset("On")),
            ("EnableVirtualCDBoot", self.EnableVirtualCDBoot),
            ("SetBootOrder", self.SetBootOrder),
        )
        for number, (name, action) in enumerate(prepare, start=1):
            if not self._timed_step(number, name, action):
                return None

        # Restart so the system boots from the virtual CD.
        if not self._timed_step(
                9, "ForceRestart",
                lambda: self.ComputerSystemReset("ForceRestart")):
            return False
        if i == 0:
            # The first round only waits two minutes and then starts over
            # (behaviour kept from the original; reason not documented there).
            time.sleep(120)
            return None

        finish = (
            # The ISO prints "Please press Enter to activate this console"
            # when done; WaitFinished looks for that in KVM screenshots and
            # gives up after five minutes.
            ("WaitFinished", self.WaitFinished),
            ("EjectMedia", self.EjectMedia),
            ("PowerCycle", lambda: self.ChassisReset('PowerCycle')),
            ("Logout", self.Logout),
        )
        for number, (name, action) in enumerate(finish, start=10):
            if not self._timed_step(number, name, action):
                return None
        return True

    def run(self):
        """Run the test sequence up to ``try_count`` times.

        Rounds whose result screenshot already exists are skipped. An
        exception inside a round is logged, its traceback is appended to
        ``traceback_info.txt`` and the next round is tried.

        Returns:
            bool: True after the first fully successful round; False when the
            restart into the ISO failed or no round succeeded.
        """
        for i in range(self.try_count):
            self.seq = i
            self.result_image = "{}-{}-{}-done.jpg".format(
                self.index, self.bmc_host, self.seq)
            if os.path.exists(self.result_image):
                # This round already ran.
                continue
            try:
                with TimeSpan(self.logger, "{}-{} InspurVA1QueryE2E:".format(
                        self.bmc_host, self.seq)):
                    outcome = self._run_once(i)
            except Exception:
                self.logger.error("{}-{} Failed:".format(self.bmc_host, self.seq))
                # Close the file deterministically; the original leaked it.
                with open('traceback_info.txt', 'a+') as trace_file:
                    traceback.print_exc(file=trace_file)
                continue
            if outcome is not None:
                return outcome
        return False
def FetchThread(checkonly, index, q):
    """Worker loop: take server rows from ``q`` and process them.

    The loop ends when ``None`` is received. Each row is expected to hold
    ``index, bmc_addr, username, password, try_count``.

    Args:
        checkonly: 1 to only log in and check the PCIe devices; any other
            value runs the action path below.
        index: Number of this worker; used for the logger and log file name.
        q: Queue delivering the rows and the final ``None`` sentinel.
    """
    logger = None
    while True:
        # Queue.get() blocks until an item arrives, so no empty()/sleep
        # polling is required.
        row = q.get()
        if row is None:
            break
        if logger is None:
            # Created lazily so idle workers do not leave empty log files.
            logger = logging.getLogger("FetchThread:{}".format(index))
            logger.setLevel(level=logging.INFO)
            handler = logging.FileHandler("nvidia_aic_check_inspur_{}.log".format(index))
            handler_ch = logging.StreamHandler()
            handler.setLevel(logging.INFO)
            handler_ch.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            handler_ch.setFormatter(formatter)
            logger.addHandler(handler)
            logger.addHandler(handler_ch)
        logger.info(row)
        t = None
        try:
            # Separate name: the original overwrote the worker's own `index`.
            server_index, bmc_addr, username, password, try_count = row
            # Read the NFS address from the parsed arguments instead of a
            # global that only exists when the file runs as a script.
            t = InspurVA1Query(logger, server_index, bmc_addr, username, password,
                               args.nfs_server.strip(), int(try_count))
            if checkonly == 1:
                if t.Login() and t.PCIEDeviceSummary():
                    print("{} True".format(bmc_addr))
                else:
                    print("{} False".format(bmc_addr))
            else:
                # NOTE(review): the full sequence t.run() is disabled in the
                # original in favour of the steps below — confirm that this
                # is the intended production path.
                if t.Login():
                    t.SetBootOrderLegacy()
                    t.ComputerSystemReset("ForceRestart")
        except Exception:
            # Keep the worker alive: a dead worker stops draining the bounded
            # queue and the producer would block forever.
            logger.exception("Failed to process {}".format(row))
        finally:
            if t is not None:
                try:
                    # Release the BMC session; a failure here must not
                    # take the worker down.
                    t.Logout()
                except Exception:
                    logger.exception("Logout failed for {}".format(row))
if __name__ == '__main__':
    # Script entry point: start the workers, feed them the CSV rows, then
    # send one None sentinel per worker and wait for all of them.
    nfs_server = args.nfs_server.strip()
    server_list = args.server_list.strip()
    threads = args.threads
    checkonly = args.checkonly
    # Bounded queue: the reader blocks once `threads` rows are waiting.
    request_queue = Queue(threads)
    tasks = []
    for i in range(threads):
        t = threading.Thread(target=FetchThread, args=(checkonly, i, request_queue, ))
        t.start()
        tasks.append(t)
    try:
        with codecs.open(server_list, "r", 'utf-8') as csvfile:
            csvreader = csv.reader(csvfile)
            # Skip the header row; the default avoids StopIteration on an
            # empty file.
            next(csvreader, None)
            for row in csvreader:
                # csv.reader yields [] for blank lines; rows whose first
                # field starts with "#" are treated as comments.
                if not row or row[0].startswith("#"):
                    continue
                request_queue.put(row)
    finally:
        # Always release the workers, even when reading the list failed;
        # otherwise the non-daemon threads keep the process alive forever.
        for _ in tasks:
            request_queue.put(None)
        for t in tasks:
            t.join()
'''
重启服务器(第1条命令)或重启BMC(第2、3条命令,两条相同)
curl -X POST https://192.168.1.100/redfish/v1/Systems/1/Actions/ComputerSystem.Reset -d '{"ResetType": "ForceRestart"}' -H "Content-Type: application/json" -k -u admin:admin
curl -X POST https://192.168.1.100/redfish/v1/Managers/1/Actions/Manager.Reset -d '{"ResetType": "ForceRestart"}' -H "Content-Type: application/json" -k -u admin:admin
curl -X POST https://192.168.1.100/redfish/v1/Managers/1/Actions/Manager.Reset -d '{"ResetType": "ForceRestart"}' -H "Content-Type: application/json" -k -u admin:admin
'''