之前通过两篇博文详细介绍过BML这个从BGP数据流中提取特征的工具,大家有兴趣可以去阅读一下博文:BML-从BGP数据流中快速收集你需要的特征;BML:快速构建BGP特征的深度解析。今天,我主要是将BML工具的源代码进行简单描述,以方便对照学习。
1. BML.data
BML.data模块主要功能是进行数据收集与预处理。
1.1 updates模块
updates模块的主要功能就是解析接收到的BGP updates数据流。
#!/usr/bin/env python3
import time, json
from pybgpstream import BGPStream, BGPRecord, BGPElem
from BML import utils
def serialize_sets(obj):
    """json.dumps ``default`` hook: turn a set into a list, pass anything else through."""
    return list(obj) if isinstance(obj, set) else obj
class UpdatesDump(utils.BmlProcess):
    """Collect BGP update messages via pybgpstream and dump them to a CSV file.

    Defaults (overridable through ``params``): both RIS and RouteViews
    projects, all collectors, IPv4 and IPv6. When ``UseRibs`` is true the
    stream is not restricted to ``record_type="updates"`` and therefore also
    carries RIB records.
    """

    def __init__(self, start, end, params, outFolder, logFiles):
        utils.BmlProcess.__init__(self, logFiles)
        self.startTime = start          # stream start (epoch seconds), -1 = unset
        self.endTime = end              # stream end (epoch seconds), -1 = unset
        self.stream = BGPStream()       # placeholder; rebuilt in startStream()
        self.filePath = self.getFilePath(outFolder)
        # Default collection parameters; selectively overridden just below.
        self.params = {
            "Projects": ['ris', 'routeviews'],
            "IpVersion": [4, 6],
            "Collectors": [],
            "UseRibs": False,
        }
        self.setParams(params)  # presumably defined in utils.BmlProcess — not visible here

    def startStream(self):
        """Build the BGPStream for the configured interval.

        Raises:
            SystemExit: when the time interval was never set.
        """
        if self.startTime == -1 or self.endTime == -1:
            # Original used quit(); raise SystemExit directly so the module
            # does not rely on the interactive-only `site` builtins.
            raise SystemExit("Error: can't start stream, interval not set")
        if self.params["UseRibs"]:
            # No record_type filter: RIB dumps and updates are both streamed.
            self.stream = BGPStream(
                from_time=self.startTime, until_time=self.endTime,
                collectors=self.params["Collectors"],
                projects=self.params["Projects"]
            )
        else:
            self.stream = BGPStream(
                from_time=self.startTime, until_time=self.endTime,
                collectors=self.params["Collectors"],
                record_type="updates",
                projects=self.params["Projects"]
            )

    def buildUpdatesDump(self):
        """Consume the stream and append every matching element to the CSV."""
        self.startProgress = self.startTime
        self.endProgress = self.endTime
        self.emptyFile()
        for record in self.stream.records():
            if record.status != "valid":
                continue
            for elem in record:
                self.printProgress(elem.time)
                # A = announcement, W = withdrawal, R = RIB entry
                if elem.type not in ('A', 'W', 'R'):
                    continue
                if utils.ipVersion(elem.fields['prefix']) not in self.params["IpVersion"]:
                    continue
                u = {
                    'collector': str(record.collector),
                    'dump_time': str(record.dump_time),
                    'type': str(elem.type),
                    'time': str(int(elem.time)),
                    'peer_address': str(elem.peer_address),
                    'peer_asn': str(elem.peer_asn),
                    # fields may contain sets (e.g. communities) — serialize as lists
                    'fields': json.dumps(elem.fields, default=serialize_sets),
                }
                self.appendToFile(u)

    def getFilePath(self, path):
        """Return the CSV output path inside *path* (directory created if needed)."""
        return(utils.mkdirPath(path) + "updates.csv")

    def emptyFile(self):
        """(Re)create the CSV file with only its header line; return the path."""
        # with-statement guarantees the handle is closed even on write errors
        with open(self.filePath, "w") as file:
            file.write("collector,dump_time,type,time,peer_address,peer_asn,fields" + '\n')
        return(self.filePath)

    def appendToFile(self, u):
        """Append one update dict *u* as a CSV row; return the file path."""
        with open(self.filePath, "a") as file:
            file.write(u['collector']+","+u['dump_time']+","+u['type']+","+u['time']+","+u['peer_address']+","+u['peer_asn']+","+u['fields'] + '\n')
        return(self.filePath)

    def execute(self):
        """Run the full dump: log parameters, open the stream, collect updates."""
        timeAtStart = time.time()
        self.log("###############")
        self.log("# Updates dump")
        self.log("###############")
        self.log("Start time: " + str(self.startTime))
        self.log("End time: " + str(self.endTime))
        self.log("Duration: " + utils.timeFormat(self.endTime-self.startTime))
        self.printParams()
        self.startStream()
        self.buildUpdatesDump()
        self.log("Computation time: " + utils.timeFormat(time.time()-timeAtStart))
        self.log("Updates dump saved to: " + self.filePath)
def dumpUpdates(start, end, outfolder, params=None, logFiles=None):
    """Collect BGP updates over [start, end] and dump them to outfolder/updates.csv.

    Args:
        start, end: collection interval (epoch seconds).
        outfolder: destination directory (created if needed).
        params: optional overrides for UpdatesDump parameters.
        logFiles: optional list of open log files to also write to.

    Returns:
        The path of the CSV file produced.
    """
    if params is None:
        params = {}
    if logFiles is None:
        logFiles = []
    logFile = open(utils.mkdirPath(outfolder) + "updates_dump.log", 'w')
    try:
        logFiles.append(logFile)
        updatesDump = UpdatesDump(start, end, params, outfolder, logFiles)
        updatesDump.execute()
    finally:
        # Close the log even when the dump fails mid-way (original leaked it).
        logFile.close()
    return(updatesDump.filePath)
1.2 routes模块
routes模块生成路由快照,并且同步打印日志。
#!/usr/bin/env python3
import time, json
from BML import utils
def updateRoutes(routes, u):
    """Apply one parsed update *u* to the nested routing table *routes*.

    *routes* maps prefix -> collector -> peer ASN -> AS path. Announcements
    ('A') and RIB entries ('R') insert or overwrite the path; withdrawals
    ('W') remove it and prune any dictionaries left empty. Other update
    types are ignored. Returns *routes*.
    """
    if u['type'] not in ('A', 'W', 'R'):
        return(routes)
    collector = u['collector']
    peer = u['peer_asn']
    prefix = u['fields']['prefix']
    if u['type'] in ('A', 'R'):
        # setdefault builds the missing nesting levels in one pass
        routes.setdefault(prefix, {}).setdefault(collector, {})[peer] = u['fields']['as-path']
    else:  # withdrawal
        peers = routes.get(prefix, {}).get(collector, {})
        if peer in peers:
            del peers[peer]
            if not peers:
                del routes[prefix][collector]
                if not routes[prefix]:
                    del routes[prefix]
    return(routes)
def parseUpdate(lineSplited, header):
    """Rebuild an update dict from a CSV line already split on commas.

    *header* maps column names to their indices. The 'fields' column holds
    JSON which may itself contain commas, so every token from its index to
    the end of the line is re-joined before decoding.
    """
    u = {key: lineSplited[header[key]]
         for key in ('collector', 'dump_time', 'type', 'time', 'peer_address', 'peer_asn')}
    u['fields'] = json.loads(",".join(lineSplited[header['fields']:]))
    return(u)
def getUpdatesInfos(updates):
    """Scan an updates dump and return ``(header, startTime, endTime)``.

    *updates* is either a CSV file path or an iterable of CSV lines. The
    column index map comes from the first line, the start time from the
    'time' column of the second line, the end time from the last line.
    """
    fromPath = type(updates) == str
    lines = open(updates) if fromPath else updates
    headerLine = firstLine = lastLine = ""
    for idx, line in enumerate(lines):
        row = line[:-1].split(',')
        if idx == 0:
            headerLine = row
        elif idx == 1:
            firstLine = row
        lastLine = row
    if fromPath:
        lines.close()
    header = utils.getIndexList(headerLine)
    startTime = int(parseUpdate(firstLine, header)["time"])
    endTime = int(parseUpdate(lastLine, header)["time"])
    return(header, startTime, endTime)
class RoutesSnapshot(utils.BmlProcess):
    """Replay an updates dump over an initial routing table to build a snapshot.

    *updates* is a CSV file path or an iterable of CSV lines; *routes* is the
    nested prefix -> collector -> peer ASN -> AS-path table to start from.
    """

    def __init__(self, updates, routes, outfolder, logFiles):
        utils.BmlProcess.__init__(self, logFiles)
        self.routes = routes
        self.updates = updates
        self.filePath = self.getFilePath(outfolder)  # None when outfolder is None

    def buildRoutesSnapshot(self):
        """Apply every update of the dump (skipping the header row) to self.routes."""
        header, self.startProgress, self.endProgress = getUpdatesInfos(self.updates)
        if(type(self.updates)==str):
            lines = open(self.updates)
        else:
            lines = self.updates
        for i, line in enumerate(lines):
            if i == 0:
                continue  # header row
            u = parseUpdate(line[:-1].split(','), header)
            self.printProgress(int(u['time']))
            self.routes = updateRoutes(self.routes, u)
        if(type(self.updates)==str):
            lines.close()

    def getFilePath(self, path):
        """Return the snapshot output path, or None when no folder is given."""
        if(path is None):
            return(None)
        return(utils.mkdirPath(path) + "routes.json")

    def save(self):
        """Write the routing table to self.filePath as JSON."""
        # with-statement closes the file even if json.dump fails (original leaked it)
        with open(self.filePath, "w") as file:
            json.dump(self.routes, file)

    def execute(self):
        """Build the snapshot, log progress, and save it when a path is set."""
        timeAtStart = time.time()
        self.log("#################")
        self.log("# Route Snapshot")
        self.log("#################")
        if(type(self.updates)==str):
            self.log("Updates file: " + self.updates)
        else:
            self.log("Nb. of updates: " + str(len(self.updates)))
        self.log("Nb. of prefix in initial route snapshot: " + str(len(self.routes)))
        self.buildRoutesSnapshot()
        self.log("Computation time: " + utils.timeFormat(time.time()-timeAtStart))
        self.log("Nb. of prefix in route snapshot: " + str(len(self.routes.keys())))
        if(self.filePath is not None):
            self.save()
            self.log("Route snapshot saved to: " + self.filePath)
def dumpRoutes(updates, routes=None, outFolder=None, logFiles=None):
    """Build a route snapshot from an updates dump.

    Args:
        updates: CSV file path or iterable of CSV lines.
        routes: initial routing table (defaults to empty).
        outFolder: when set, the snapshot is saved to outFolder/routes.json
            and a log file is written there too.
        logFiles: optional list of open log files to also write to.

    Returns:
        (routes, filePath) — filePath is None when outFolder is None.
    """
    if routes is None:
        routes = {}
    if logFiles is None:
        logFiles = []
    logFile = None
    if outFolder is not None:
        logFile = open(utils.mkdirPath(outFolder) + "routes_snapshot.log", 'w')
        logFiles.append(logFile)
    try:
        routesSnapshot = RoutesSnapshot(updates, routes, outFolder, logFiles)
        routesSnapshot.execute()
    finally:
        # Close the log even when execute() raises (original leaked it).
        if logFile is not None:
            logFile.close()
    return(routesSnapshot.routes, routesSnapshot.filePath)
1.3 dataset模块
dataset模块是BML.data模块中主要执行模块,用来启动数据收集程序,也会同步输出日志。
#!/usr/bin/env python3
import sys, os, random
import json
from BML import utils
from BML.data.updates import dumpUpdates
from BML.data.routes import dumpRoutes
def collectData(label, start_time, end_time, name, folder, params, logFiles=None):
    """Collect one labeled data sample into folder/label/name.

    First collects "priming" updates (the ``PrimingPeriod`` minutes before
    *start_time*) and condenses them into a gzipped route snapshot, then
    collects and gzips the updates of the observation window
    [start_time, end_time]. Either stage is skipped when its gzipped output
    already exists and ``params["SkipIfExist"]`` is true.
    """
    # BUGFIX: the original signature used a mutable default (logFiles=[]);
    # that list persisted across calls, so open log files accumulated from
    # one sample to the next. Use the None-sentinel idiom instead.
    if logFiles is None:
        logFiles = []
    e_folder = utils.mkdirPath(folder + label + os.sep + str(name))
    logFiles.append(open(e_folder + "log_collect_sample.log", "w"))
    utils.printAndLog("##################", logFiles)
    utils.printAndLog("# Collect sample", logFiles)
    utils.printAndLog("#################", logFiles)
    utils.printAndLog("Name: {}".format(name), logFiles)
    utils.printAndLog("Label: {}".format(label), logFiles)
    utils.printAndLog("Start time: {}".format(start_time), logFiles)
    utils.printAndLog("End time: {}".format(end_time), logFiles)
    utils.printAndLog("**************************", logFiles)
    utils.printAndLog("* Priming data collection", logFiles)
    utils.printAndLog("**************************", logFiles)
    if(os.path.exists(e_folder + "priming_data" + os.sep +"routes.json.gz") and params["SkipIfExist"]):
        utils.printAndLog("Data exists, skipped", logFiles)
    else:
        # Priming: replay PrimingPeriod minutes of updates into a route snapshot.
        paramsUpdate = {
            "Projects": params["Projects"],
            "Collectors": params["Collectors"],
            "IpVersion": params["IpVersion"],
            "UseRibs": params["UseRibsPriming"],
        }
        primingUpdatesFile = dumpUpdates(start_time-params["PrimingPeriod"]*60, start_time, e_folder + "priming_data", params=paramsUpdate, logFiles=logFiles[:])
        _ , primingDumpFile = dumpRoutes(primingUpdatesFile, routes={}, outFolder=e_folder + "priming_data", logFiles=logFiles[:])
        utils.gzipFile(primingDumpFile, remove=True)
        os.remove(primingUpdatesFile)  # raw priming updates no longer needed
    utils.printAndLog("********************", logFiles)
    utils.printAndLog("* Data collection", logFiles)
    utils.printAndLog("********************", logFiles)
    if(os.path.exists(e_folder + "data" + os.sep +"updates.csv.gz") and params["SkipIfExist"]):
        utils.printAndLog("Data exists, skipped", logFiles)
    else:
        # Observation window: dump updates and gzip the resulting CSV.
        paramsUpdate = {
            "Projects": params["Projects"],
            "Collectors": params["Collectors"],
            "IpVersion": params["IpVersion"],
            "UseRibs": params["UseRibsData"],
        }
        updatesFilePath = dumpUpdates(start_time, end_time, e_folder + "data", params=paramsUpdate, logFiles=logFiles[:])
        utils.gzipFile(updatesFilePath, remove=True)
class Dataset():
    """Configure and generate data-collection jobs for labeled time periods."""

    def __init__(self, folder):
        # Default collection parameters; see setParams() for overrides.
        self.params = {
            "Projects" : ['ris','routeviews'],
            "Collectors" : [],
            "IpVersion" : [4,6],
            "PrimingPeriod" : 1*60,        # minutes of updates used for priming
            "UseRibsPriming" : False,
            "UseRibsData" : False,
            "SkipIfExist" : True
        }
        self.folder = folder
        # BUGFIX: initialize so getJobs() does not raise AttributeError when
        # setPeriodsOfInterests() was never called (original left this unset).
        self.periodsOfInterests = []

    def setParams(self, params):
        """Override default parameters; exit on an unknown key."""
        for k, v in params.items():
            if k not in self.params:
                sys.exit("Unrecognized parameter:"+k)
            self.params[k] = v

    def setPeriodsOfInterests(self, periodsOfInterests):
        """Set the periods to collect (dicts with label/name/start_time/end_time[/params])."""
        self.periodsOfInterests = periodsOfInterests

    def getJobs(self):
        """Return shuffled job descriptors, one collectData call per period."""
        jobs = []
        for period in self.periodsOfInterests:
            params = self.params.copy()
            # Per-period parameter overrides, validated like setParams().
            for k, v in period.get("params", {}).items():
                if k not in params:
                    sys.exit("Unrecognized parameter:"+k)
                params[k] = v
            jobs.append({
                'includes' : "from BML.data.dataset import collectData",
                'target': "collectData",
                'args': (period["label"], period["start_time"], period["end_time"],
                         period["name"], self.folder, params),
                'kwargs': {'logFiles':["LOG_ONLY"]}
            })
        random.shuffle(jobs)
        return(jobs)
2. BML.utils
BML.utils的主要功能是启动多进程、执行作业和打印日志等。
2.1 processing_queue模块
processing_queue是进行多进程管控的模块,默认开启16进程。
import multiprocessing, time
class ProcessingQueue(object):
    """Run queued (target, args, kwargs) jobs across a pool of worker processes.

    Attributes:
        queue (list): jobs waiting to start.
        running (list): per-slot job currently running (None for idle slots).
        finish (list): jobs that have completed.
        processes (list): per-slot multiprocessing.Process, or None.
        nbProcess (int): number of worker slots.
    """

    def __init__(self, nbProcess=16):
        """Create *nbProcess* idle worker slots.

        Args:
            nbProcess (int, optional): size of the process pool.
        """
        self.queue = []
        self.running = []
        self.finish = []
        self.processes = []
        self.nbProcess = nbProcess
        for _ in range(self.nbProcess):
            self.processes.append(None)
            self.running.append(None)

    def stop(self):
        """Terminate every started worker process."""
        for proc in self.processes:
            # BUGFIX: original called terminate() unconditionally and crashed
            # with AttributeError on slots that never started a process.
            if proc is not None:
                proc.terminate()

    def runOnce(self):
        """One scheduling pass: reap finished slots, start queued jobs.

        Returns:
            bool: True while at least one process is (still) alive.
        """
        processesAlive = False
        for i in range(self.nbProcess):
            if self.processes[i] is not None and self.processes[i].is_alive():
                processesAlive = True
                continue
            # Slot is free: move its previous job (if any) to the finish list.
            if self.running[i] is not None:
                self.finish.append(self.running[i])
                self.running[i] = None
            if len(self.queue) > 0:
                (target, args, kwargs) = self.queue[0]
                self.processes[i] = multiprocessing.Process(target=target, args=args, kwargs=kwargs)
                self.processes[i].start()
                processesAlive = True
                self.running[i] = (target, args, kwargs)
                self.queue.pop(0)
        return(processesAlive)

    def waitUntilFree(self):
        """Block until at least one worker slot is idle or finished."""
        while True:
            for i in range(self.nbProcess):
                if self.processes[i] is None or not self.processes[i].is_alive():
                    return
            time.sleep(1)

    def join(self):
        """Wait for every started process to finish."""
        for proc in self.processes:
            if proc is not None:
                proc.join()

    def run(self, logFilePath=""):
        """Drive the queue to completion, optionally logging state each pass.

        Args:
            logFilePath (str, optional): when non-empty, queue state is
                rewritten to this file once per scheduling pass.

        On any error, all workers are terminated and the exception re-raised.
        """
        processesAlive = False
        try:
            while len(self.queue) > 0 or processesAlive:
                processesAlive = self.runOnce()
                self.runLog(logFilePath)
                time.sleep(1)
            for i in range(self.nbProcess):
                if self.processes[i] is not None:
                    self.processes[i].join()
                    self.processes[i].close()
        except Exception:
            for proc in self.processes:
                if proc is not None:
                    proc.terminate()
            raise

    def addProcess(self, target=None, args=(), kwargs=None):
        """Queue a job for later execution.

        Args:
            target (callable, optional): function to run in a subprocess.
            args (tuple, optional): positional arguments for *target*.
            kwargs (dict, optional): keyword arguments for *target*.
        """
        # BUGFIX: original used a mutable default argument (kwargs={}).
        self.queue.append((target, args, {} if kwargs is None else kwargs))

    def formatLog(self, listP):
        """Render a job list as a fixed-width text table.

        Args:
            listP (list): (target, args, kwargs) tuples, possibly with Nones.

        Returns:
            str: the formatted table.
        """
        i_space = len(str(len(listP)))
        # Column widths grow to fit the longest entry.
        t_space = len("Function")
        a_space = len("Args")
        kw_space = len("Kwargs")
        for entry in listP:
            if entry is not None:
                (target, args, kwargs) = entry
                t_space = max(t_space, len(str(target.__name__)))
                a_space = max(a_space, len(str(args)))
                kw_space = max(kw_space, len(str(kwargs)))
        vline = ("="*(t_space+a_space+kw_space+13)) + "\n"
        log = vline
        log += ("|{:<"+str(i_space)+"s}| {:"+str(t_space)+"s} | {:"+str(a_space)+"s} | {:"+str(kw_space)+"s} | \n").format("#","Function","Args","Kwargs")
        descr = "|{:<"+str(i_space)+"d}| {:"+str(t_space)+"s} | {:"+str(a_space)+"s} | {:"+str(kw_space)+"s} | \n"
        log += vline
        for i, entry in enumerate(listP):
            if entry is not None:
                (target, args, kwargs) = entry
                log += descr.format(i, target.__name__, str(args), str(kwargs))
            else:
                log += descr.format(i, "Empty", "", "")
        log += vline
        return(log)

    def runLog(self, logFilePath):
        """Overwrite *logFilePath* with the running/waiting/finished tables.

        Args:
            logFilePath (str): destination path; empty string disables logging.
        """
        if logFilePath == "":
            return
        log = ""
        for title, listP in (("Running", self.running),
                             ("Waiting", self.queue),
                             ("Finish", self.finish)):
            log += "#######################\n"
            log += "# Queue : " + title + " \n"
            log += "#######################\n"
            log += self.formatLog(listP) + "\n"
        # with-statement handles closing; original also called close() redundantly
        with open(logFilePath, "w") as file:
            file.write(log)
2.2 bml_process模块
bml_process模块用于打印进程的运行情况日志。
import sys
from BML import utils
class BmlProcess():
def