有时需要做类似流处理的程序,对于存在分支、合并的处理,函数式编程不太适合,而用hadoop、spark、storm等工具可能又没太大必要。
做个简单的多协程协作处理框架,基于处理的模型,处理逻辑可以任意发挥。
#!/usr/bin/python
#coding:utf8
import sys
from collections import deque
try:
    from enum import Enum
except ImportError:
    # Interpreters without the standard `enum` module get a minimal stand-in
    # that supports only the functional form used below.
    class Enum(object):
        """Fallback enum: each whitespace-separated key becomes an int attribute."""

        def __init__(self, name, keys):
            # `name` is accepted for signature compatibility and is not stored.
            # Values are numbered from 1 in the order the keys are listed.
            for i, key in enumerate(keys.split()):
                setattr(self, key, i + 1)

# Other files that import this module need:
#     from cowork import cowork, MessageType
MessageType = Enum('MessageType', 'Start Data Request Close Stop')
# Run several worker functions cooperatively.
def cowork(config, variables=None):
    """Drive the workers named in *config* by routing messages between them.

    Every worker is wrapped in a ``workerengine`` coroutine. Each one first
    receives a Start message; messages returned by the coroutines are then
    delivered in FIFO order until none remain, after which every worker
    receives a Stop message.

    Args:
        config: dict mapping each worker function's name to that worker's
            settings dict.
        variables: namespace (dict) in which the worker names are looked up.
            When omitted or empty, the namespace returned by the module-level
            ``getvars`` callable is used.
    """
    # This function's own name is used as the sender of engine-level messages.
    fname = sys._getframe().f_code.co_name
    if not variables:
        variables = getvars()

    # Resolve each configured name to its function and wrap it in an engine
    # coroutine. Engines are keyed by the function's __name__, which is also
    # the name other workers address their messages to.
    originalworkers = [variables[name] for name in config]
    doworkers = dict(
        (worker.__name__, workerengine(worker, config))
        for worker in originalworkers
    )

    # Advance every coroutine to its first yield, then queue its Start message.
    messagebuffer = deque()
    for name, doworker in doworkers.items():
        next(doworker)
        messagebuffer.append((name, fname, MessageType.Start, None))

    # Deliver queued messages one at a time; any replies join the queue.
    while messagebuffer:
        target, source, cmd, info = messagebuffer.popleft()
        if debug:
            sys.stderr.write("to send " + str((target, source, cmd, info)) + "\n")
        messages = doworkers[target].send((target, source, cmd, info))
        messagebuffer.extend(message for message in messages if message)

    # The queue has drained: tell every worker to stop.
    for name, doworker in doworkers.items():
        try:
            doworker.send((name, fname, MessageType.Stop, None))
        except StopIteration:
            # Expected: the engine leaves its loop when Stop gets the default
            # handling, which ends the generator.
            pass
        else:
            # The worker answered Stop with messages of its own, so the engine
            # is still suspended; the replies are dropped and it is closed.
            doworker.close()
# Set to a true value to make cowork() trace each delivered message on stderr.
debug = 0

# When cowork() is called from another file the worker functions live in a
# different namespace, so cowork() needs its second argument. After
# setvarfunc() has registered a namespace provider, it can be omitted.
getvars = None


def setvarfunc(f):
    """Register *f* as the callable that returns the namespace holding the workers.

    Args:
        f: zero-argument callable returning a dict that maps worker names to
            worker functions.
    """
    global getvars
    getvars = f


# Default provider: this module's own namespace. The line can be copied into
# another file so that cowork() there needs only the config argument.
setvarfunc(lambda x=vars(): x)
# Engine that calls a worker. It implements default behaviour for every
# message type, so a worker only has to handle what it cares about.
def workerengine(worker, config):
    """Generator coroutine that wraps *worker* with default message handling.

    Each value sent in is a ``(target, sender, cmd, info)`` tuple. Each yield
    hands back the list of ``(receiver, sender, cmd, info)`` messages produced
    in response to the previous message; the first ``next()`` yields ``[]``.
    The generator finishes (so ``send`` raises StopIteration) when a Stop
    message is left to the default handling.

    Args:
        worker: callable ``worker(workconfig, cmd, info, sender)``. It returns
            a list of ``(receiver, cmd, info)`` tuples to send, or None to ask
            for the default handling of the message.
        config: dict of per-worker settings keyed by worker function name. The
            keys read here are "requestfrom" (upstream worker names) and
            "sendto" (downstream worker names).
    """
    # Worker name, its settings, where it requests data from and sends data to.
    fname = worker.__name__
    workconfig = config.get(fname, {})
    # Work on a copy: Close messages shrink this list, and the caller's config
    # should not be modified as a side effect.
    sourcenames = list(workconfig.get("requestfrom", []))
    targetnames = workconfig.get("sendto", [])
    result = []
    while True:
        # Hand back the replies to the previous message and wait for the next.
        target, sender, cmd, info = (yield result)
        result = []
        messages = worker(workconfig, cmd, info, sender)
        if messages is not None:
            # The worker handled the message itself: stamp its replies with
            # this worker's name as the sender.
            for receiver, outcmd, outinfo in messages:
                result.append((receiver, fname, outcmd, outinfo))
        elif cmd == MessageType.Stop:
            # Leave the loop, which ends the generator.
            break
        elif cmd == MessageType.Start:
            # A worker that sends to nobody starts the flow by requesting
            # data from each of its sources.
            if not targetnames:
                for sourcename in sourcenames:
                    result.append((sourcename, fname, MessageType.Request, None))
        elif cmd == MessageType.Close:
            # A source has closed: stop requesting from it. A repeated Close,
            # or one from a sender that is not a source, is ignored.
            if sender in sourcenames:
                sourcenames.remove(sender)
        elif cmd == MessageType.Request:
            # Pass the request on to every upstream source.
            for sourcename in sourcenames:
                result.append((sourcename, fname, MessageType.Request, None))
        elif cmd == MessageType.Data:
            if targetnames:
                # Forward the payload to every downstream worker.
                for targetname in targetnames:
                    result.append((targetname, fname, MessageType.Data, info))
            else:
                # No downstream worker: echo the payload to stdout. This
                # stands in for the Python 2 statement `print info,` in a form
                # that also runs on Python 3. No newline is appended, and a
                # payload that does not already end a line is followed by a
                # separating space.
                text = str(info)
                sys.stdout.write(text)
                if not text.endswith("\n"):
                    sys.stdout.write(" ")
            # After handling the data, ask the sources for more.
            for sourcename in sourcenames:
                result.append((sourcename, fname, MessageType.Request, None))
# Read data from files: open them on Start, read and emit one line per input
# on Request, close them on Stop.
def fromfile(workconfig, cmd, info, sender):
    """Source worker that emits lines read from files (or from stdin).

    Args:
        workconfig: this worker's settings. "inputfile" lists the paths to
            read (stdin is used when it is missing or empty) and "sendto"
            lists the receivers. The open streams are kept under "inputs".
        cmd: the MessageType of the incoming message.
        info: message payload (unused).
        sender: name of the sending worker (unused).

    Returns:
        For Request, a list of ``(receiver, MessageType, payload)`` tuples:
        one Data message per receiver for every line read, plus one Close
        message per receiver on the request that exhausts the last input.
        For every other message, None.
    """
    res = None
    if cmd == MessageType.Start:
        inputfiles = workconfig.get("inputfile", [])
        if inputfiles:
            workconfig["inputs"] = dict((name, open(name)) for name in inputfiles)
        else:
            workconfig["inputs"] = {"stdin": sys.stdin}
    elif cmd == MessageType.Stop:
        # Close the stream objects (the dict values, not the name keys).
        # Inputs that already hit end-of-file were closed and removed earlier.
        for stream in workconfig.get("inputs", {}).values():
            stream.close()
    elif cmd == MessageType.Request:
        res = []
        inputs = workconfig["inputs"]
        targets = workconfig.get("sendto", [])
        had_open_inputs = bool(inputs)
        # Iterate over a snapshot because exhausted inputs are removed inside
        # the loop.
        for name, stream in list(inputs.items()):
            try:
                line = next(stream)
            except StopIteration:
                # End of this input: close it and stop tracking it.
                stream.close()
                del inputs[name]
            else:
                for targetname in targets:
                    res.append((targetname, MessageType.Data, line))
        # Announce Close exactly once, on the request that used up the last
        # input. Later requests find no inputs and return an empty list.
        if had_open_inputs and not inputs:
            for targetname in targets:
                res.append((targetname, MessageType.Close, None))
    return res
# Pass-through worker: it has nothing of its own to do.
def transfer(workconfig, cmd, info, sender):
    """Return None for every message, leaving the handling to the caller's defaults."""
    return None
# Print results to the screen.
def printer(workconfig, cmd, info, sender):
    """Return None for every message.

    Nothing is printed here. Returning None asks the calling engine for its
    default handling, which is what writes Data payloads to the screen for a
    worker that has no "sendto" receivers.
    """
    return None
# Write results to files: open them on Start, write on Data, close on Stop.
def tofile(workconfig, cmd, info, sender):
    """Sink worker that writes every Data payload to the configured files.

    Args:
        workconfig: this worker's settings. "outputfile" lists the paths to
            write; the open files are kept under "outputs".
        cmd: the MessageType of the incoming message.
        info: payload to write when *cmd* is Data.
        sender: name of the sending worker (unused).

    Returns:
        An empty list for Data, meaning the message was handled here and there
        is nothing to send. For every other message, None.
    """
    res = None
    if cmd == MessageType.Start:
        workconfig["outputs"] = dict(
            (name, open(name, "w")) for name in workconfig.get("outputfile", [])
        )
    elif cmd == MessageType.Stop:
        # Close the file objects (the dict values, not the name keys).
        for output in workconfig.get("outputs", {}).values():
            output.close()
    elif cmd == MessageType.Data:
        for output in workconfig["outputs"].values():
            output.write(info)
        res = []
    return res
if __name__ == '__main__':
    # Usage: python cowork.py [inputfile [outputfile]]
    # With no input file the data is read from stdin.
    infilenames = [sys.argv[1]] if len(sys.argv) > 1 else []
    outfilenames = [sys.argv[2]] if len(sys.argv) > 2 else []
    # Demo topology: fromfile -> transfer -> (tofile, printer)
    config = {
        "fromfile": {
            "inputfile": infilenames,
            "sendto": ["transfer"]
        },
        "transfer": {
            "requestfrom": ["fromfile"],
            "sendto": ["tofile", "printer"]
        },
        "tofile": {
            "requestfrom": ["transfer"],
            "outputfile": outfilenames
        },
        "printer": {
            # Left disabled so that only one of the two downstream workers
            # sends requests to transfer.
            #"requestfrom" : ["transfer"]
        },
    }
    cowork(config, vars())
测试
输入文件:
cat test.txt
1 3
4 5
8 9
执行命令:
python cowork.py test.txt test_copy.txt
1 3
4 5
8 9
同时生成test_copy.txt
计算拓扑图(后面出现多个分支时,后面节点不要重复给前面多次发送request,只有一个节点发送即可):