让python程序中每个数据处理过程专注自己的逻辑,尽量不受输入输出的影响。参考了pipe包。
#!/usr/bin/python
#coding:utf-8
import sys
import itertools
'''
处理流程模仿 shell 流处理 各个处理环节用 | 符号连接
函数名模仿 sql 语句 (数据处理有顺序相关的缺省认为已排序,未排序的需先用sort函数排序)
例如:
nameage = ['zs\t10','ls\t12','ww\t13'] | split()
nameweight = ['zs\t11.1','ls\t12.2','ww\t13.3'] | split()
( nameage | join(nameweight,onindex([0],[0])) |
flatten(1,(None,None)) | flatten(0,(None,None)) |
selectindex(0,1,3) |
wheredata(lambda data:data[0]!=None) |
concat() |
foreach(printf)
)
结果:
zs 10 11.1
ls 12 12.2
ww 13 13.3
可以定义字段名,结果相同:
nameage = ['zs\t10','ls\t12','ww\t13'] | split() | asname('name,age')
nameweight = ['zs\t11.1','ls\t12.2','ww\t13.3'] | split() | asname('name,weight')
( nameage | join(nameweight,onkey('name','name')) | asname('na,nw') |
function(lambda na,nw:(getattr(na,'name'),getattr(na,'age'),getattr(nw,'name'),getattr(nw,'weight'))) |
asname('name,age,nwname,weight') |
select('name,age,weight') |
where(lambda name,age,weight:name!=None) |
concat() |
foreach(printf)
)
'''
class instream:
    '''Wrap a callable so it can be used as one stage of a ``|`` pipeline.

    ``upstream | stage`` calls the wrapped callable with ``upstream`` as its
    first argument.  ``stage(a, b=1)`` does not run anything; it returns a
    new ``instream`` whose callable remembers the extra arguments.
    '''

    def __init__(self, function):
        '''Store the wrapped callable (runs when ``@instream`` decorates a function).'''
        self.function = function

    def __ror__(self, preiteration):
        '''Handle ``preiteration | self``: feed the left operand to the wrapped callable.'''
        return self.function(preiteration)

    def __call__(self, *args, **kwargs):
        '''Bind extra arguments and return a new single-argument stage.

        The returned ``instream`` wraps a function that takes only the
        upstream value and forwards it, followed by ``args``/``kwargs``, to
        the original callable.
        '''
        def bound_stage(upstream):
            # Look up self.function at call time, as the pipeline runs.
            return self.function(upstream, *args, **kwargs)

        return instream(bound_stage)
@instream
def readfile(filename):
    '''Yield the lines of the file ``filename`` one by one.

    The path is the value on the left of the pipe, e.g. ``'data.txt' | readfile``.
    Lines are yielded exactly as read (line endings are kept); the file is
    closed when the generator finishes.
    '''
    with open(filename) as handle:
        for line in handle:
            yield line
@instream
def split(iteration, sep="\t"):
    '''Turn each text line into a list of fields.

    Trailing ``\\r`` / ``\\n`` characters are stripped first, then the line is
    split on ``sep`` (a tab by default).
    '''
    for line in iteration:
        stripped = line.rstrip("\r\n")
        yield stripped.split(sep)
@instream
def concat(iteration, sep="\t", append="\n"):
    '''Turn each record back into one text line.

    Every field is converted with ``str``, the fields are joined with ``sep``
    and ``append`` is added at the end (tab-separated, newline-terminated by
    default).
    '''
    for record in iteration:
        # A list comprehension is used rather than map(): this module
        # rebinds the name ``map`` to a pipeline stage.
        pieces = [str(field) for field in record]
        yield sep.join(pieces) + append
from collections import namedtuple
# Cache of generated namedtuple classes, keyed by their tuple of field names.
namedtuples = {}


def getnamedtuple(fieldnames):
    '''Return a (cached) namedtuple class for a field-name specification.

    ``fieldnames`` is a string of names separated by commas and/or
    whitespace, e.g. ``'name,age'``.  The class name is derived from the
    specification by replacing commas with underscores.

    The cache is keyed on the parsed tuple of field names rather than on the
    derived class name: different specifications such as ``'a_b'`` (one
    field) and ``'a,b'`` (two fields) produce the same class name and would
    otherwise be confused with each other.
    '''
    # Same parsing that collections.namedtuple applies to a string spec.
    fields = tuple(fieldnames.replace(",", " ").split())
    nt = namedtuples.get(fields)
    if nt is None:
        tuplename = "_".join(fieldnames.strip().replace(",", "_").split())
        nt = namedtuple(tuplename, fields)
        namedtuples[fields] = nt
    return nt
@instream
def asname(iteration, fieldnames):
    '''Attach field names to each record.

    Every incoming record is unpacked positionally into the namedtuple class
    returned by ``getnamedtuple(fieldnames)``, so the number of values must
    match the number of names.
    '''
    record_type = getnamedtuple(fieldnames)
    for values in iteration:
        yield record_type(*values)
from operator import attrgetter
@instream
def select(iteration, fieldnames):
    '''Project each named record onto the comma-separated ``fieldnames``.

    Yields instances of the namedtuple class for ``fieldnames`` holding the
    attributes of the same names read from each incoming record.

    A single field name is handled separately: ``attrgetter`` with one name
    returns the bare value instead of a tuple, and star-unpacking that value
    would either fail or split a string into characters.
    '''
    names = [name.strip() for name in fieldnames.strip().split(",")]
    nt = getnamedtuple(fieldnames)
    if len(names) == 1:
        pick = attrgetter(names[0])
        for data in iteration:
            yield nt(pick(data))
    else:
        pick = attrgetter(*names)
        for data in iteration:
            yield nt(*pick(data))
from operator import itemgetter
@instream
def selectindex(iteration, *indexs):
    '''Project each record onto the given positions.

    With several indexes a tuple of the selected items is yielded; with a
    single index ``itemgetter`` returns, and this stage yields, the bare item
    rather than a 1-tuple.
    '''
    pick = itemgetter(*indexs)
    for record in iteration:
        projected = pick(record)
        yield projected
from itertools import ifilter
@instream
def where(iteration, condfunc):
    '''Keep the records for which ``condfunc(*record)`` is true.

    Each record is unpacked into positional arguments, so ``condfunc`` takes
    one parameter per field.
    '''
    return (record for record in iteration if condfunc(*record))
from itertools import ifilter
@instream
def wheredata(iteration, condfunc):
    '''Keep the records for which ``condfunc(record)`` is true.

    Unlike ``where`` the record is passed as a single argument.  A
    ``condfunc`` of ``None`` keeps the records that are themselves truthy.
    '''
    predicate = bool if condfunc is None else condfunc
    return (record for record in iteration if predicate(record))
@instream
def group(iteration, keyfunc=None, valuefunc=None, aggfunc=None):
    '''Group consecutive records sharing a key and yield ``(key, values)`` pairs.

    Args:
        iteration: upstream records; only adjacent records with equal keys
            are grouped, so sort by the key first when needed.
        keyfunc: computes the grouping key (``None`` uses the record itself).
        valuefunc: maps each record of a group to the value that is kept
            (identity when omitted).
        aggfunc: optional reducer applied to the iterator of values of a
            group; when omitted the lazy iterator itself is yielded.

    The lazy iterator of one group is only valid until the next pair is
    requested, because it shares the underlying ``groupby`` iterator.
    '''
    if not valuefunc:
        valuefunc = lambda x: x
    # ``groupby`` is not imported by name in this module; go through the
    # ``itertools`` module, which is.
    for key, members in itertools.groupby(iteration, keyfunc):
        values = (valuefunc(member) for member in members)
        if aggfunc:
            yield key, aggfunc(values)
        else:
            yield key, values
@instream
def aggregate(iteration, aggfunc=None):
    '''Reduce the whole upstream with ``aggfunc`` and return its result.

    ``aggfunc`` receives the upstream iterable as its only argument
    (for instance ``sum`` or ``list``).  Although it has a default of
    ``None``, a callable must be supplied.
    '''
    result = aggfunc(iteration)
    return result
@instream
def limit(iteration, nlimit, start=0):
    '''Yield at most ``nlimit`` records, skipping the first ``start`` ones.

    Records at positions ``start <= i < start + nlimit`` (0-based) are
    yielded.  Iteration stops as soon as the last wanted record has been
    produced, so the upstream is not consumed beyond the window and the
    stage also terminates on unbounded inputs.
    '''
    end = start + nlimit
    if end <= 0:
        # Empty window: nothing to yield, nothing to consume.
        return
    for i, data in enumerate(iteration):
        if i >= start:
            yield data
        if i + 1 >= end:
            # Window exhausted; do not pull another record from upstream.
            return
from collections import deque
@instream
def taillimit(iteration, nlimit, end=0):
    '''Yield up to ``nlimit`` records that precede the last ``end`` records.

    The mirror image of ``limit``: ``end`` records are skipped at the tail
    and the ``nlimit`` records just before them are yielded in their
    original order.  The whole upstream is consumed, but only the last
    ``end + nlimit`` records are kept in memory.

    When the input is shorter than ``end + nlimit`` the trailing ``end``
    records are still excluded, so fewer than ``nlimit`` records (possibly
    none) are produced.
    '''
    buffer = deque()
    length = end + nlimit
    for data in iteration:
        buffer.append(data)
        if len(buffer) > length:
            buffer.popleft()
    # Records at or after this position belong to the skipped tail.
    keep = len(buffer) - end
    for i, data in enumerate(buffer):
        if i < nlimit and i < keep:
            yield data
@instream
def map(iteration, func):
    '''Lazily apply ``func`` to every record, passed as a single argument.

    A ``func`` of ``None`` wraps each record in a 1-tuple, matching what
    ``itertools.imap(None, ...)`` produces.  Note that this stage rebinds the
    name ``map`` inside this module.
    '''
    if func is None:
        return ((record,) for record in iteration)
    return (func(record) for record in iteration)
@instream
def function(iteration, func):
    '''Lazily apply ``func`` to every record, unpacked into positional arguments.

    Each record is star-expanded, so ``func`` takes one parameter per field.
    '''
    return (func(*record) for record in iteration)
@instream
def flatten(iteration, index, default=()):
    '''Splice the nested sequence at position ``index`` into each record.

    For every record the element at ``index`` is replaced by its own items,
    and the result is yielded as a flat tuple.  When that element is
    ``None`` the items of ``default`` are spliced in instead.

    Negative indexes are supported: the position is normalised first,
    because with a raw negative index the "rest of the record" slice
    ``data[index + 1:]`` would wrap around (for ``-1`` it is the whole
    record).
    '''
    for data in iteration:
        # Index with the caller's value first so an out-of-range index raises
        # IndexError exactly like ordinary indexing.
        nested = data[index]
        position = index if index >= 0 else index + len(data)
        if nested is None:
            nested = default
        yield tuple(data[:position]) + tuple(nested) + tuple(data[position + 1:])
@instream
def lag(iteration, n=1):
    '''Yield sliding windows pairing each record with the ``n`` records after it.

    While the upstream lasts, every window is a tuple of ``n + 1``
    consecutive records.  Once the upstream is exhausted the remaining
    windows are emitted too, each one record shorter than the previous one,
    so every record starts exactly one window.
    '''
    window = deque()
    for record in iteration:
        window.append(record)
        if len(window) > n:
            yield tuple(window)
            window.popleft()
    # Drain the tail: the windows shrink until nothing is left.
    while window:
        yield tuple(window)
        window.popleft()
@instream
def join(iteration, otheriteration, cmpfunc=None):
    '''Merge two streams into ``(left, right)`` pairs (full outer merge).

    Args:
        iteration: left stream (the upstream of the pipe).
        otheriteration: right stream.
        cmpfunc: ``cmpfunc(left, right)`` returning a negative, zero or
            positive number, e.g. built with ``onkey`` or ``onindex``.  When
            omitted, records are paired purely by position.

    Yields:
        ``(left, right)`` when the records compare equal, ``(left, None)``
        when the left record sorts first or the right stream has ended, and
        ``(None, right)`` in the opposite case.

    Both streams are read once, in step; as the module notes state,
    order-dependent stages assume their inputs are already sorted.  After a
    match both sides advance, so each record is paired at most once.
    '''
    iteration = iter(iteration)
    otheriteration = iter(otheriteration)
    living = True        # left stream still has records
    otherliving = True   # right stream still has records
    comp = 0             # last comparison result; pinned once a side ends
    get = True           # fetch a new left record on this pass
    otherget = True      # fetch a new right record on this pass
    while True:
        if get:
            try:
                # Built-in next() works on Python 2.6+ and Python 3 alike,
                # unlike the Python-2-only ``.next()`` method.
                data = next(iteration)
            except StopIteration:
                living = False
                data = None
                comp = 1    # only right records remain
        if otherget:
            try:
                otherdata = next(otheriteration)
            except StopIteration:
                otherliving = False
                otherdata = None
                comp = -1   # only left records remain
        if living and otherliving:
            comp = cmpfunc(data, otherdata) if cmpfunc else 0
        elif not living and not otherliving:
            break
        if comp < 0:
            yield data, None
            get, otherget = True, False
        elif comp > 0:
            yield None, otherdata
            get, otherget = False, True
        else:
            yield data, otherdata
            get, otherget = True, True
from operator import attrgetter
def onkey(firstname, secondname):
    '''Build a comparison function on named fields, for use with ``join``.

    ``firstname`` and ``secondname`` are comma-separated attribute names read
    from the left and right record respectively.  The returned
    ``compare(x, y)`` gives -1, 0 or 1 depending on how the key of ``x``
    orders against the key of ``y``.
    '''
    firstkey = attrgetter(*[name.strip() for name in firstname.strip().split(",")])
    secondkey = attrgetter(*[name.strip() for name in secondname.strip().split(",")])

    def compare(x, y):
        left, right = firstkey(x), secondkey(y)
        # Same result as cmp(left, right), which no longer exists in Python 3.
        return (left > right) - (left < right)

    return compare
from operator import itemgetter
def onindex(firstindexs, secondindexs):
    '''Build a comparison function on positional fields, for use with ``join``.

    ``firstindexs`` and ``secondindexs`` are sequences of indexes read from
    the left and right record respectively.  The returned ``compare(x, y)``
    gives -1, 0 or 1 depending on how the key of ``x`` orders against the key
    of ``y``.
    '''
    firstkey = itemgetter(*firstindexs)
    secondkey = itemgetter(*secondindexs)

    def compare(x, y):
        left, right = firstkey(x), secondkey(y)
        # Same result as cmp(left, right), which no longer exists in Python 3.
        return (left > right) - (left < right)

    return compare
def getattr(obj, attrname):
    '''Return attribute ``attrname`` of ``obj``, or ``None`` when it cannot be read.

    This is a forgiving lookup for records that may be ``None`` (for
    example the missing side of a ``join``).  It rebinds the name of the
    built-in ``getattr`` inside this module and takes no default argument.

    Only ordinary errors are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate instead of being turned into ``None``.
    '''
    try:
        return attrgetter(attrname)(obj)
    except Exception:
        return None
def getitem(obj, index):
    '''Return ``obj[index]``, or ``None`` when the item cannot be read.

    This is a forgiving lookup for records that may be ``None`` or too short.
    Only ordinary errors are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate instead of being turned into ``None``.
    '''
    try:
        return itemgetter(index)(obj)
    except Exception:
        return None
@instream
def foreach(iteration, func=None, initfunc=None, finalfunc=None):
    '''Consume the upstream, calling ``func`` on every record for its side effects.

    Args:
        iteration: upstream records; always consumed completely.
        func: called as ``func(record, *context)`` for each record; when
            omitted the records are simply drained.
        initfunc: called once before iterating; its return value is the
            ``context`` that is star-expanded into the other callbacks
            (an empty tuple when omitted).
        finalfunc: called once at the end as ``finalfunc(*context)``.

    This is a terminal stage: it runs immediately and returns ``None``.
    '''
    context = initfunc() if initfunc else ()
    for record in iteration:
        if not func:
            continue
        func(record, *context)
    if finalfunc:
        finalfunc(*context)
from itertools import tee
@instream
def duplicate(iteration, n=2):
    '''Split the upstream into ``n`` independent iterators over the same records.

    Returns the tuple produced by ``itertools.tee``.
    '''
    copies = itertools.tee(iteration, n)
    return copies
@instream
def union(iteration, *otheriterations):
    '''Concatenate the upstream with the other iterables, in order.

    All records of ``iteration`` come first, followed by those of each of
    ``otheriterations`` in turn; nothing is de-duplicated.
    '''
    # ``chain`` is not imported by name in this module; go through the
    # ``itertools`` module, which is.
    return itertools.chain(iteration, *otheriterations)
def printf(data):
    '''Print ``data`` to standard output without appending a newline.

    Uses the Python 2 print statement with a trailing comma, so records that
    already end in a newline (such as the output of ``concat``) are printed
    as they are.
    '''
    print >>sys.stdout, data,
def println(data):
    '''Print ``data`` to standard output followed by a newline (Python 2 print statement).'''
    print >>sys.stdout, data