#!/usr/bin/python
#coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import itertools
import functools
import operator
import time
'''
处理流程模仿 shell 流处理 各个处理环节用 | > ** 符号连接(**表示输出流)
函数名模仿 sql 语句
主要函数:
fromfile(filename) #从文件读入
fromin() #从标准输入读入数据
output() #输出到标准输出,可指定输出(先用concat转化为字符串,再用printout打印出来)
tofile(filename) #输出到文件
select(fieldindex) #选择需要的几个字段
where(condfunc) #选择符合条件的记录,条件用函数计算
orderby(fieldindex,reverse) #根据fieldindex指定的字段排序,reverse表示是否逆序
groupby(fieldindex,(redfunc,paraindex,initvalue),...) #按字段分组,对每组用redfunc计算聚合值,数据需排好序
ordergroupby(fieldindex,(redfunc,paraindex,initvalue),...) #按字段排序分组,对每组用redfunc计算聚合值
join(tojoinindex,otherstream,keyindex,valueindex) #在内存缓存otherstream数据,把每条记录的tojoinindex字段与缓存数据的keyindex字段匹配做join,获得valueindex字段的值
limit(start,end,step) #限制输出记录数
calculate(func,paraindex) #用func函数生成新的计算结果,可用paraindex指定输入字段,在原来基础上增加字段
update(func) #用func函数更新数据,传入参数为整条记录,返回处理后整条记录
getcolindexs(indexdef) #从文件或list初始化字段名称索引对应的对象
colindex #定义字段名称索引对应关系的对象
| #连接两个处理,前面处理完成后做后面处理(后面主动读取前面数据处理)
** #连接两个处理,前面处理完成后做后面处理(前面向后推数据,后面被动接受处理)
> #连接前面 | 的一系列处理和后面 ** 的一系列处理,执行整个处理过程
其他函数:
split() #把字符串分隔为多个字段,缺省分隔符\t
order(keyfunc) #根据函数求值排序
group(keyfunc,(redfunc,valuefunc,initfunc),...) #根据函数求值分组,对每组用redfunc计算聚合值
stepjoin(otherstream,...,dealfunc=lambda datas:datas,getnext=lambda datas:range(len(datas))) #otherstream的数据是排序的,流入数据也是排序的,dealfunc用各流中的数据生成处理后的结果数据,getnext根据流中的数据判断下一次读哪些流的数据
transform(func,paraindex) #用func函数对数据做转换,可用paraindex指定输入字段
lag() #可以根据本条记录、上一条记录或上几条记录计算,如访问时长
flatten(index) #某个字段的值为数组,将数组下面的值展开成为直接字段值(记录数不变)
explode() #数据为数组,把数组内的每个元素展开为一条记录(可能一条记录生成多条记录)
concat() #把各字段用分隔符连接为字符串,缺省分隔符\t
printout() #输出到标准输出,可指定输出
log() #数据输出到标准错误输出或指定输出,同时还流向后续处理
duplicate(outstream,...) #复制流数据到多个输出流
flow() #同 > ,flow为全局函数,把>号前的处理作为第一个参数,>号后的处理作为第二个参数,第二个参数可以为空
run() #启动流处理,如果没有用通过>或flow调用,未启动前只是 streamdealer 对象,启动后才能输入、输出数据
例如:
userlog.txt(字段含义:user,siteid,dates)
97F3A7879AB1 3 2014-10-30 12:00:00.104
97F3A7879AB1 2 2014-10-30 12:01:00.105
492DBD88435A 2 2014-10-30 12:02:00.120
97F3A7879AB1 1 2014-10-30 12:03:00.132
492DBD88435A 2 2014-10-30 12:04:00.139
2DAA98D1DAD5 1 2014-10-30 12:05:00.130
7091DA9357E8 1 2014-10-30 12:06:00.140
881406FC0565 3 2014-10-30 12:07:00.151
7091DA9357E8 1 2014-10-30 12:08:00.155
7091DA9357E8 3 2014-10-30 12:09:00.179
userage.txt(字段含义:user,age)
97F3A7879AB1 10
492DBD88435A 12
881406FC0565 12
7091DA9357E8 10
colindexs.txt
userlog user,siteid,dates
userage user,age
简单用户PV统计:
python stream.py 'cols=getcolindexs("colindexs.txt")' '(
fromfile("userlog.txt")
| ordergroupby(cols.userlog.index("siteid"),(countred,[],0))
> output()
)'
或者
python stream.py 'userlog=colindex("user,siteid,dates"); userage=colindex("user,age")' '(
fromfile("userlog.txt")
| join([userlog.user],fromfile("userage.txt"),[userage.user],[userage.age])
> output()
)'
注:
fromfile 从 userlog.txt 文件中读数据
ordergroupby 对数据用字段索引1排序、分组,聚合函数用 countred 不需要特殊参数,初始值为 0
concat() 把记录每个字段转化为str,再按\t连接,做此格式化后方便输出
> 执行整个过程,输出到后面,用 output() 输出到屏幕
concat() > output() 可认为是比较常用的固定语法
最前、最后用()括起来保证中间 | > 符号不被回车分隔为多个语句
再用 单引号 '' 引起来保证 shell 不转义,中间用 "" 把字符串引起来
统计PV结果:
1 4
2 3
3 3
'''
class streamdealer:
def __init__(self, function):
'''在装饰函数定义时调用,把函数封装在 streamdealer 对象中'''
self.function = function
def __call__(self, *args, **kw):
'''在定义本节点处理逻辑时调用,重新封装在用于连接前后流处理的 streamdealer 对象中'''
return streamdealer(lambda prev,next: self.function(prev, next, *args, **kw))
def __ror__(self, prev):
'''用 | 连接两个处理时调用,右侧进行处理,左侧为 prev 参数,重新封装,执行时先执行前面的调用,再执行本处理'''
def func(p,n):
if prev and hasattr(prev, "run"):
thisprev = prev.run(p, None)
else:
thisprev = iter(prev)
return self.run(thisprev, None)
return streamdealer(func)
def __pow__(self, next):
'''用 ** 连接两个处理时调用,左侧进行处理,右侧为 next 参数,重新封装,执行时先执行后面的调用,再执行本处理'''
def func(p,n):
if next and hasattr(next, "run"):
thisnext = next.run(None, n, withnext=True)
else:
thisnext = next
generator = self.run(None, thisnext)
return generator
return streamdealer(func)
def __lt__(self,prev):
'''用 > 连接两个处理时调用,执行整个处理过程'''
flow(prev,self)
def __iter__(self):
'''作为迭代器时调用'''
return self.run()
def run(self, prev=None, next=None, withnext=False):
'''用 (...).run() 时调用,用于启动协程'''
generator = self.function(prev, next)
if withnext:
generator.next()
return generator
def flow(prev, next=None):
    '''Run a complete flow.

    prev is the pull side (stages joined with ``|``, or any iterable) and
    next is the optional push side (stages joined with ``**``, or an
    already started generator). Every record read from prev is sent into
    next; when prev is exhausted next is stopped.
    '''
    prev_is_dealer = prev and hasattr(prev, "run")
    source = prev.run(None, None) if prev_is_dealer else iter(prev)

    sink = next
    if next and hasattr(next, "run"):
        # Prime the push side so that it can accept send().
        sink = next.run(None, None, withnext=True)

    for record in source:
        if sink:
            sink.send(record)
    stop(sink)
def stop(*dealers):
    '''Signal end-of-stream to each given generator.

    StopIteration is thrown into every truthy dealer; the StopIteration
    that comes back out when the generator finishes is swallowed. Falsy
    entries (e.g. None) are skipped.
    '''
    for target in dealers:
        if not target:
            continue
        try:
            target.throw(StopIteration)
        except StopIteration:
            pass
from itertools import imap
def fromin(input=sys.stdin, sep="\t", ltrim="", rtrim="\r\n"):
    '''Read records from an iterable of lines (standard input by default).

    With a truthy sep the lines are piped through split(); otherwise the
    plain line iterator is returned unchanged.
    '''
    lines = iter(input)
    if not sep:
        return lines
    return lines | split(sep=sep, ltrim=ltrim, rtrim=rtrim)
def fromfile(filename, sep="\t", ltrim="", rtrim="\r\n"):
    '''Generator yielding the records of a file, read through fromin().

    The file is opened when iteration starts and is closed by the with
    block once the generator finishes.
    '''
    with open(filename) as source:
        records = fromin(source, sep, ltrim, rtrim)
        for record in records:
            yield record
@streamdealer
def tofile(prev, next, filename, sep="\t", start="", end="\n", append=False):
    '''Write the stream to a file through an output() stage.

    filename -- path of the file to write
    sep      -- field separator handed to output()
    start    -- handed to output() as its prefix argument
    end      -- handed to output() as its suffix argument
    append   -- open the file in "a" mode instead of "w"
    '''
    opentype = "w" if not append else "a"
    with open(filename,opentype) as f:
        # Start the inner output() stage and advance it once.
        # NOTE(review): when prev is given, output() builds a pull chain, so
        # this priming step looks like it already consumes prev and writes
        # everything; the loop below would then matter only in push mode.
        # Confirm before relying on it.
        outer = output(f, sep, start, end).run(prev,next,withnext=True)
        try:
            while True:
                if prev:
                    data = prev.next()
                else:
                    data = yield
                outer.send(data)
        except StopIteration:
            # End of input: pass the stop signal downstream.
            stop(next)
@streamdealer
def output(prev, next, out=sys.stdout, sep="\t", prefix="", suffix="\n"):
    '''Format each record with concat(sep) and print it with printout().

    The two inner stages are joined with ``|`` when an upstream exists
    (pull mode) and with ``**`` otherwise (push mode).
    '''
    formatter = concat(sep)
    printer = printout(out, prefix, suffix)
    if prev:
        chain = formatter | printer
    else:
        chain = formatter ** printer
    return chain.run(prev, next)
@streamdealer
def null(prev, next):
    '''Consume every record and discard it; stop next at end of input.'''
    try:
        while True:
            # Pull from upstream when there is one, otherwise wait for send().
            discarded = prev.next() if prev else (yield)
    except StopIteration:
        stop(next)
@streamdealer
def split(prev, next, sep="\t", size=None, ltrim="", rtrim="\r\n"):
    '''Split each incoming string into a list of fields.

    sep   -- field separator
    size  -- when truthy, cut or pad (with None) every record to this length
    ltrim -- characters stripped from the left of the line before splitting
    rtrim -- characters stripped from the right of the line before splitting
    '''
    try:
        while True:
            line = prev.next() if prev else (yield)
            fields = line.lstrip(ltrim).rstrip(rtrim).split(sep)
            if size:
                # A negative count pads with nothing.
                padding = [None] * (size - len(fields))
                fields = fields[:size] + padding
            if next:
                next.send(fields)
            else:
                yield fields
    except StopIteration:
        stop(next)
@streamdealer
def concat(prev, next, sep="\t", prefix="", suffix=""):
    '''Join the fields of each record into one string.

    Every field is converted with str(); None becomes the empty string.
    prefix and suffix are added around the joined text.
    '''
    try:
        while True:
            record = prev.next() if prev else (yield)
            cells = [str(value) if value != None else '' for value in record]
            line = prefix + sep.join(cells) + suffix
            if next:
                next.send(line)
            else:
                yield line
    except StopIteration:
        stop(next)
from operator import itemgetter
def myitemgetter(fieldindex):
    '''Build a function that extracts the given fields from a record as a list.

    fieldindex -- a single int index or a sequence of indexes

    The returned getter always returns a list, even for a single index.
    When extraction fails (for example the record is too short) a message
    is written to standard error and a list of None of the same width is
    returned instead.
    '''
    if isinstance(fieldindex, int):
        fieldindex = [fieldindex]
    itemget = itemgetter(*fieldindex)
    width = len(fieldindex)
    # itemgetter returns a bare value for one index and a tuple for several.
    single = width == 1

    def getter(data):
        try:
            picked = itemget(data)
        except Exception:
            sys.stderr.write('error when itemget %s:%s\n' % (str(fieldindex), str(data)))
            # Build a fresh list each time: callers may mutate the record
            # they receive, and a shared fallback would then be corrupted.
            return [None] * width
        if single:
            return [picked]
        return list(picked)
    return getter
@streamdealer
def select(prev, next, fieldindex):
    '''Keep only the fields named by fieldindex (an index or list of indexes).'''
    pick = myitemgetter(fieldindex)
    try:
        while True:
            record = prev.next() if prev else (yield)
            chosen = pick(record)
            if next:
                next.send(chosen)
            else:
                yield chosen
    except StopIteration:
        stop(next)
@streamdealer
def where(prev, next, condfunc):
    '''Pass on only the records for which condfunc(record) is true.'''
    try:
        while True:
            record = prev.next() if prev else (yield)
            if not condfunc(record):
                continue
            if next:
                next.send(record)
            else:
                yield record
    except StopIteration:
        stop(next)
@streamdealer
def limit(prev, next, start, end, step=1):
    '''Pass on records whose position lies in [start, end), every step-th one.

    start -- first position to emit (None: from the beginning)
    end   -- position to stop before (None: no upper bound)
    step  -- emit one record out of every step inside the window
             (None: emit every record)

    Positions count from 0. Input is still consumed after end is reached.
    '''
    position = 0
    since_emit = None   # records seen inside the window since the last emit
    try:
        while True:
            record = prev.next() if prev else (yield)
            after_start = start == None or position >= start
            if after_start and (end == None or position < end):
                if step == None or since_emit == None or since_emit >= step:
                    if next:
                        next.send(record)
                    else:
                        yield record
                    since_emit = 0
                since_emit += 1
            position += 1
    except StopIteration:
        stop(next)
@streamdealer
def calculate(prev, next, calcfunc, paraindex=None, resultunpack=False):
    '''Append a computed result to every record.

    calcfunc     -- function producing the new value(s)
    paraindex    -- None: calcfunc receives the whole record as one argument;
                    several indexes: calcfunc receives those fields as
                    separate arguments;
                    one index (int or one-element list): calcfunc receives a
                    single argument, the one-element list holding that field
    resultunpack -- true: the return value is appended as one new field;
                    false: the return value is concatenated to the record,
                    so it must be a list
    '''
    if paraindex is not None:
        # myitemgetter accepts a bare int, but len() below does not.
        if isinstance(paraindex, int):
            paraindex = [paraindex]
        getter = myitemgetter(paraindex)
        spread = len(paraindex) > 1
    try:
        while True:
            if prev:
                data = prev.next()
            else:
                data = yield
            if paraindex is None:
                paras = (data,)
            elif spread:
                paras = getter(data)
            else:
                paras = (getter(data),)
            if resultunpack:
                data = data + [calcfunc(*paras)]
            else:
                data = data + calcfunc(*paras)
            if next:
                next.send(data)
            else:
                yield data
    except StopIteration:
        stop(next)
@streamdealer
def update(prev, next, updatefunc):
    '''Call updatefunc(record) on every record and pass the record on.

    The return value of updatefunc is ignored, so it has to modify the
    record in place.
    '''
    try:
        while True:
            record = prev.next() if prev else (yield)
            updatefunc(record)
            if next:
                next.send(record)
            else:
                yield record
    except StopIteration:
        stop(next)
@streamdealer
def transform(prev, next, transfunc, paraindex=None):
    '''Replace every record with the result of transfunc.

    transfunc -- function producing the new record
    paraindex -- None: transfunc receives the whole record as one argument;
                 several indexes: transfunc receives those fields as
                 separate arguments;
                 one index (int or one-element list): transfunc receives a
                 single argument, the one-element list holding that field
    '''
    if paraindex is not None:
        # The field getter was never created before, so any paraindex
        # ended in a NameError; build it the same way calculate() does.
        if isinstance(paraindex, int):
            paraindex = [paraindex]
        getter = myitemgetter(paraindex)
        spread = len(paraindex) > 1
    try:
        while True:
            if prev:
                data = prev.next()
            else:
                data = yield
            if paraindex is None:
                paras = (data,)
            elif spread:
                paras = getter(data)
            else:
                paras = (getter(data),)
            data = transfunc(*paras)
            if next:
                next.send(data)
            else:
                yield data
    except StopIteration:
        stop(next)
from collections import deque
@streamdealer
def lag(prev, next, n=1):
    '''Emit each record together with the n records that follow it.

    Every output row is the fields of one record followed by the next n
    records, each appended as a single element. For the last records of
    the stream the missing followers are filled with None.
    '''
    datas = deque()
    try:
        while True:
            if prev:
                data = prev.next()
            else:
                data = yield
            datas.append(data)
            if len(datas) > n:
                # The oldest buffered record now has its n followers.
                thedata = list(datas.popleft())
                row = thedata + list(datas)
                if next:
                    next.send(row)
                else:
                    yield row
    except StopIteration:
        pass
    # End of input: flush what is still buffered, padding with None.
    try:
        while datas:
            thedata = list(datas.popleft())
            leftlen = len(datas)
            row = thedata + list(datas) + [None] * (n - leftlen)
            if next:
                next.send(row)
            else:
                yield row
    except StopIteration:
        pass
    # Always pass the stop signal on, also after a normal flush, so that
    # buffering stages downstream get to emit their results.
    stop(next)
@streamdealer
def orderlag(prev, next, keyindex=None, n=1):
    '''Sort by keyindex with orderby(), then apply lag(n).

    The two stages are joined with ``|`` in pull mode (prev given) and
    with ``**`` in push mode.
    '''
    sorter = orderby(keyindex)
    lagger = lag(n)
    if prev:
        chain = sorter | lagger
    else:
        chain = sorter ** lagger
    return chain.run(prev, next)
@streamdealer
def duplicate(prev, next, *others, **kw):
    '''Copy every record to additional output streams.

    others -- extra targets; streamdealer objects are started here (unless
              start is false), anything else is used as given and must
              accept send()
    start  -- keyword argument, default True; when false the targets are
              used as passed in, without being started

    Each record goes to the main downstream first and then to every extra
    target. At end of input the main downstream and the targets started
    here are stopped; targets supplied already running are left to their
    owner.
    '''
    start = kw.get("start", True)
    started = []    # generators created here, which we must also stop
    if start:
        newothers = []
        for other in others:
            if hasattr(other, "run"):
                generator = other.run(None, None, withnext=True)
                started.append(generator)
            else:
                generator = other
            newothers.append(generator)
        others = newothers
    try:
        while True:
            if prev:
                data = prev.next()
            else:
                data = yield
            if next:
                next.send(data)
            else:
                yield data
            for other in others:
                other.send(data)
    except StopIteration:
        # Without stopping the branches, buffering stages in them
        # (order, group, ...) would never emit their results.
        stop(next, *started)
@streamdealer
def printout(prev, next, out=sys.stdout, prefix="", suffix="\n"):
    '''Print every record to out as prefix + str(record) + suffix.

    This is a terminal stage: nothing is passed on, next is only stopped
    at end of input.
    '''
    try:
        while True:
            record = prev.next() if prev else (yield)
            text = prefix+str(record)+suffix
            print >>out, text,
    except StopIteration:
        stop(next)
@streamdealer
def log(prev, next, out=sys.stderr, prefix="", suffix="\n"):
    '''Print every record to out (standard error by default) and pass it on unchanged.'''
    try:
        while True:
            record = prev.next() if prev else (yield)
            text = prefix+str(record)+suffix
            print >>out, text,
            if next:
                next.send(record)
            else:
                yield record
    except StopIteration:
        stop(next)
@streamdealer
def order(prev, next, keyfunc=lambda x:x, cmpfunc=None, reverse=False):
    '''Sort the whole stream.

    keyfunc -- key function applied to each record (default: the record itself)
    cmpfunc -- optional comparison function
    reverse -- sort in descending order when true

    All records are buffered in memory; output only starts once the input
    has ended.
    '''
    buf = []
    try:
        while True:
            if prev:
                data = prev.next()
            else:
                data = yield
            buf.append(data)
    except StopIteration:
        # End of input: emit the buffered records in sorted order.
        # The positional (cmp, key, reverse) form is the Python 2 signature
        # of sorted().
        for data in sorted(buf, cmpfunc, keyfunc, reverse):
            if next:
                next.send(data)
            else:
                yield data
        stop(next)
def orderby(fieldindex, reverse=False):
    '''Return an order() stage sorting by the given field index(es).

    fieldindex -- an int index or a sequence of indexes; None or an empty
                  sequence sorts by the whole record
    reverse    -- sort in descending order when true
    '''
    # Index 0 is a valid column, so test for "no key" explicitly instead of
    # relying on truthiness.
    nokey = fieldindex is None or (not isinstance(fieldindex, int) and not fieldindex)
    if nokey:
        return order(None, None, reverse)
    return order(myitemgetter(fieldindex), None, reverse)
def countred(counter, *datas):
    '''Reducer for counting: add the number of extra arguments to counter.'''
    added = len(datas)
    return counter + added
def sumred(summer, *values):
    '''Reducer for summing: add the total of all given value lists to summer.

    Each extra argument is a sequence of numbers. When plain addition
    fails with a TypeError (for example because the fields are still
    strings) every element is converted with float() first.
    '''
    try:
        return summer + sum([sum(value) for value in values])
    except TypeError:
        # Only the type mismatch is expected here; anything else (including
        # KeyboardInterrupt) must not be swallowed.
        return summer + sum([sum([float(v) for v in value]) for value in values])
@streamdealer
def group(prev, next, keyfunc=lambda x:None, *args):
    '''Aggregate runs of consecutive records that share the same key.

    keyfunc -- computes the group key of a record
    args    -- one aggregate per entry, either a bare reduce function or a
               tuple/list (redfunc[, valuefunc[, initfunc]]):
               redfunc(result, value) folds a value into the running result,
               valuefunc(record) extracts the value (default: the record),
               initfunc() gives the initial result of each group

    For every run of equal keys one row [key, result1, result2, ...] is
    emitted. Only adjacent records are compared, so the input has to be
    ordered by key for each key to come out once.
    '''
    redfuncs = []
    valuefuncs = []
    initfuncs = []
    # Normalise every aggregate definition into the three parallel lists.
    for fff in args:
        if type(fff) == tuple or type(fff) == list:
            redfuncs.append(fff[0])
            valuefuncs.append(fff[1] if len(fff)>1 else None)
            initfuncs.append(fff[2] if len(fff)>2 else None)
        else:
            redfuncs.append(fff)
            valuefuncs.append(None)
            initfuncs.append(None)
    haslastkey = False  # becomes true once the first record has been seen
    lastkey = None
    result = [initfunc() if initfunc else None for initfunc in initfuncs]
    try:
        while True:
            if prev:
                data = prev.next()
            else:
                data = yield
            key = keyfunc(data)
            values = [valuefunc(data) if valuefunc else data for valuefunc in valuefuncs]
            if haslastkey and key == lastkey:
                # Same group: fold the new values into the running results.
                result = [redfuncs[i](result[i],values[i]) for i in range(len(redfuncs))]
            else:
                # Key changed: emit the finished group, if there is one.
                if haslastkey:
                    if next:
                        next.send([lastkey]+result)
                    else:
                        yield [lastkey]+result
                else:
                    haslastkey = True
                lastkey = key
                # Start the new group. Without an initfunc the first value
                # itself becomes the initial result.
                result = [initfunc() if initfunc else None for initfunc in initfuncs]
                result = [redfuncs[i](result[i],values[i]) if initfuncs[i]!=None else values[i] for i in range(len(redfuncs))]
    except StopIteration:
        # End of input: emit the last open group.
        if haslastkey:
            if next:
                next.send([lastkey]+result)
            else:
                yield [lastkey]+result
        stop(next)
@streamdealer
def flatten(prev, next, index):
    '''Replace the list stored in field ``index`` by its elements.

    The record count is unchanged; the field at ``index`` must be a list
    that can be concatenated with the rest of the record.
    '''
    try:
        while True:
            record = prev.next() if prev else (yield)
            nested = record[index]
            head = record[:index]
            tail = record[index+1:]
            flat = head + nested + tail
            if next:
                next.send(flat)
            else:
                yield flat
    except StopIteration:
        stop(next)
@streamdealer
def explode(prev, next, fieldnames=None):
    '''Turn every incoming collection into one output record per element.

    fieldnames is accepted but not used.
    '''
    try:
        while True:
            batch = prev.next() if prev else (yield)
            for item in batch:
                if next:
                    next.send(item)
                else:
                    yield item
    except StopIteration:
        stop(next)
@streamdealer
def groupby(prev, next, fieldindex, *args):
    '''Group consecutive records by field index(es) and aggregate each group.

    fieldindex -- index or list of indexes forming the group key
    args       -- one aggregate per entry, either a bare reduce function or
                  a tuple/list (redfunc[, paraindex[, initvalue]]):
                  paraindex selects the fields handed to redfunc (None or an
                  empty sequence: the whole record), initvalue is the
                  starting result of every group

    Each output row is the key fields followed by one result per
    aggregate. The input has to be ordered by the key already.

    Raises Exception when an aggregate tuple has not 1, 2 or 3 elements.
    '''
    keyfunc = myitemgetter(fieldindex)

    def valuegetter(paraindex):
        # Column 0 is a valid index; only None / an empty sequence mean
        # "pass the whole record".
        if paraindex is None or (not isinstance(paraindex, int) and not paraindex):
            return None
        return myitemgetter(paraindex)

    def constant(value):
        # Bind the value per aggregate. A lambda written inside the loop
        # would look the variable up late and every aggregate would get
        # the initial value of the last one.
        return lambda: value

    funcargs = []
    for redparas in args:
        if isinstance(redparas, (tuple, list)):
            count = len(redparas)
            if count not in (1, 2, 3):
                raise Exception('groupby parameter invalid : '+str(args))
            spec = [redparas[0]]
            if count >= 2:
                spec.append(valuegetter(redparas[1]))
            if count == 3:
                spec.append(constant(redparas[2]))
            funcargs.append(tuple(spec))
        else:
            funcargs.append((redparas,))
    # group() emits [key, results...] where key is a list; flatten(0)
    # expands it into plain leading fields.
    if prev:
        return (group(keyfunc, *funcargs) | flatten(0)).run(prev,next)
    else:
        return (group(keyfunc, *funcargs) ** flatten(0)).run(prev,next)
@streamdealer
def ordergroupby(prev, next, fieldindex, *args):
    '''Sort by fieldindex with orderby(), then aggregate with groupby().

    The two stages are joined with ``|`` in pull mode (prev given) and
    with ``**`` in push mode.
    '''
    sorter = orderby(fieldindex)
    grouper = groupby(fieldindex, *args)
    if prev:
        chain = sorter | grouper
    else:
        chain = sorter ** grouper
    return chain.run(prev, next)
@streamdealer
def join(prev, next, tojoinindex, otherstream, keyindex, valueindex, default=None):
    '''Join each record against a second stream that is held in memory.

    tojoinindex -- field index(es) of the incoming record used as join key
    otherstream -- iterable of records to join with; read completely and
                   cached when the stage starts
    keyindex    -- field index(es) of otherstream records forming the key
    valueindex  -- field index(es) of otherstream records appended on a match
    default     -- value used when no cached record matches

    One output row is produced per matching cached record (record fields
    followed by the selected value fields). Records without a match are
    kept and extended with the default.
    '''
    keygetter = myitemgetter(keyindex)
    valuegetter = myitemgetter(valueindex)
    # key tuple -> list of value lists from otherstream
    kvbuf = {}
    for data in otherstream:
        key = tuple(keygetter(data))
        value = valuegetter(data)
        if key in kvbuf:
            kvbuf[key].append(value)
        else:
            kvbuf[key] = [value]
    getter = myitemgetter(tojoinindex)
    if default == None:
        # One None per value field when several fields are selected,
        # otherwise a single None.
        default = (None if type(valueindex)==int or len(valueindex)<=1 else [None for i in valueindex]) if valueindex else None
    try:
        while True:
            if prev:
                data = prev.next()
            else:
                data = yield
            tojoin = tuple(getter(data))
            values = kvbuf.get(tojoin,[default])
            for value in values:
                # NOTE(review): a falsy value (None, 0, '', []) is replaced
                # by a single None field, and list() is applied to any
                # other value, so a string default is split into characters.
                result = data + ( list(value) if value else [None] )
                if next:
                    next.send(result)
                else:
                    yield result
    except StopIteration:
        stop(next)
@streamdealer
def stepjoin(prev, next, *otherstreams, **kw):
    '''Step through several streams in parallel and combine their records.

    otherstreams -- further iterables read next to the main stream
    dealfunc     -- keyword argument; called as dealfunc(*datas) with the
                    current record of every stream to build the output row.
                    Without it the current records are concatenated, each
                    cut or padded with None to the width first seen for
                    its stream
    getnext      -- keyword argument; called as getnext(datas), returns the
                    index or list of indexes of the streams to advance in
                    the next round (default: all of them). Index 0 is the
                    main stream, index i is otherstreams[i-1]

    A stream that has ended contributes None from then on. The stage ends
    once every stream has ended.
    '''
    otherstreams = [iter(s) for s in otherstreams]
    dealfunc = kw.get("dealfunc",None)
    getnext = kw.get("getnext",lambda datas:range(len(datas)))
    datas = [ () for i in range(len(otherstreams)+1) ]      # current record of each stream
    livings = set(range(len(otherstreams)+1))               # streams not yet exhausted
    togets = list(range(len(otherstreams)+1))               # streams to advance this round
    lengths = []                                            # record width first seen per stream
    while True:
        for toget in togets:
            if toget not in livings:
                continue
            try:
                if toget==0:
                    if prev:
                        datas[toget] = prev.next()
                    else:
                        datas[toget] = yield
                else:
                    datas[toget] = otherstreams[toget-1].next()
            except StopIteration:
                # This stream has ended (or, in push mode, stop() was thrown in).
                livings.remove(toget)
                datas[toget] = None
        if not lengths:
            # Remember the field count of the first record of every stream.
            lengths = [len(data) if data else 0 for data in datas]
        if not livings:
            break
        if dealfunc:
            result = dealfunc(*datas)
        else:
            result = []
            for i,data in enumerate(datas):
                # length is read before the comprehensions below, which
                # rebind i in Python 2.
                length = lengths[i]
                if not data:
                    data = [None for i in range(length)]
                elif len(data) != lengths[i]:
                    print >>sys.stderr, 'warning! field number is', len(data), 'not', length, ':', data, __file__, sys._getframe().f_lineno
                    data = data[:length] + [None for i in range(length-len(data))]
                result += data
        togets = getnext(datas)
        if type(togets) == int:
            togets = [togets]
        if next:
            next.send(result)
        else:
            yield result
    stop(next)
def printf(*data,**kw):
    '''Print the arguments joined by a separator, without a trailing newline.

    Keyword arguments:
    out -- target stream (default: standard output)
    sep -- separator between the values (default: one space)
    '''
    target = kw.get('out',sys.stdout)
    glue = kw.get('sep',' ')
    text = glue.join(map(str, data))
    print >>target, text,
class _colnotfound(AttributeError, KeyError):
    '''Raised when a column name is unknown.

    It is an AttributeError so that hasattr(), copy and pickle treat a
    missing column like any missing attribute, and a KeyError so that code
    catching KeyError keeps working.
    '''


class colindex(object):
    '''Maps column names to their positions in a record.

    Positions are available as attributes (``cols.user``) or, for several
    names at once, through index(). A column whose name equals a method or
    attribute of this class can only be reached through index().
    '''
    legalcharacter = set('abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')

    def __init__(self,*colnames):
        '''Build the mapping from names given separately and/or comma-joined.'''
        self.colnames = ",".join(colnames).split(",")
        try:
            self.colnames = colindex.legalcolnames(self.colnames)
        except Exception:
            # Best effort: keep the raw names when they cannot be normalised.
            pass
        indexs = range(len(self.colnames))
        self.nameindexs = dict(zip(self.colnames,indexs))

    def __getattr__(self,name):
        '''Return the position of column ``name``.

        Raises an error that is both an AttributeError and a KeyError when
        there is no such column.
        '''
        # __getattr__ only runs after normal lookup failed, so touching
        # self.nameindexs here would recurse forever while it is not set
        # (e.g. on an instance being rebuilt by copy/pickle).
        nameindexs = self.__dict__.get('nameindexs')
        if nameindexs is None or name not in nameindexs:
            raise _colnotfound(name)
        return nameindexs[name]

    def index(self,*colnames):
        '''Return the list of positions for the given names (separate or comma-joined).'''
        return [self.nameindexs[colname] for colname in ",".join(colnames).split(",")]

    def __add__(self,colnames):
        '''Return a new colindex with the given columns appended.

        colnames is a comma-joined string or another colindex.
        '''
        colnames = colnames.split(",") if isinstance(colnames, str) else colnames.colnames
        return colindex(*(self.colnames+colnames))

    def __sub__(self,colnames):
        '''Return a new colindex without the given columns.

        colnames is a comma-joined string or another colindex.
        '''
        colnamesset = set(colnames.split(",") if isinstance(colnames, str) else colnames.colnames)
        return colindex(*[colname for colname in self.colnames if colname not in colnamesset])

    def replace(self,oldnames,newnames):
        '''Return a new colindex with oldnames renamed to newnames, position by position.

        Both arguments are comma-joined strings or colindex objects.
        '''
        oldnames = oldnames.split(",") if isinstance(oldnames, str) else oldnames.colnames
        oldnamesset = set(oldnames)
        newnames = newnames.split(",") if isinstance(newnames, str) else newnames.colnames
        newcolnames = []
        for colname in self.colnames:
            if colname not in oldnamesset:
                newcolnames.append(colname)
            else:
                newcolnames.append(newnames[oldnames.index(colname)])
        return colindex(*newcolnames)

    @classmethod
    def legalname(cls,name):
        '''Turn an arbitrary string into a usable column name.

        Characters outside letters, digits and underscore become '_', runs
        of underscores collapse to one, a leading digit gets the prefix
        'n_' and a leading underscore the prefix 'u'. An empty name
        becomes 'f_'.
        '''
        if not name:
            return 'f_'
        name = ''.join(ch if ch in cls.legalcharacter else '_' for ch in name)
        while "__" in name:
            name = name.replace("__","_")
        if '0' <= name[0] <= '9':
            return 'n_'+name
        if name[0] == '_':
            return 'u'+name
        return name

    @classmethod
    def legalcolnames(cls,newnames, usednames=()):
        '''Normalise a list of names and make them unique.

        newnames  -- names to normalise with legalname()
        usednames -- names that are already taken

        A name that is already taken gets a numeric suffix. 'f_' is always
        treated as taken, so empty names come out as 'f_1', 'f_2', ...
        '''
        usednames = set(usednames)
        usednames.add('f_')
        legalnames = []
        for name in newnames:
            name = cls.legalname(name)
            suffix, i = '', 0
            while name+suffix in usednames:
                i += 1
                suffix = '_'+str(i) if name[-1]!='_' else str(i)
            usednames.add( name+suffix )
            legalnames.append( name+suffix )
        return legalnames
class dynamic(object):
    '''Empty object used as a container for attributes assigned at run time.'''
def getcolindexs(defs, sep="\t"):
    '''Build colindex objects from a definition file or a list.

    defs -- a file name, or an iterable of definitions. Each definition is
            either a string "varname<sep>fieldnames" or a sequence whose
            first element is the variable name and whose remaining
            elements are field names
    sep  -- separator used to split string definitions

    Returns an object with one attribute per definition, named after the
    variable name and holding the matching colindex. Definitions with
    fewer than two parts are skipped.
    '''
    indexs = dynamic()
    isfile = isinstance(defs, str)
    iterdef = open(defs) if isfile else iter(defs)
    try:
        for infos in iterdef:
            if isinstance(infos, str):
                infos = infos.rstrip("\r\n").split(sep)
            if len(infos) > 1:  # infos should be varname fieldnames
                setattr( indexs, infos[0], colindex(*infos[1:]) )
    finally:
        # Close the file even when a definition cannot be parsed.
        if isfile:
            iterdef.close()
    return indexs
def main(args):
    '''Execute every command line argument as Python source, in order.

    NOTE(security): the arguments are passed to exec() unchecked, so this
    must only be used with trusted input.
    '''
    for arg in args:
        # Presumably names assigned by one argument (e.g. cols=...) stay
        # visible to the following ones, as the usage example in the module
        # docstring relies on -- verify.
        exec(arg.strip())
# Script entry point: run the statements given on the command line.
if __name__ == '__main__':
    main(sys.argv[1:])