python instream

让 Python 程序中的每个数据处理环节专注于自身的逻辑,尽量不受输入输出方式的影响。设计上参考了 pipe 包。

#!/usr/bin/python
#coding:utf-8

import sys
import itertools

'''
    处理流程模仿 shell 流处理 各个处理环节用 | 符号连接
    函数名模仿 sql 语句 (数据处理有顺序相关的缺省认为已排序,未排序的需先用sort函数排序)
    例如:
        nameage = ['zs\t10','ls\t12','ww\t13'] | split()
        nameweight = ['zs\t11.1','ls\t12.2','ww\t13.3'] | split()
        ( nameage | join(nameweight,onindex([0],[0])) |
            flatten(1,(None,None)) | flatten(0,(None,None)) |
            selectindex(0,1,3) |
            wheredata(lambda data:data[0]!=None) |
            concat() |
            foreach(printf)
        )
    结果:
        zs      10      11.1
        ls      12      12.2
        ww      13      13.3
    可以定义字段名,结果相同:
        nameage = ['zs\t10','ls\t12','ww\t13'] | split() | asname('name,age')
        nameweight = ['zs\t11.1','ls\t12.2','ww\t13.3'] | split() | asname('name,weight')
        ( nameage | join(nameweight,onkey('name','name')) | asname('na,nw') |
            function(lambda na,nw:(getattr(na,'name'),getattr(na,'age'),getattr(nw,'name'),getattr(nw,'weight'))) |
            asname('name,age,nwname,weight') |
            select('name,age,weight') |
            where(lambda name,age,weight:name!=None) |
            concat() |
            foreach(printf)
        )
'''


class instream:
    '''Wrap a stream-processing function so it can be chained with ``|``.

    The wrapped function receives the upstream iterable as its first
    argument. ``upstream | stage`` runs the function on ``upstream``;
    ``stage(a, b)`` returns a new ``instream`` with the extra arguments
    bound, so ``upstream | stage(a, b)`` calls ``function(upstream, a, b)``.
    '''

    def __init__(self, function):
        '''Remember the wrapped function (runs when used as a decorator).'''
        self.function = function

    def __ror__(self, preiteration):
        '''Evaluate ``preiteration | self`` by passing the left operand on.'''
        stage = self.function
        return stage(preiteration)

    def __call__(self, *args, **kwargs):
        '''Bind extra arguments and return a new single-argument stage.'''
        def bound(upstream):
            # Look the function up at call time, as the original lambda did.
            return self.function(upstream, *args, **kwargs)

        return instream(bound)

@instream
def readfile(filename):
    '''Yield the lines of the file ``filename`` one at a time.

    Lines are yielded exactly as the file object produces them (line
    endings are not stripped). The ``with`` block closes the file once the
    generator finishes.
    '''
    with open(filename) as handle:
        for line in handle:
            yield line

@instream
def split(iteration, sep="\t"):
    '''Turn each text line into a list of fields.

    Trailing carriage returns and newlines are removed before the line is
    split on ``sep`` (a tab by default).
    '''
    for line in iteration:
        stripped = line.rstrip("\r\n")
        yield stripped.split(sep)

@instream
def concat(iteration, sep="\t", append="\n"):
    '''Turn each record back into one text line.

    Every field is converted with ``str``, the fields are joined with
    ``sep`` and ``append`` is added at the end of the line.
    '''
    for record in iteration:
        cells = [str(field) for field in record]
        yield sep.join(cells) + append

from collections import namedtuple
# Cache of generated namedtuple classes, keyed by the tuple of field names.
namedtuples = {}
def getnamedtuple(fieldnames):
    '''Return the namedtuple class for ``fieldnames``, creating it on first use.

    ``fieldnames`` is a string of field names separated by commas and/or
    whitespace, e.g. ``'name,age'``. Classes are cached so that repeated
    requests for the same fields return the same class.

    The cache is keyed on the parsed field names rather than on the
    generated type name: the type name joins the fields with ``_``, so
    ``'a_b'`` (one field) and ``'a,b'`` (two fields) share a type name and
    must not share a cache slot.
    '''
    # Same separators that namedtuple() itself accepts.
    fields = tuple(fieldnames.replace(",", " ").split())
    nt = namedtuples.get(fields)
    if nt is None:
        # The type name is computed as before; it only affects repr().
        tuplename = "_".join(fieldnames.strip().replace(",", "_").split())
        nt = namedtuple(tuplename, fields)
        namedtuples[fields] = nt
    return nt

@instream
def asname(iteration, fieldnames):
    '''Give the fields of each record names.

    Each record is unpacked positionally into the namedtuple class that
    ``getnamedtuple(fieldnames)`` returns, so it must have exactly as many
    fields as there are names.
    '''
    record_type = getnamedtuple(fieldnames)
    for values in iteration:
        yield record_type(*values)

from operator import attrgetter
@instream
def select(iteration, fieldnames):
    '''Project each named record onto the attributes listed in ``fieldnames``.

    ``fieldnames`` is a comma- and/or whitespace-separated string, parsed
    the same way ``namedtuple`` parses it. Each output record is an
    instance of the namedtuple class for ``fieldnames``.

    One getter is built per name instead of a single multi-name
    ``attrgetter``: with exactly one name ``attrgetter`` returns the bare
    value rather than a 1-tuple, which broke the ``*`` unpacking for
    single-field selects.
    '''
    names = fieldnames.replace(",", " ").split()
    getters = [attrgetter(name) for name in names]
    nt = getnamedtuple(fieldnames)
    for data in iteration:
        yield nt(*[getter(data) for getter in getters])

from operator import itemgetter
@instream
def selectindex(iteration, *indexs):
    '''Project each record onto the given positions.

    Uses ``operator.itemgetter``: with several indexes each output is a
    tuple of the selected items; with a single index it is the bare item,
    not a 1-tuple.
    '''
    pick = itemgetter(*indexs)
    for row in iteration:
        projected = pick(row)
        yield projected

from itertools import ifilter
@instream
def where(iteration,condfunc):
    '''Keep the records for which ``condfunc(*record)`` is true.

    Each record is unpacked into positional arguments, so ``condfunc``
    takes one parameter per field. Evaluation is lazy.
    '''
    return (row for row in iteration if condfunc(*row))

from itertools import ifilter
@instream
def wheredata(iteration,condfunc):
    '''Keep the records for which ``condfunc(record)`` is true.

    The whole record is passed as one argument. If ``condfunc`` is
    ``None`` the record's own truth value decides. Evaluation is lazy.
    '''
    predicate = bool if condfunc is None else condfunc
    return (record for record in iteration if predicate(record))

@instream
def group(iteration,keyfunc=None,valuefunc=None,aggfunc=None):
    '''Group consecutive records that share a key and yield ``(key, values)``.

    Grouping is done with ``itertools.groupby``, so only adjacent records
    are merged; sort the input by the same key first.

    keyfunc   -- computes the key of a record (``None``: the record itself).
    valuefunc -- maps each record to the value kept for it (default: the
                 record unchanged).
    aggfunc   -- if given, called with the iterator of values of each group
                 and its result is yielded in place of the iterator.

    Without ``aggfunc`` the yielded values iterator is lazy and shares the
    underlying stream with ``groupby``; consume it before requesting the
    next group.
    '''
    if not valuefunc:
        valuefunc = lambda x: x
    # The original used the bare name ``groupby``, which is never imported.
    for key, members in itertools.groupby(iteration, keyfunc):
        values = (valuefunc(member) for member in members)
        if aggfunc:
            yield key, aggfunc(values)
        else:
            yield key, values

@instream
def aggregate(iteration,aggfunc=None):
    '''Reduce the whole stream with ``aggfunc`` and return its result.

    ``aggfunc`` is called once with the upstream iterable. Although it
    defaults to ``None``, a callable must be supplied: calling ``None``
    raises ``TypeError``.
    '''
    reducer = aggfunc
    return reducer(iteration)

@instream
def limit(iteration, nlimit, start=0):
    '''Yield at most ``nlimit`` records, beginning at zero-based position ``start``.

    Records at positions ``start <= i < start + nlimit`` are yielded.
    The upstream is no longer read once the window is complete, so the
    stage terminates on endless streams and does not drain large inputs
    for nothing.
    '''
    end = start + nlimit
    if nlimit <= 0 or end <= 0:
        # Empty window: nothing to yield, so do not touch the upstream.
        return
    for i, data in enumerate(iteration):
        if i >= start:
            yield data
        if i + 1 >= end:
            # Last position of the window handled; stop without reading more.
            return

from collections import deque
@instream
def taillimit(iteration, nlimit, end=0):
    '''Yield up to ``nlimit`` records from the tail, skipping the last ``end``.

    The whole upstream is consumed while a sliding buffer keeps only the
    final ``end + nlimit`` records. Of those, the last ``end`` are
    discarded and the rest are yielded in their original order.

    When the input is shorter than ``end + nlimit`` the skipped records
    are still excluded. The original yielded the first ``nlimit`` buffered
    records, which could include records inside the ``end`` to skip.
    '''
    length = end + nlimit
    # A bounded deque drops the oldest record automatically; a negative
    # bound is clamped to 0, matching the original always-empty buffer.
    window = deque(iteration, max(length, 0))
    keep = min(nlimit, len(window) - end)
    for i, data in enumerate(window):
        if i >= keep:
            break
        yield data

@instream
def map(iteration, func):
    '''Lazily apply ``func`` to every record of the stream.

    This stage shadows the builtin ``map`` inside this module. If ``func``
    is ``None`` each record is wrapped in a 1-tuple, as
    ``itertools.imap(None, ...)`` does.
    '''
    if func is None:
        return ((item,) for item in iteration)
    return (func(item) for item in iteration)

@instream
def function(iteration,func):
    '''Lazily apply ``func`` to every record, unpacking its fields.

    Each record is spread into positional arguments (``func(*record)``),
    so ``func`` takes one parameter per field.
    '''
    return (func(*fields) for fields in iteration)

@instream
def flatten(iteration,index,default=()):
    '''Splice the nested sequence at position ``index`` into each record.

    The item at ``index`` is replaced by its own elements. If that item is
    ``None`` the elements of ``default`` are used instead. Every output
    record is a tuple.

    Negative indexes are supported. The original computed
    ``data[index+1:]`` directly, which wraps round for ``index == -1`` and
    duplicates the record.
    '''
    for data in iteration:
        nested = data[index]  # raises IndexError for out-of-range positions
        # Normalise to a non-negative position so both slices below are right.
        pos = index if index >= 0 else index + len(data)
        if nested is not None:
            middle = tuple(nested)
        else:
            middle = tuple(default)
        yield tuple(data[:pos]) + middle + tuple(data[pos + 1:])

@instream
def lag(iteration, n=1):
    '''Yield sliding windows of consecutive records as tuples.

    While the stream lasts, each window holds ``n + 1`` records, oldest
    first. When the stream ends, the remaining records are emitted as
    progressively shorter windows. For ``a, b, c`` with ``n=1`` the output
    is ``(a, b)``, ``(b, c)``, ``(c,)``.
    '''
    window = deque()
    for item in iteration:
        window.append(item)
        if len(window) <= n:
            # Still filling the first window.
            continue
        yield tuple(window)
        window.popleft()
    # Flush the tail, dropping the oldest record after each window.
    while window:
        yield tuple(window)
        window.popleft()

@instream
def join(iteration, otheriteration, cmpfunc=None):
    '''Full outer merge-join of two streams, yielding ``(left, right)`` pairs.

    Both inputs are walked in step. ``cmpfunc(left, right)`` returns a
    negative, zero or positive number for "left sorts first", "match" and
    "right sorts first". Both inputs are expected to be ordered
    consistently with ``cmpfunc``.

    * match              -> ``(left, right)``, both sides advance
    * left sorts first   -> ``(left, None)``, only the left side advances
    * right sorts first  -> ``(None, right)``, only the right side advances

    When one side is exhausted, the rest of the other side is emitted
    paired with ``None``. Without ``cmpfunc`` every comparison counts as a
    match, so records are paired by position. Each record is emitted once;
    duplicate keys are not cross-multiplied.

    The builtin ``next()`` is used instead of the ``.next()`` method so
    any iterator works, on Python 2.6+ and Python 3 alike.
    '''
    iteration = iter(iteration)
    otheriteration = iter(otheriteration)
    living = True        # left stream still has records
    otherliving = True   # right stream still has records
    comp = 0             # result of the latest comparison
    get = True           # fetch a new left record on this pass
    otherget = True      # fetch a new right record on this pass
    while True:
        if get:
            try:
                data = next(iteration)
            except StopIteration:
                living = False
                data = None
                comp = 1     # left exhausted: emit right-only pairs from now on
        if otherget:
            try:
                otherdata = next(otheriteration)
            except StopIteration:
                otherliving = False
                otherdata = None
                comp = -1    # right exhausted: emit left-only pairs from now on
        if living and otherliving:
            comp = cmpfunc(data, otherdata) if cmpfunc else 0
        elif not living and not otherliving:
            break
        # If only one side is alive, comp keeps the value set on exhaustion.
        if comp < 0:
            yield data,None
            get, otherget = True, False
        elif comp > 0:
            yield None,otherdata
            get, otherget = False, True
        else:
            yield data,otherdata
            get, otherget = True, True

from operator import attrgetter
def onkey(firstname, secondname):
    '''Build a comparison function for ``join`` from attribute names.

    ``firstname`` and ``secondname`` are comma-separated attribute names
    for the left and right record. The returned ``compare(x, y)`` gives
    -1, 0 or 1 according to how the left key compares with the right key.

    The result is computed as ``(a > b) - (a < b)`` rather than with the
    ``cmp`` builtin, which no longer exists in Python 3.
    '''
    firstkey = attrgetter(*[name.strip() for name in firstname.strip().split(",")])
    secondkey = attrgetter(*[name.strip() for name in secondname.strip().split(",")])
    def compare(x,y):
        left, right = firstkey(x), secondkey(y)
        return (left > right) - (left < right)
    return compare

from operator import itemgetter
def onindex(firstindexs, secondindexs):
    '''Build a comparison function for ``join`` from positional indexes.

    ``firstindexs`` and ``secondindexs`` are sequences of positions that
    form the key of the left and right record. The returned
    ``compare(x, y)`` gives -1, 0 or 1 according to how the left key
    compares with the right key.

    The result is computed as ``(a > b) - (a < b)`` rather than with the
    ``cmp`` builtin, which no longer exists in Python 3.
    '''
    firstkey = itemgetter(*firstindexs)
    secondkey = itemgetter(*secondindexs)
    def compare(x,y):
        left, right = firstkey(x), secondkey(y)
        return (left > right) - (left < right)
    return compare

def getattr(obj, attrname):
    '''Return attribute ``attrname`` of ``obj``, or ``None`` if that fails.

    This helper shadows the builtin ``getattr`` inside this module. The
    lookup goes through ``operator.attrgetter``, so dotted names are
    followed. Any ordinary error, such as a missing attribute or ``obj``
    being ``None``, results in ``None``.

    Only ``Exception`` is caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate; the original bare ``except`` swallowed
    them too.
    '''
    try:
        return attrgetter(attrname)(obj)
    except Exception:
        return None

def getitem(obj, index):
    '''Return ``obj[index]``, or ``None`` if the lookup fails.

    Any ordinary error, such as an index out of range, a missing key or
    ``obj`` being ``None``, results in ``None``.

    Only ``Exception`` is caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate; the original bare ``except`` swallowed
    them too.
    '''
    try:
        return itemgetter(index)(obj)
    except Exception:
        return None

@instream
def foreach(iteration, func=None, initfunc=None, finalfunc=None):
    '''Consume the stream, calling ``func`` on every record for its side effects.

    initfunc  -- optional; called once before the loop. Its return value
                 is unpacked as extra positional arguments for ``func``
                 and ``finalfunc``.
    func      -- optional; called as ``func(record, *context)``. The
                 stream is consumed even when ``func`` is not given.
    finalfunc -- optional; called as ``finalfunc(*context)`` after the loop.

    Nothing is returned.
    '''
    context = initfunc() if initfunc else ()
    for record in iteration:
        if not func:
            continue
        func(record, *context)
    if finalfunc:
        finalfunc(*context)

from itertools import tee
@instream
def duplicate(iteration, n=2):
    '''Split the stream into ``n`` independent iterators.

    Returns the tuple produced by ``itertools.tee``.
    '''
    copies = tee(iteration, n)
    return copies

@instream
def union(iteration, *otheriterations):
    '''Concatenate the stream with ``otheriterations``, in the order given.

    All records of the upstream are yielded first, then those of each
    further iterable in turn. Nothing is deduplicated or sorted.

    ``itertools.chain`` is reached through the module, because the bare
    name ``chain`` used originally is never imported.
    '''
    return itertools.chain(iteration, *otheriterations)

# Write ``data`` to standard output without adding a newline.
# This is a Python 2 print statement; the trailing comma suppresses the
# newline, so records that already end in "\n" print one per line.
def printf(data):
    print data,

# Write ``data`` to standard output followed by a newline
# (Python 2 print statement).
def println(data):
    print data


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值