python MR 中reduce迭代

在Java版本中,MapReduce框架会自动把相同key的value值传给reduce函数进行迭代;但是在Streaming中,MR框架虽然也会按key进行排序,却不会把相同key的value聚合后传给reduce函数,必须用代码自行控制这样的迭代。这在对数据进行分组sum、count、average等汇总计算的时候是必须的,以Python为例:

原始数据:

line product group issuedate ticket expense

8    60000    4965    2013-09-09    1    -78.45
8    60000    4965    2013-09-06    1    NULL
8    60000    4965    2013-09-06    1    67.20
8    60000    4995    2013-09-05    1    134.00
8    60000    4965    2013-09-06    1    67.20
8    60000    5011    2013-09-09    1    81.38


要做的是对line、group、issuedate进行分组,然后对ticket和expense进行sum计算,同时把line代码解释成名称,并把issuedate全部转换成当月的一号。


mapper其实就很简单,只是做简单的转换和数据过滤

mapper文件:m.py

#!/usr/bin/python

import sys

from datetime import *

class mapper:
    """Streaming mapper: normalise raw business-detail rows for the reducer.

    Reads tab-separated rows from stdin with at least six columns
    (line, product, group, issuedate, ticket, expense) and writes
    ``lobname<TAB>group<TAB>issuedate<TAB>ticket<TAB>expense`` to stdout.
    ``lobname`` is the readable name of the ``line`` code and ``issuedate``
    is moved to the first day of its month. The product column is dropped.
    """

    # Line-of-business code -> readable name; any other code is 'Unknown'.
    # A dict lookup is an exact match, unlike ``lob in('5')`` which is a
    # substring test on a one-character string.
    _LOB_NAMES = {
        '1': 'Air',
        '2': 'Air',
        '3': 'Hotel',
        '4': 'Hotel',
        '5': 'Train',
        '6': 'Meeting',
    }

    def map(self):
        """Transform every stdin row; rows that cannot be parsed are dropped.

        A row is dropped when it has fewer than six columns or when its
        issuedate is not in ``YYYY-MM-DD`` form. Any other error propagates
        instead of being silently swallowed.
        """
        for line in sys.stdin:
            arr = line.strip().split('\t')
            if len(arr) < 6:
                continue

            lobname = self._LOB_NAMES.get(arr[0].strip(), 'Unknown')
            gpid = arr[2].strip()
            try:
                parsed = datetime.strptime(arr[3].strip(), '%Y-%m-%d')
                # Collapse the date to the first day of its month.
                issuedate = parsed.strftime('%Y-%m') + '-01'
            except ValueError:
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('\t'.join((lobname, gpid, issuedate,
                             arr[4].strip(), arr[5].strip())))


if __name__ == '__main__':
    mymapper = mapper()
    mymapper.map()


在reducer中,对相同key的迭代是通过把当前行的key和上一行的key进行对比实现的。需要时刻记住的是MR框架已经按照key进行了排序,所以这种方法是可行的。

reducer文件:r.py:

#!/usr/bin/python

import sys

class reducer:
    """Streaming reducer that sums ticket/expense per (lob, gpid, issuedate).

    Relies on the input arriving sorted by key: a group is finished as soon
    as a row with a different key shows up, so only one group is held in
    memory at a time.
    """

    @staticmethod
    def _parse(line):
        """Return ``(key, ticket, expense)`` for a row, or None if malformed.

        ``key`` is the tuple of the first three tab-separated fields. A
        ticket or expense of 'null' (any letter case) counts as 0.
        """
        arr = line.strip().split('\t')
        if len(arr) < 5:
            return None
        try:
            ticket = 0 if arr[3].lower() == 'null' else int(arr[3])
            expense = 0 if arr[4].lower() == 'null' else float(arr[4])
        except ValueError:
            return None
        return (arr[0], arr[1], arr[2]), ticket, expense

    @staticmethod
    def _emit(key, tickets, expenses):
        """Write one aggregated row: the key fields followed by both sums."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('\t'.join(key) + '\t' + str(tickets) + '\t' + str(expenses))

    def reduce(self):
        """Aggregate stdin rows group by group and print one row per group.

        Malformed rows are skipped. Nothing is printed for empty input.
        """
        # None means "no group started yet". A real sentinel value such as
        # 'UN' could collide with data and would also be printed for empty
        # input.
        current = None
        tickets = 0
        expenses = 0

        for line in sys.stdin:
            parsed = self._parse(line)
            if parsed is None:
                continue
            key, ticket, expense = parsed

            if key == current:
                tickets += ticket
                expenses += expense
                continue

            # Key changed: flush the finished group, then start the new one.
            if current is not None:
                self._emit(current, tickets, expenses)
            current, tickets, expenses = key, ticket, expense

        # Flush the last group, if any row was accepted at all.
        if current is not None:
            self._emit(current, tickets, expenses)


if __name__ == '__main__':
    myreducer = reducer()
    myreducer.reduce()



执行:

# The grouping key is the first three mapper output fields (lob, group id,
# issuedate), so exactly three fields are declared as the key.
hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
-D mapred.reduce.tasks=1 \
-D stream.num.map.output.key.fields=3 \
-input /rs2/businessdetail.txt \
-output /rs2/output2s/ \
-mapper m.py \
-reducer r.py \
-file m.py \
-file r.py;


为了验证上面的迭代结果中不会出现相同的key值,用sort和AWK进行验证,如果输出的结果集中包含了相同的key值则证明上面的迭代是错误的反之是成功的,

# Sort the reducer output and print the key (columns 1-3 concatenated) of every
# line whose key equals that of the previous line; no output means no duplicates.
hadoop fs -cat /rs2/output2s/part-00000 | sort|awk -F'\t' '{m=n;n=$1$2$3;if (m==n){print n}}'

结果为0行说明是正确的,在原始数据中行数为130万


在python中另一种迭代方式也可以通过字典来实现。就是每一个map key作为一个字典的key,然后对字典值进行汇总计算,把上面的reduce函数改造一下:

#!/usr/bin/python

import sys

class reducer:
    """Streaming reducer that sums ticket/expense per key using a dict.

    The key is the first three tab-separated fields (lob, gpid, issuedate).
    All groups are kept in memory, so the input does not need to be sorted.
    """

    def reduce(self):
        """Aggregate every stdin row into a dict and print one row per key.

        Rows with fewer than five fields or with a non-numeric
        ticket/expense are skipped; 'null' (any letter case) counts as 0.
        Rows are printed in whatever order the dict yields its keys.
        """
        totals = {}  # key string -> [ticket sum, expense sum]
        for line in sys.stdin:
            arr = line.strip().split('\t')
            if len(arr) < 5:
                continue
            try:
                ticket = 0 if arr[3].lower() == 'null' else int(arr[3])
                expense = 0 if arr[4].lower() == 'null' else float(arr[4])
            except ValueError:
                continue

            key = arr[0] + '\t' + arr[1] + '\t' + arr[2]
            if key in totals:
                sums = totals[key]
                sums[0] += ticket
                sums[1] += expense
            else:
                totals[key] = [ticket, expense]

        for key, sums in totals.items():
            # Tab-separate the sums as well: the key already contains tabs,
            # and the space-separated `print a, b, c` form produced a row
            # with mixed delimiters.
            print(key + '\t' + str(sums[0]) + '\t' + str(sums[1]))


if __name__ == '__main__':
    myreducer = reducer()
    myreducer.reduce()


这种方法由于字典的遍历是没有顺序的,所以输出结果也是没有顺序的。如果需要结果是有序的,可以把key按首次出现的顺序放到一个列表中,然后按照列表的顺序去取字典的值,改进如下:

#!/usr/bin/python

import sys

class reducer:
    """Streaming reducer that sums ticket/expense per key, in stable order.

    The key is the first three tab-separated fields (lob, gpid, issuedate).
    Sums live in a dict, while a separate list records each key the first
    time it is seen so the output follows first-appearance order.
    """

    def reduce(self):
        """Aggregate every stdin row and print one row per key.

        Rows with fewer than five fields or with a non-numeric
        ticket/expense are skipped; 'null' (any letter case) counts as 0.
        Output rows are tab-separated and ordered by first appearance of
        their key in the input.
        """
        totals = {}  # key string -> [ticket sum, expense sum]
        keys = []    # keys in the order they were first seen
        for line in sys.stdin:
            arr = line.strip().split('\t')
            if len(arr) < 5:
                continue
            try:
                ticket = 0 if arr[3].lower() == 'null' else int(arr[3])
                expense = 0 if arr[4].lower() == 'null' else float(arr[4])
            except ValueError:
                continue

            key = arr[0] + '\t' + arr[1] + '\t' + arr[2]
            if key in totals:
                sums = totals[key]
                sums[0] += ticket
                sums[1] += expense
            else:
                totals[key] = [ticket, expense]
                keys.append(key)

        for ik in keys:
            sums = totals[ik]
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(ik + '\t' + str(sums[0]) + '\t' + str(sums[1]))


if __name__ == '__main__':
    myreducer = reducer()
    myreducer.reduce()


当然也可以通过列表的方式实现,把key放在一个列表,把需要汇总的ticket和expense放在另外的两个列表,总共就有三个列表,只是需要注意3个列表的顺序要是绝对对应的,否则输出将会是错误的




  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值