在Java版本中,MapReduce框架会自动把相同key的value值传给reduce函数进行迭代;但是在Hadoop Streaming中,MR框架虽然也会对key进行排序,却不会把相同key的value归并后传给reduce函数进行迭代,必须用代码自行控制这样的迭代。这在对数据进行分组sum、count、average等汇总计算的时候是必须的,以Python为例:
原始数据:
line product group issuedate ticket expense
8 60000 4965 2013-09-09 1 -78.45
8 60000 4965 2013-09-06 1 NULL
8 60000 4965 2013-09-06 1 67.20
8 60000 4995 2013-09-05 1 134.00
8 60000 4965 2013-09-06 1 67.20
8 60000 5011 2013-09-09 1 81.38
要做的是对line、group、issuedate进行分组,然后对ticket和expense进行sum计算,同时把line代码解释成对应的业务名称,并把issuedate全部转换成当月的一号。
mapper其实就很简单,只是做简单的转换和数据过滤
mapper文件:m.py
#!/usr/bin/python
import sys
from datetime import *
class mapper:
    """Hadoop Streaming mapper: filter and normalise raw ticket rows.

    Reads tab-separated rows from stdin and writes one tab-separated
    record per valid row to stdout:
    ``lobname, group id, first-of-month issue date, ticket, expense``.
    """

    # Line-of-business code -> display name; any other code is 'Unknown'.
    _LOB_NAMES = {
        '1': 'Air',
        '2': 'Air',
        '3': 'Hotel',
        '4': 'Hotel',
        '5': 'Train',
        '6': 'Meeting',
    }

    def _map_line(self, line):
        """Convert one raw input row into an output record.

        Returns the tab-joined record, or ``None`` when the row has fewer
        than six columns or its issue date (column 4) is not a valid
        ``YYYY-MM-DD`` date.
        """
        arr = line.strip().split('\t')
        if len(arr) < 6:
            return None
        # Exact-match lookup; the former "lob in ('5')" was a substring
        # test on a plain string, not a tuple membership test.
        lobname = self._LOB_NAMES.get(arr[0].strip(), 'Unknown')
        gpid = arr[2].strip()
        try:
            parsed = datetime.strptime(arr[3].strip(), '%Y-%m-%d')
            month = parsed.strftime('%Y-%m')
        except ValueError:
            # Unparseable date (e.g. a header row): drop the row.
            return None
        # Collapse every issue date onto the first day of its month.
        issuedate = month + '-01'
        return '\t'.join(
            [lobname, gpid, issuedate, arr[4].strip(), arr[5].strip()])

    def map(self):
        """Stream stdin to stdout, emitting one record per valid row."""
        for line in sys.stdin:
            record = self._map_line(line)
            if record is not None:
                # Single-argument print() behaves the same on Python 2 and 3.
                print(record)


if __name__ == '__main__':
    mymapper = mapper()
    mymapper.map()
在reducer中,我对相同key的迭代是通过把当前行的key值和上一行的key值进行对比来实现的。需要时刻记住的是MR框架已经按照key进行了排序,所以这种方法是可行的。
reducer文件:r.py:
#!/usr/bin/python
import sys
class reducer:
    """Hadoop Streaming reducer: sum ticket and expense per group.

    Groups on the first three fields (lob, gpid, issuedate) by comparing
    each row's key with the previous row's key, so the input must arrive
    sorted with equal keys adjacent.
    """

    def _parse(self, line):
        """Split one mapper record into ``(key, ticket, expense)``.

        ``key`` is the ``(lob, gpid, issuedate)`` tuple. A ticket or expense
        of ``NULL`` (any case) counts as 0. Returns ``None`` for rows with
        fewer than five fields or with non-numeric amounts.
        """
        arr = line.strip().split('\t')
        if len(arr) < 5:
            return None
        try:
            ticket = 0 if arr[3].lower() == 'null' else int(arr[3])
            expense = 0 if arr[4].lower() == 'null' else float(arr[4])
        except ValueError:
            return None
        return (arr[0], arr[1], arr[2]), ticket, expense

    def _emit(self, key, tickets, expenses):
        """Write one aggregated, tab-separated record to stdout."""
        # Single-argument print() behaves the same on Python 2 and 3.
        print('\t'.join(key) + '\t' + str(tickets) + '\t' + str(expenses))

    def reduce(self):
        """Aggregate stdin rows and print one total row per distinct key.

        Malformed rows are skipped. Nothing is printed when no valid row
        was read.
        """
        # None marks "no group started yet"; unlike a magic string it can
        # never collide with a real key.
        current = None
        tickets = 0
        expenses = 0
        for line in sys.stdin:
            parsed = self._parse(line)
            if parsed is None:
                continue
            key, ticket, expense = parsed
            if key == current:
                tickets += ticket
                expenses += expense
                continue
            # Key changed: flush the finished group, then start a new one.
            if current is not None:
                self._emit(current, tickets, expenses)
            current, tickets, expenses = key, ticket, expense
        # Flush the last group, if there was any input at all.
        if current is not None:
            self._emit(current, tickets, expenses)


if __name__ == '__main__':
    myreducer = reducer()
    myreducer.reduce()
执行:
# Run the streaming job with m.py as the mapper and r.py as the reducer,
# using a single reduce task; both scripts are passed along via -file.
# NOTE(review): stream.num.map.output.key.fields=4 declares four key fields,
# while r.py groups on the first three fields only - confirm this is intended.
hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
-D mapred.reduce.tasks=1 \
-D stream.num.map.output.key.fields=4 \
-input /rs2/businessdetail.txt \
-output /rs2/output2s/ \
-mapper m.py \
-reducer r.py \
-file m.py \
-file r.py;
为了验证上面的迭代结果中不会出现相同的key值,用sort和AWK进行验证,如果输出的结果集中包含了相同的key值则证明上面的迭代是错误的反之是成功的,
# Sort the job output, then print the key (fields 1-3 concatenated) of every
# line whose key equals that of the line before it; no output means no
# adjacent duplicate keys were found.
hadoop fs -cat /rs2/output2s/part-00000 | sort|awk -F'\t' '{m=n;n=$1$2$3;if (m==n){print n}}'
结果为0行说明是正确的,在原始数据中行数为130万
在python中另一种迭代方式也可以通过字典来实现。就是每一个map key作为一个字典的key,然后对字典值进行汇总计算,把上面的reduce函数改造一下:
#!/usr/bin/python
import sys
class reducer:
    """Hadoop Streaming reducer: sum ticket and expense per group via a dict.

    Every distinct ``lob<TAB>gpid<TAB>issuedate`` key becomes a dict entry
    holding the running ``[ticket, expense]`` totals, so equal keys do not
    have to be adjacent in the input.
    """

    def _parse(self, line):
        """Split one mapper record into ``(key, ticket, expense)``.

        ``key`` is the first three fields joined by tabs. A ticket or
        expense of ``NULL`` (any case) counts as 0. Returns ``None`` for
        rows with fewer than five fields or with non-numeric amounts.
        """
        arr = line.strip().split('\t')
        if len(arr) < 5:
            return None
        try:
            ticket = 0 if arr[3].lower() == 'null' else int(arr[3])
            expense = 0 if arr[4].lower() == 'null' else float(arr[4])
        except ValueError:
            return None
        return '\t'.join(arr[:3]), ticket, expense

    def reduce(self):
        """Aggregate stdin rows and print one total row per distinct key.

        Malformed rows are skipped. Rows are printed in dict iteration
        order, which is arbitrary on Python 2.
        """
        totals = {}
        for line in sys.stdin:
            parsed = self._parse(line)
            if parsed is None:
                continue
            key, ticket, expense = parsed
            if key in totals:
                # The stored list is mutated in place; no re-assignment needed.
                totals[key][0] += ticket
                totals[key][1] += expense
            else:
                totals[key] = [ticket, expense]
        for key, (tickets, expenses) in totals.items():
            # Tab-separated, matching the record format used by the other
            # reducer variants (the previous output was space-separated).
            print(key + '\t' + str(tickets) + '\t' + str(expenses))


if __name__ == '__main__':
    myreducer = reducer()
    myreducer.reduce()
由于字典的遍历本身是没有顺序的,这种方法的输出结果也是没有顺序的。如果需要结果是有序的,可以把key放到一个列表中,然后按照列表的顺序去取字典的值,改进如下:
#!/usr/bin/python
import sys
class reducer:
    """Hadoop Streaming reducer: dict-based sums with first-seen ordering.

    Every distinct ``lob<TAB>gpid<TAB>issuedate`` key becomes a dict entry
    holding the running ``[ticket, expense]`` totals; a separate list
    records the order in which keys first appeared so the output order is
    deterministic.
    """

    def _parse(self, line):
        """Split one mapper record into ``(key, ticket, expense)``.

        ``key`` is the first three fields joined by tabs. A ticket or
        expense of ``NULL`` (any case) counts as 0. Returns ``None`` for
        rows with fewer than five fields or with non-numeric amounts.
        """
        arr = line.strip().split('\t')
        if len(arr) < 5:
            return None
        try:
            ticket = 0 if arr[3].lower() == 'null' else int(arr[3])
            expense = 0 if arr[4].lower() == 'null' else float(arr[4])
        except ValueError:
            return None
        return '\t'.join(arr[:3]), ticket, expense

    def reduce(self):
        """Aggregate stdin rows and print one total row per distinct key.

        Malformed rows are skipped. Rows are printed tab-separated, in the
        order in which each key was first seen in the input.
        """
        totals = {}
        order = []  # keys in first-seen order
        for line in sys.stdin:
            parsed = self._parse(line)
            if parsed is None:
                continue
            key, ticket, expense = parsed
            if key not in totals:
                totals[key] = [ticket, expense]
                order.append(key)
            else:
                # The stored list is mutated in place; no re-assignment needed.
                totals[key][0] += ticket
                totals[key][1] += expense
        for key in order:
            tickets, expenses = totals[key]
            # Single-argument print() behaves the same on Python 2 and 3.
            print(key + '\t' + str(tickets) + '\t' + str(expenses))


if __name__ == '__main__':
    myreducer = reducer()
    myreducer.reduce()
当然也可以通过列表的方式实现,把key放在一个列表,把需要汇总的ticket和expense放在另外的两个列表,总共就有三个列表,只是需要注意3个列表的顺序要是绝对对应的,否则输出将会是错误的