在数据量大的情况下,使用单reducer进行全局排序的方式明显效率较低,此时可采用多reducer的方式。
map_sort.py
在map中进行分桶,分桶方式可自定义。只要保证编号较小的桶中所有key都不大于编号较大的桶中的key,各reducer的输出按桶编号依次拼接后即为全局有序。下例以key取值范围的中点为界,将数据分到两个桶中。
#! /usr/bin/python
import sys
# Offset added to every raw key. Presumably this gives all keys the same
# number of digits so that text ordering matches numeric ordering -- TODO
# confirm against the job's sort settings.
base_count = 10000
# Largest shifted key the partitioning below is designed for (i.e. raw keys
# are assumed to lie in 0..100 -- verify against the real data).
max_key = 10100
# Shifted keys at or below this midpoint go to partition 0, the rest to 1.
split_point = (base_count + max_key) // 2


def map_records(lines):
    """Yield one ``partition_id<TAB>new_key<TAB>val`` record per valid line.

    Each input line is expected to hold ``key<TAB>val`` where ``key`` is an
    integer. Only the first two tab-separated fields are used. The emitted
    key is ``base_count + int(key)``.

    Malformed lines (fewer than two fields, or a non-integer key) are
    reported on stderr and skipped, so a single bad record neither aborts the
    remaining input nor injects an error message into the record stream.

    Args:
        lines: iterable of text lines (for example ``sys.stdin``).

    Yields:
        Output records as strings without a trailing newline.
    """
    for line in lines:
        fields = line.strip().split('\t')
        try:
            val = fields[1]
            new_key = base_count + int(fields[0])
        except (IndexError, ValueError):
            # stdout carries data for the next stage; diagnostics must go to
            # stderr instead.
            sys.stderr.write("map error: %r\n" % (line,))
            continue
        if new_key <= split_point:
            partition_id = 0
        else:
            partition_id = 1
        yield "%s\t%s\t%s" % (partition_id, new_key, val)


def run_mapper(stdin=None, stdout=None):
    """Read records from ``stdin`` and write partitioned records to ``stdout``.

    Both arguments default to the process's standard streams.
    """
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    for record in map_records(source):
        sink.write(record + "\n")


if __name__ == "__main__":
    run_mapper()
red_sort.py
#! /usr/bin/python
import sys
def reduce_records(lines):
    """Yield ``key<TAB>val`` for every well-formed input record.

    Each input line is expected to hold exactly three tab-separated fields,
    ``partition_id<TAB>key<TAB>val``. The partition id is dropped and the
    remaining two fields are emitted unchanged.

    Lines that do not split into exactly three fields are reported on stderr
    and skipped, so one bad record neither aborts the remaining input nor
    writes an error message into the sorted output.

    Args:
        lines: iterable of text lines (for example ``sys.stdin``).

    Yields:
        Output records as strings without a trailing newline.
    """
    for line in lines:
        try:
            _partition_id, key, val = line.strip().split('\t')
        except ValueError:
            # stdout is the job's result; diagnostics must go to stderr.
            sys.stderr.write("reduce error: %r\n" % (line,))
            continue
        yield '\t'.join([key, val])


def run_reducer(stdin=None, stdout=None):
    """Read partitioned records from ``stdin`` and write ``key<TAB>val`` lines.

    Both arguments default to the process's standard streams.
    """
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    for record in reduce_records(source):
        sink.write(record + "\n")


if __name__ == "__main__":
    run_reducer()