在以前的一篇关于python版MR实例的文章中,只是简单走了一下流程,这次主要是解决上次遗留的key,partition,sort的问题。以进一步理解hadoop streaming,也尝试mapper使用python而reducer使用bash的结合方式
1. 省略本地测试这个环节,可参考以前的那篇文章,HDFS上的测试数据为:
[root@hadoop Desktop]# hadoop fs -cat /usr/egencia/travler/travler.txt
/usr/hadoop/hadoop-1.2.1/libexec/../conf/hadoop-env.sh: line 59: export: `mapred.tasktracker.reduce.tasks.maximum=4': not a valid identifier
Warning: $HADOOP_HOME is deprecated.
air:343;hotel:45;train:54467;nation:china
air:367;hotel:456;train:5567;nation:china
air:356;hotel:4522;train:54367;car:454;nation:china
air:343;hotel:45;train:54467;nation:usa
air:367;hotel:456;train:5567;nation:usa
air:356;hotel:4522;train:54367;car:454;nation:usa
air:343;hotel:45;train:54467;nation:india
air:367;hotel:456;train:5567;nation:india
air:356;hotel:4522;train:54367;car:454;nation:india
2.假定需求为:
2.1 每个国家一个输出文件
2.2 输出文件格式为:
air:343:china
hotel:45:china
........
2.3 每个输出文件按照第一个字段排序后按照第二个字段排序
3 设计
key:第一和第三个字段
mapper输出为:air:343:china
partition:第三个字段
sort:第一和第二字段
4.mapper(python):
[root@hadoop Desktop]# cat tmapper.py
#!/usr/bin/python
# Streaming mapper: each input record looks like
#   air:343;hotel:45;train:54467;nation:china
# and is expanded to one output line per product:
#   air:343:china
#   hotel:45:china
#   train:54467:china
# (The pasted original lost all indentation; reconstructed here. Two fixes:
# see comments in map_line() and main().)
import sys


def map_line(line):
    """Split one record into 'product:value:nation' output lines.

    The last ';'-separated field is the nation (e.g. 'nation:china');
    every other field is a 'product:value' pair.
    """
    # rstrip('\n') instead of the original line[:-1]: slicing off the last
    # character silently drops a data byte when the final input line has no
    # trailing newline.
    fields = line.rstrip("\n").split(";")
    nation = fields[-1].split(":")[-1]
    out = []
    for field in fields[:-1]:
        parts = field.split(":")
        out.append(parts[0] + ":" + parts[-1] + ":" + nation)
    return out


def main():
    for line in sys.stdin:
        try:
            for record in map_line(line):
                print(record)
        except Exception:
            # The original did `print "error"`, which pollutes the mapper's
            # stdout (the shuffle would treat "error" as a key). Diagnostics
            # belong on stderr in Hadoop Streaming.
            sys.stderr.write("error: bad record %r\n" % line)


if __name__ == "__main__":
    main()
5.reducer(bash):
/bin/cat
6.先不执行分区和排序:
[root@hadoop Desktop]# hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
> -D mapred.reduce.tasks=3 \
> -input /usr/egencia/travler \
> -output /usr/egencia/travler/out \
> -mapper tmapper.py \
> -reducer /bin/cat \
> -file tmapper.py
查看输出文件:
[root@hadoop Desktop]# hadoop fs -ls /usr/egencia/travler/out
/usr/hadoop/hadoop-1.2.1/libexec/../conf/hadoop-env.sh: line 59: export: `mapred.tasktracker.reduce.tasks.maximum=4': not a valid identifier
Warning: $HADOOP_HOME is deprecated.
Found 5 items
-rw-r--r-- 1 root supergroup 0 2013-09-06 02:42 /usr/egencia/travler/out/_SUCCESS
drwxr-xr-x - root supergroup 0 2013-09-06 02:42 /usr/egencia/travler/out/_logs
-rw-r--r-- 1 root supergroup 160 2013-09-06 02:42 /usr/egencia/travler/out/part-00000
-rw-r--r-- 1 root supergroup 189 2013-09-06 02:42 /usr/egencia/travler/out/part-00001
-rw-r--r-- 1 root supergroup 132 2013-09-06 02:42 /usr/egencia/travler/out/part-00002
[root@hadoop Desktop]# hadoop fs -cat /usr/egencia/travler/out/part-00000 /usr/egencia/travler/out/part-00001 \
> /usr/egencia/travler/out/part-00002
/usr/hadoop/hadoop-1.2.1/libexec/../conf/hadoop-env.sh: line 59: export: `mapred.tasktracker.reduce.tasks.maximum=4': not a valid identifier
Warning: $HADOOP_HOME is deprecated.
air:343:usa
air:367:usa
car:454:india
hotel:4522:china
hotel:4522:usa
hotel:456:india
hotel:45:usa
train:54367:china
train:54367:usa
train:5567:india
air:343:india
air:356:india
air:356:usa
air:367:india
car:454:china
car:454:usa
hotel:456:china
hotel:45:india
train:54467:india
train:54467:usa
train:5567:china
train:5567:usa
air:343:china
air:356:china
air:367:china
hotel:4522:india
hotel:456:usa
hotel:45:china
train:54367:india
train:54467:china
7. 删除输出目录后,
指定分隔符为 ":"
指定第一和第三个字段为key
指定分区为第三个字段
重新执行
第一种执行:
# NOTE(review): the original passed "-D tream.num.map.output.key.fields=1" —
# a typo for "stream.num.map.output.key.fields". Hadoop silently ignores the
# unknown key, and the default key-field count is 1 anyway, so the output
# shown below is unaffected; fixed here for correctness.
hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
  -D mapred.reduce.tasks=3 \
  -D stream.map.output.field.separator=: \
  -D stream.num.map.output.key.fields=1 \
  -input /usr/egencia/travler \
  -output /usr/egencia/travler/out \
  -mapper tmapper.py \
  -reducer /bin/cat \
  -file tmapper.py
结果
Warning: $HADOOP_HOME is deprecated.
hotel 45:china
hotel 456:china
hotel 4522:china
hotel 45:usa
hotel 456:usa
hotel 4522:usa
每个文件都是按照产品进行了partition
# Partition on the nation (3rd key field) so each country lands in its own
# reducer. Two fixes versus the original command:
#   * "tream.num..." -> "stream.num..." (typo; the -D was silently ignored)
#   * key fields = 3, not 1: KeyFieldBasedPartitioner's -k3 can only select
#     a 3rd field if the key actually contains three ':'-separated fields
#     (the design above says the key is fields 1..3 of "air:343:china").
hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
  -D mapred.reduce.tasks=3 \
  -D stream.map.output.field.separator=: \
  -D stream.num.map.output.key.fields=3 \
  -D map.output.key.field.separator=: \
  -D mapred.text.key.partitioner.options=-k3 \
  -input /usr/egencia/travler \
  -output /usr/egencia/travler/out \
  -mapper tmapper.py \
  -reducer /bin/cat \
  -file tmapper.py \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
# Final run: the mapper's output is split on ':' into a 3-field key
# (product:value:nation); KeyFieldBasedPartitioner chooses the reducer from
# the key fields selected by mapred.text.key.partitioner.options.
# NOTE(review): the design above calls for partitioning on the 3rd field
# (the nation), but -k2 selects from field 2 onward — confirm this is the
# intended experiment and not a typo for -k3.
# NOTE(review): only 2 reduce tasks for 3 nations — two nations must share
# one output file, contradicting requirement 2.1; verify.
hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
-D stream.map.output.field.separator=: \
-D stream.num.map.output.key.fields=3 \
-D map.output.key.field.separator=: \
-D mapred.text.key.partitioner.options=-k2 \
-D mapred.reduce.tasks=2 \
-input /usr/egencia/travler \
-output /usr/egencia/travler/out \
-inputformat org.apache.hadoop.mapred.TextInputFormat \
-mapper tmapper.py \
-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
-file tmapper.py \
-outputformat org.apache.hadoop.mapred.TextOutputFormat \
-reducer /bin/cat
1. 省略本地测试这个环节,可参考以前的那篇文章,HDFS上的测试数据为:
[root@hadoop Desktop]# hadoop fs -cat /usr/egencia/travler/travler.txt
/usr/hadoop/hadoop-1.2.1/libexec/../conf/hadoop-env.sh: line 59: export: `mapred.tasktracker.reduce.tasks.maximum=4': not a valid identifier
Warning: $HADOOP_HOME is deprecated.
air:343;hotel:45;train:54467;nation:china
air:367;hotel:456;train:5567;nation:china
air:356;hotel:4522;train:54367;car:454;nation:china
air:343;hotel:45;train:54467;nation:usa
air:367;hotel:456;train:5567;nation:usa
air:356;hotel:4522;train:54367;car:454;nation:usa
air:343;hotel:45;train:54467;nation:india
air:367;hotel:456;train:5567;nation:india
air:356;hotel:4522;train:54367;car:454;nation:india
2.假定需求为:
2.1 每个国家一个输出文件
2.2 输出文件格式为:
air:343:china
hotel:45:china
........
2.3 每个输出文件按照第一个字段排序后按照第二个字段排序
3 设计
key:第一和第三个字段
mapper输出为:air:343:china
partition:第三个字段
sort:第一和第二字段
4.mapper(python):
[root@hadoop Desktop]# cat tmapper.py
#!/usr/bin/python
# Streaming mapper: expands one record of the form
#   air:343;hotel:45;train:54467;nation:china
# into one "product:value:nation" line per product.
# (Indentation was lost in the pasted original; reconstructed. Also fixes
# the trailing-newline slice and the stdout-polluting error print below.)
import sys


def parse_record(raw):
    """Return the list of 'product:value:nation' lines for one record.

    The last ';'-separated field carries the nation ('nation:china');
    all preceding fields are 'product:value' pairs.
    """
    # rstrip('\n') rather than raw[:-1]: slicing drops a real data byte
    # when the last input line lacks a trailing newline.
    pieces = raw.rstrip("\n").split(";")
    country = pieces[-1].split(":")[-1]
    emitted = []
    for piece in pieces[:-1]:
        kv = piece.split(":")
        emitted.append(kv[0] + ":" + kv[-1] + ":" + country)
    return emitted


def main():
    for raw in sys.stdin:
        try:
            for line in parse_record(raw):
                print(line)
        except Exception:
            # Original `print "error"` wrote into the mapper's data stream;
            # diagnostics must go to stderr in Hadoop Streaming.
            sys.stderr.write("error: bad record %r\n" % raw)


if __name__ == "__main__":
    main()
5.reducer(bash):
/bin/cat
6.先不执行分区和排序:
[root@hadoop Desktop]# hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
> -D mapred.reduce.tasks=3 \
> -input /usr/egencia/travler \
> -output /usr/egencia/travler/out \
> -mapper tmapper.py \
> -reducer /bin/cat \
> -file tmapper.py
查看输出文件:
[root@hadoop Desktop]# hadoop fs -ls /usr/egencia/travler/out
/usr/hadoop/hadoop-1.2.1/libexec/../conf/hadoop-env.sh: line 59: export: `mapred.tasktracker.reduce.tasks.maximum=4': not a valid identifier
Warning: $HADOOP_HOME is deprecated.
Found 5 items
-rw-r--r-- 1 root supergroup 0 2013-09-06 02:42 /usr/egencia/travler/out/_SUCCESS
drwxr-xr-x - root supergroup 0 2013-09-06 02:42 /usr/egencia/travler/out/_logs
-rw-r--r-- 1 root supergroup 160 2013-09-06 02:42 /usr/egencia/travler/out/part-00000
-rw-r--r-- 1 root supergroup 189 2013-09-06 02:42 /usr/egencia/travler/out/part-00001
-rw-r--r-- 1 root supergroup 132 2013-09-06 02:42 /usr/egencia/travler/out/part-00002
[root@hadoop Desktop]# hadoop fs -cat /usr/egencia/travler/out/part-00000 /usr/egencia/travler/out/part-00001 \
> /usr/egencia/travler/out/part-00002
/usr/hadoop/hadoop-1.2.1/libexec/../conf/hadoop-env.sh: line 59: export: `mapred.tasktracker.reduce.tasks.maximum=4': not a valid identifier
Warning: $HADOOP_HOME is deprecated.
air:343:usa
air:367:usa
car:454:india
hotel:4522:china
hotel:4522:usa
hotel:456:india
hotel:45:usa
train:54367:china
train:54367:usa
train:5567:india
air:343:india
air:356:india
air:356:usa
air:367:india
car:454:china
car:454:usa
hotel:456:china
hotel:45:india
train:54467:india
train:54467:usa
train:5567:china
train:5567:usa
air:343:china
air:356:china
air:367:china
hotel:4522:india
hotel:456:usa
hotel:45:china
train:54367:india
train:54467:china
7. 删除输出目录后,
指定分隔符为 ":"
指定第一和第三个字段为key
指定分区为第三个字段
重新执行
第一种执行:
# NOTE(review): the original passed "-D tream.num.map.output.key.fields=1" —
# a typo for "stream.num.map.output.key.fields". Hadoop silently ignores the
# unknown key, and the default key-field count is 1 anyway, so the output
# shown below is unaffected; fixed here for correctness.
hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
  -D mapred.reduce.tasks=3 \
  -D stream.map.output.field.separator=: \
  -D stream.num.map.output.key.fields=1 \
  -input /usr/egencia/travler \
  -output /usr/egencia/travler/out \
  -mapper tmapper.py \
  -reducer /bin/cat \
  -file tmapper.py
结果
Warning: $HADOOP_HOME is deprecated.
hotel 45:china
hotel 456:china
hotel 4522:china
hotel 45:usa
hotel 456:usa
hotel 4522:usa
每个文件都是按照产品进行了partition
# Partition on the nation (3rd key field) so each country lands in its own
# reducer. Two fixes versus the original command:
#   * "tream.num..." -> "stream.num..." (typo; the -D was silently ignored)
#   * key fields = 3, not 1: KeyFieldBasedPartitioner's -k3 can only select
#     a 3rd field if the key actually contains three ':'-separated fields
#     (the design above says the key is fields 1..3 of "air:343:china").
hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
  -D mapred.reduce.tasks=3 \
  -D stream.map.output.field.separator=: \
  -D stream.num.map.output.key.fields=3 \
  -D map.output.key.field.separator=: \
  -D mapred.text.key.partitioner.options=-k3 \
  -input /usr/egencia/travler \
  -output /usr/egencia/travler/out \
  -mapper tmapper.py \
  -reducer /bin/cat \
  -file tmapper.py \
  -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner
# Final run: the mapper's output is split on ':' into a 3-field key
# (product:value:nation); KeyFieldBasedPartitioner chooses the reducer from
# the key fields selected by mapred.text.key.partitioner.options.
# NOTE(review): the design above calls for partitioning on the 3rd field
# (the nation), but -k2 selects from field 2 onward — confirm this is the
# intended experiment and not a typo for -k3.
# NOTE(review): only 2 reduce tasks for 3 nations — two nations must share
# one output file, contradicting requirement 2.1; verify.
hadoop jar /usr/hadoop/hadoop-1.2.1/contrib/streaming/hadoop-streaming-1.2.1.jar \
-D stream.map.output.field.separator=: \
-D stream.num.map.output.key.fields=3 \
-D map.output.key.field.separator=: \
-D mapred.text.key.partitioner.options=-k2 \
-D mapred.reduce.tasks=2 \
-input /usr/egencia/travler \
-output /usr/egencia/travler/out \
-inputformat org.apache.hadoop.mapred.TextInputFormat \
-mapper tmapper.py \
-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
-file tmapper.py \
-outputformat org.apache.hadoop.mapred.TextOutputFormat \
-reducer /bin/cat