Python+Hadoop Streaming实现MapReduce（word count）_python total_count = sum(int(count) for current

MapReduce示例：WordCount（Java原生）

WordCount是hadoop最经典的一个词频统计方法，它很好地体现了MapReduce的分合思想，在集群中该方法的触发指令为：

$hadoop jar xxx/xxx/wordcount.jar wordcount input_path output_path 其中：

·wordcount：为触发的方法名称

·input_path：指定所要统计的数据在hdfs中存放的位置

·output_path：指定对统计后的结果在hdfs中的存放位置

代码如下：

 
        import 
         java.io.IOException; 
       
        import 
         java.util.StringTokenizer; 
       
        import 
         org.apache.hadoop.conf.Configuration; 
       
        import 
         org.apache.hadoop.fs.Path; 
       
        import 
         org.apache.hadoop.io.IntWritable; 
       
        import 
         org.apache.hadoop.io.Text; 
       
        import 
         org.apache.hadoop.mapreduce.Mapper; 
       
        import 
         org.apache.hadoop.mapreduce.Reducer; 
       
        import 
         org.apache.hadoop.mapreduce.Job; 
       
        import 
         org.apache.hadoop.mapreduce.lib.input.*; 
       
        import 
         org.apache.hadoop.mapreduce.lib.output.*; 
       
        public 
         class 
         wordcount { 
       
        //继承Mapper，对数据进行拆分，其中<object,text>为数据的输入类型，<text,intwritable>为数据的输出类型 
       
        public 
         static 
         class 
         TokenizerMapper  
        extends 
         Mapper<object,text,text,intwritable>{ 
       
        private 
         final 
         static 
         IntWritable one =  
        new 
         IntWritable( 
        1 
        ); 
       
        private 
         Text word =  
        new 
         Text(); 
       
        //map方法的重写，将数据拆分成<word,one>的形式，将数据以<text,intwritable>的形式传送到reduce 
       
        public 
         void 
         map(Object key, Text value, Context context)  
        throws 
         IOException, InterruptedException { 
       
        StringTokenizer it =  
        new 
         StringTokenizer(value.toString()); 
       
        while 
        (it.hasMoreTokens()){ 
       
        word.set(it.nextToken()); 
       
        context.write(word, one); 
       
        } 
       
        }  
       
        } 
       
        //继承Reducer，通过shuffle阶段获得Map处理后的<word,one>的值，对数据进行汇总合并后以<word,result>的形式输出 
       
        //其中<text, intwritable="">为输入的格式,<text,intwritable>为输出的格式 
       
        public 
         static 
         class 
         IntSumReducer  
        extends 
         Reducer<text, intwritable= 
        "" 
        >{ 
       
        private 
         IntWritable result =  
        new 
         IntWritable(); 
       
        //重写reduce方法，对词频进行统计，其中输入的数据形式为<key,{1，1，1，1}>的形式 
       
        public 
         void 
         reduce(Text key, Iterable<intwritable> values, Context context)  
        throws 
         IOException, InterruptedException { 
       
        int 
         sum =  
        0 
        ; 
       
        //将相同的key的value值进行相加，得出词频结果 
       
        for 
        (IntWritable val : values){ 
       
        sum = sum + val.get(); 
       
        } 
       
        result.set(sum); 
       
        context.write(key, result); 
       
        } 
       
        } 
       
        public 
         static 
         void 
         wordcountMapReduce(Path input,Path output,Configuration conf)  
        throws 
         IOException, InterruptedException, Exception{ 
       
        //建立job任务 
       
        Job job = Job.getInstance(conf, 
        "word count" 
        ); 
       
        //配置job中的各个类 
       
        job.setJarByClass(wordcount. 
        class 
        ); 
       
        job.setMapperClass(TokenizerMapper. 
        class 
        ); 
       
        //combine方法是在reduce之前对map处理结果的一个局部汇总，一般有几个map就会有几个combine 
       
        job.setCombinerClass(IntSumReducer. 
        class 
        ); 
       
        job.setReducerClass(IntSumReducer. 
        class 
        ); 
       
        job.setOutputKeyClass(Text. 
        class 
        ); 
       
        job.setOutputValueClass(IntWritable. 
        class 
        ); 
       
        FileInputFormat.addInputPath(job,input); 
       
        FileOutputFormat.setOutputPath(job,output); 
       
        //提交任务 
       
        System.exit(job.waitForCompletion( 
        true 
        )?  
        0 
        : 
        1 
        ); 
       
        } 
       
        public 
         static 
         void 
         main(String[] arg)  
        throws 
         Exception{ 
       
        Configuration conf =  
        new 
         Configuration(); 
       
        //从命令行中获取输入输出的路径 
       
        Path input =  
        new 
         Path(arg[ 
        1 
        ]); 
       
        Path output =  
        new 
         Path(arg[ 
        2 
        ]); 
       
        //执行mapreduce方法 
       
        wordcountMapReduce(input,output,conf); 
       
        } 
       
        } 
       
        </intwritable></key,{ 
        1 
        ， 
        1 
        ， 
        1 
        ， 
        1 
        }></text,></text,intwritable></text,></word,result></word,one></text,intwritable></word,one></object,text,text,intwritable></text,intwritable></object,text>

该方法的主要流程为：

·map阶段：将数据进行拆分，拆分成<word,one>的形式

·combine阶段：对每一个map拆分的数据进行一个局部的reduce操作，一般情况下combine函数和reduce是一样的，也可以根据自己的需要进行编写

·shuffle阶段：将数据从map拉取到reduce中

·reduce阶段：对所有的输入数据进行合并统计

MapReduce示例：WordCount（hadoop streaming）

Hadoop Streaming简单的来说，就是Hadoop提供了Streaming的方法，可以用除java以外的其他语言来编写MapReduce，也就是说，我们可以用除java以外的其他语言编写一个MapReduce方法将其传给Streaming程序，该程序可以创建一个MR任务提交给Hadoop处理。

用python实现的WordCount程序，在集群中的触发指令为：

$hadoop jar xxx/xxx/hadoop-streaming.jar -mapper 'python xx/xx/map.py'

-file xx/xx/map.py #指定map.py执行文件的位置

-reducer 'python xx/xx/reduce.py'

-file xx/xx/reduce.py #指定reduce.py执行文件的位置

-input input_path #指定输入文件路径

-output output_path #指定输出文件路径

-jobconf mapred.reduce.tasks=20 #根据情况来指定reduce的个数

#注意：在'-mapper '和'-reducer '参数设定中一定要加上python，表示该文件用python编译执行，否则会出现java.lang.RuntimeException:PipeMapRed.waitOutputThreads()错误。

map.py程序

 
        import sys


        def map_words(lines):
            """Turn lines of text into tab-separated <word, 1> records.

            Args:
                lines: iterable of text lines (for example sys.stdin).

            Yields:
                One string per whitespace-separated word: the word, a tab, then 1.
            """
            for line in lines:
                # Split the input line into words.
                for word in line.strip().split():
                    # Emit the data in <key, value> form.
                    yield '%s\t%s' % (word, 1)


        if __name__ == '__main__':
            # sys.stdin is the standard input.
            for record in map_words(sys.stdin):
                # A single parenthesised argument prints the same on Python 2 and 3.
                print(record)

reduce.py程序

 
        import sys


        def reduce_counts(lines):
            """Sum the counts of consecutive records that share the same word.

            Each line is expected to hold a word, a tab and an integer count. Blank
            lines, lines without a tab and lines whose count is not an integer are
            skipped. Records for one word must be adjacent (the input is sorted by
            key), because only the current word's running total is kept.

            Args:
                lines: iterable of text lines (for example sys.stdin).

            Yields:
                One string per word: the word, a tab, then its total count.
            """
            # Running state for the word currently being accumulated.
            current_word = None
            current_count = 0

            for line in lines:
                line = line.strip()
                # Skip blank lines.
                if line == '':
                    continue
                try:
                    # Both the split and the int conversion raise ValueError on a
                    # malformed record; such a record is skipped.
                    word, count = line.split('\t', 1)
                    count = int(count)
                except ValueError:
                    continue

                # Accumulate, or flush the previous word when the key changes.
                if current_word == word:
                    current_count = current_count + count
                else:
                    if current_word is not None:
                        yield '%s\t%s' % (current_word, current_count)
                    current_word = word
                    current_count = count

            # Flush the last word. Testing against None (instead of comparing with
            # the last parsed word) emits nothing for empty input and still emits
            # the pending total when the final line was skipped.
            if current_word is not None:
                yield '%s\t%s' % (current_word, current_count)


        if __name__ == '__main__':
            # Read the <key, value> records produced by the map step.
            for record in reduce_counts(sys.stdin):
                print(record)

整体的主要流程和java原生的WordCount是一样的。

1. hadoop本身是用java写的，所以用java写mapreduce是比较合适的，然而hadoop提供了Streaming的方式，让很多语言可以来写mapreduce，下面就介绍如何用python写一个mapreduce程序，我们就从最简单的word count写起吧

2. word count是比较简单的，所以我们直接上代码，

3. map.py

[python] view plain copy

#!/usr/bin/env python
# vim: set fileencoding=utf-8
import sys


def read_from_input(file):
    """Yield the list of words found on each line of ``file``.

    Splitting on any whitespace drops the trailing newline and produces no
    empty tokens for repeated spaces, so every yielded item is a clean word.
    """
    for line in file:
        yield line.split()


def main(separator = ' '):
    """Read lines from standard input and print one <word, 1> record per word.

    ``separator`` is kept so existing calls still work; the body does not use
    it. The word and the count are always joined by a tab.
    """
    data = read_from_input(sys.stdin)
    for words in data:
        for word in words:
            # write to the reduce
            print('%s%s%d' % (word, '\t', 1))


if __name__ == '__main__':
    main()

这个还是比较简单的，输入是从标准输入得到的，最后输出到reduce端是<word, 1>的形式，相当于用java写的那个context.write(key, value)

4. red.py

[python] view plain copy

#!/usr/bin/env python
# vim: set fileencoding=utf-8
import sys
from itertools import groupby
from operator import itemgetter


def read_from_mapper(file, separator):
    """Yield ``[word, count]`` pairs parsed from the lines of ``file``.

    Each line is stripped and split once on ``separator``. Lines that do not
    produce exactly two fields (blank lines, lines without the separator) are
    skipped, so every yielded item can be unpacked into a word and a count.
    """
    for line in file:
        fields = line.strip().split(separator, 1)
        if len(fields) == 2:
            yield fields


def main(separator = '\t'):
    """Group the mapper records by word and print each word with its total.

    Records are grouped with ``groupby``, which only merges adjacent items, so
    the input has to arrive sorted by word. A record whose count is not an
    integer is skipped on its own; the other records of that word still count.
    A word with no valid record at all is not printed.
    """
    data = read_from_mapper(sys.stdin, separator)
    for current_word, group in groupby(data, itemgetter(0)):
        total_count = 0
        has_valid_count = False
        for _, count in group:
            try:
                total_count += int(count)
            except ValueError:
                # Non-numeric count: ignore this record only.
                continue
            has_valid_count = True
        if has_valid_count:
            print("%s%s%d" % (current_word, separator, total_count))


if __name__ == '__main__':
    main()

reduce的代码还是稍微有点复杂的，主要他不像用java写那么简便，会直接给你生成<word, list>这样的形式，所以我们就必须自己进行group，这里就用到了python的几个module：itertools和operator，既然他不给我们<word, list>的形式，那么我们就去构造这样的形式，首先输入是从map端过来的，是<word'\t'1>这样的形式，我们用yield将他们组装成generator, 然后 for current_word, group in groupby(data, itemgetter(0))是在data中进行group。key呢就是data里面项的第一个item，其实就是我们的word，最后用了一个简单的生成器表达式统计每个word的个数，最后输出。

5. 我们需要用shell来运行他

[plain] view plain copy

#!/bin/bash
# Submit the Python word-count job through Hadoop Streaming.
# map.py and red.py are shipped with the job via -file and run with python.
streaming_jar=../hadoop-streaming-2.0.0-mr1-cdh4.7.0.jar
input_dir=/user/upsmart/output/qianjc/python/input
output_dir=/user/upsmart/output/qianjc/python/output
job_name="qianjc_test"

hadoop jar "${streaming_jar}" \
    -input "${input_dir}" \
    -output "${output_dir}" \
    -file map.py \
    -file red.py \
    -mapper "python map.py" \
    -reducer "python red.py" \
    -jobconf mapred.reduce.tasks=1 \
    -jobconf mapred.job.name="${job_name}"

指定hadoop-streaming的jar包位置,几个参数的解释如下：

-input hdfs的输入位置

-output 结果写入hdfs的位置

-file 加载辞典，其实就是在运行的时候会将这些file拷贝到每个tasktracker的机器上

-mapper map的代码

-reducer reduce的代码

-jobconf mapred.reduce.tasks 设置reduce的个数

-jobconf mapred.job.name 设置job的名字

6. 本文主要讲了如何用python写简单mapreduce, 学会了这个处理一些简单的问题就比较迅速了，毕竟写脚本是比较快的

其实我们可以不直接在集群中运行，我们可以先看看这2个python写得对不对，我们可以这么测试：

cat xxxx.txt | ./map.py | sort | ./red.py，然后看输出对不对

7. 如果想了解更多hadoop streaming编程可以访问如下链接：

http://dongxicheng.org/mapreduce/hadoop-streaming-programming/

http://dongxicheng.org/mapreduce/hadoop-streaming-advanced-programming/

http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/

http://cs.smith.edu/dftwiki/index.php/Hadoop_Tutorial_2_--_Running_WordCount_in_Python

1. 输入文件：
姓名 年龄（以'\t'分割）
eg：
张三 15
李四 15
张三 16
张三 15

输出：将姓名和年龄相同的归一，并输出其人数
eg：上述输入，输出为：

姓名 年龄 人数（以'\t'分割）
张三 15   2
李四 15   1
张三 16   1

2. map程序：

 
  #include <iostream>
  #include <string>

  using namespace std;

  // Mapper: reads whitespace-separated (name, age) pairs from standard input
  // and writes one tab-separated "name, age, 1" record per pair.
  int main(int argc, char** argv)
  {
      string name, age;

      // Read name and age.
      while (cin >> name >> age)
      {
          // Write name, age and a count of 1.
          cout << name << "\t" << age << "\t" << "1" << endl;
      }
      return 0;
  }
 

编译生成可执行程序：
g++ -o mapper mapper.cpp

3. reducer程序：

 
  #include <iostream>
  #include <map>
  #include <string>

  using namespace std;

  // Reducer: reads whitespace-separated (name, age, count) records from
  // standard input, adds up the counts per (name, age) pair and writes one
  // tab-separated "name, age, total" line per pair.
  int main(int argc, char** argv)
  {
      string key, value;
      int num;

      // Totals keyed by "name<TAB>age".
      map<string, int> count_stat;
      map<string, int>::iterator it_count_stat;

      // Read the records and accumulate them in the map.
      while (cin >> key >> value >> num)
      {
          string tmp_key = key + "\t" + value;

          // operator[] creates a missing entry with value 0. Adding the parsed
          // count (rather than 1) keeps the total right when an incoming record
          // already carries a count larger than 1.
          count_stat[tmp_key] += num;
      }

      // Write the totals.
      for (it_count_stat = count_stat.begin(); it_count_stat != count_stat.end(); it_count_stat++)
      {
          cout << it_count_stat->first << "\t" << it_count_stat->second << endl;
      }
      return 0;
  }
 

编译生成可执行程序：
g++ -o reducer reducer.cpp

4. 测试数据：

5. 单机测试运行：

 
  $ cat test.txt | ./mapper  | ./reducer  
      李四    20    1 
      张三    15    2 
      张三    16    1 
 

6. Hadoop集群运行：
以'\t'作为分隔符，并以前两个字段作为key，reduce任务3个，输入命令：

 
  $ hadoop fs -put test.txt /user/test.txt
  $ hadoop streaming -D stream.map.output.field.separator='\t' \
      -D stream.num.map.output.key.fields=2 \
      -input /user/test.txt \
      -output /user/tmp_1324 \
      -mapper ./mapper -reducer ./reducer \
      -file mapper -file reducer \
      -jobconf mapred.reduce.tasks=3 \
      -jobconf mapred.job.name="c++_test"
 

7.查看输出：

 
  $ hadoop fs -cat /user/tmp_1324/part-00000 
  李四    20      1 
  张三    16      1 
  $ hadoop fs -cat /user/tmp_1324/part-00001 
  $ hadoop fs -cat /user/tmp_1324/part-00002 
  张三    15      2