Hadoop(五)——核心编程MapReduce（下）

最新推荐文章于 2023-06-02 22:15:11 发布

AlbenXie

最新推荐文章于 2023-06-02 22:15:11 发布

阅读量496

点赞数

分类专栏：大数据云服务

大数据云服务专栏收录该内容

31 篇文章 0 订阅

订阅专栏

上篇博客最后我们讲述了WordCount的Hadoop官方源码，主要看map类的编写规则，入参（从文件）出参（经过shuffle，combiner过程给reduce），reduce的编写规则，入参（从map类中获取），出参（想要的结果输出到文件中）。下边我们再进一步通过几个例子（在hadoop实战中摘取），来加深map-reduce的编程规则，至于具体到map，reduce内部，如何处理数据，则涉及到算法，因情况而异。在这里我们学到mapreduce的执行流程，编写规则等即可。

一，数据去重：顾名思义就是找出数据中不同的类别，出现多于一次的数据只显示一次。这个过程用mapreduce实现很简单，直接看代码，注意里边的注释:

[java]view plaincopy 
   
print?
 <span style="font-family:KaiTi_GB2312;font-size:18px;">package job;  
   
 import java.io.IOException;  
   
 import org.apache.hadoop.conf.Configuration;  
 import org.apache.hadoop.fs.Path;  
 import org.apache.hadoop.io.Text;  
 import org.apache.hadoop.mapreduce.Job;  
 import org.apache.hadoop.mapreduce.Mapper;  
 import org.apache.hadoop.mapreduce.Reducer;  
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
 import org.apache.hadoop.util.GenericOptionsParser;  
   
 /** 
  * 一，数据去重的Map-Reduce实例 
  *  
  * @author ljh 
  *  
  */  
 public class Dedup {  
   
     /* 
      * 1,Map将输入中的value复制到输出数据的key上，并直接输出,value值无所谓，利用shuffle这个工程进行key相同汇总 
      */  
     public static class Map extends Mapper<Object, Text, Text, Text> {  
         // 定义line存储每行的数据  
         private static Text line = new Text();  
   
         // map函数直接将value复制给line，然后输出即可  
         public void map(Object key, Text value, Context context)  
                 throws IOException, InterruptedException {  
             line = value;  
             context.write(line, new Text(""));  
         }  
     }  
   
     /* 
      * 2,reduce将输入的key复制到输出数据的key上，并直接输出 
      */  
     public static class Reduce extends Reducer<Text, Text, Text, Text> {  
         // reduce函数，利用shuffle处理好的，直接输出即可，比较简单  
         public void reduce(Text key, Text values, Context context)  
                 throws IOException, InterruptedException {  
             context.write(key, new Text(""));  
         }  
     }  
   
     /* 
      * 3,main方法 
      */  
     public static void main(String[] args) throws Exception {  
         Configuration conf = new Configuration();  
         // 获取输入文件和输出文件的地址  
         String[] otherArgs = new GenericOptionsParser(conf, args)  
                 .getRemainingArgs();  
         if (otherArgs.length != 2) {  
             System.err.println("Usage:  in and out");  
             System.exit(2);  
         }  
   
         Job job = new Job(conf, "Data deduplication");  
         job.setJarByClass(Dedup.class);  
         job.setMapperClass(Map.class);  
         job.setCombinerClass(Reduce.class);  
         job.setReducerClass(Reduce.class);  
         job.setOutputKeyClass(Text.class);  
         job.setOutputValueClass(Text.class);  
         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
         System.exit(job.waitForCompletion(true) ? 0 : 1);  
   
     }  
 }  
 </span>  

二，排序：在SQL中我们利用order by 列名 desc/asc即可完成排序；Java中我们有基本的冒泡排序，选择排序，顺序排序等，还有堆排序，归并排序，基数排序等，这里看两篇文章：http://blog.csdn.NET/ygc87/article/details/7208082 和 http://www.cnblogs.com/liuling/p/2013-7-24-01.html 。而mapreduce本身就带有默认的排序：如果key是封装int的IntWritable，则会按照数字的大小进行排序；如果key是封装String的Text，则会按照字典顺序对字符串进行排序。好，看下边的程序代码：

[java]view plaincopy 
   
print?
 <span style="font-family:KaiTi_GB2312;font-size:18px;">package job;  
   
 import java.io.IOException;  
   
 import org.apache.hadoop.conf.Configuration;  
 import org.apache.hadoop.fs.Path;  
 import org.apache.hadoop.io.IntWritable;  
 import org.apache.hadoop.io.Text;  
 import org.apache.hadoop.mapreduce.Job;  
 import org.apache.hadoop.mapreduce.Mapper;  
 import org.apache.hadoop.mapreduce.Reducer;  
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
 import org.apache.hadoop.util.GenericOptionsParser;  
   
 /** 
  * 二，排序的Map-Reduce实例：排序为一些数字 
  * @author ljh 
  * 
  */  
 public class Sort {  
     /* 
      * 1,map将输入中的value化成IntWritable类型，作为输出的key 
      */  
     public static class Map extends Mapper<Object, Text, IntWritable, IntWritable>{  
         //用来存储数据，由于是数字的排序，直接使用IntWritable即可  
         private static IntWritable data=new IntWritable();  
           
         public void map(Object key,Text value,Context context)throws IOException,InterruptedException{  
             String line=value.toString();  
             data.set(Integer.parseInt(line));  
             context.write(data, new IntWritable(1));  
         }  
     }  
       
     public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{  
         //第一行的行号设置为1  
         private static IntWritable linenum=new IntWritable(1);  
           
         //已经自动排序好了，输出即可，这里我们来设置一下行号自动加1即可  
         public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException,InterruptedException{  
             for(IntWritable val:values){  
                 context.write(linenum, key);  
                 linenum=new IntWritable(linenum.get()+1);  
             }     
         }  
     }  
       
     /* 
      * 3,main方法 
      */  
     public static void main(String[] args) throws Exception {  
         Configuration conf = new Configuration();  
         // 获取输入文件和输出文件的地址  
         String[] otherArgs = new GenericOptionsParser(conf, args)  
                 .getRemainingArgs();  
         if (otherArgs.length != 2) {  
             System.err.println("Usage:  in and out");  
             System.exit(2);  
         }  
   
         Job job = new Job(conf, "Data Sort");  
         job.setJarByClass(Sort.class);  
         job.setMapperClass(Map.class);  
         //这里没有设置CombinerClass，CombinerClass是为了简述网络流量，为了是输出的数据只是必要的，例如上边的去重，先在本地进行去重，再进行传输整合，而这个例子进行的排序是没有必要的。  
         //job.setCombinerClass(Reduce.class);  
         job.setReducerClass(Reduce.class);  
         job.setOutputKeyClass(IntWritable.class);  
         job.setOutputValueClass(IntWritable.class);  
         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
         System.exit(job.waitForCompletion(true) ? 0 : 1);  
   
     }  
 }  
 </span>  

三，单表关联：也就是我们经常见到的一张表中的父子关系，具有树形结构。在Oracle中我们有经典的sql：select * from tb_menu m start with m.id=1 connect by m.parent=prior m.id;

在Java中我们使用递归操作来进行操作。下边通过child-parent表，来寻找grandchild-grandparent关系表，也就是通过父子关系数据，找出爷孙关系的数据对应，好，看下Map-reduce中操作：

[java]view plaincopy 
   
print?
 <span style="font-family:KaiTi_GB2312;font-size:18px;">package job;  
   
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.GenericOptionsParser;
   
 /** 
  * 三，单表关联查询父子关系的Map-Reduce实例：父子关系的表数据 
  *  
  * @author ljh 
  *  
  */  
 public class STJoin {  
     // 判断是否是首行，因为首行是标题：输入文件为child-parent,输出文件为grandchild-grandparent  
     public static int time = 0;  
   
     /* 
      * 1,map将输入分割为child和parent,然后正序输出依次作为右表，反序依次输出作为左表，需要注意在value中加上左表和右表的标识 
      */  
     public static class Map extends Mapper<Object, Text, Text, Text> {  
   
         public void map(Object key, Text value, Context context)  
                 throws IOException, InterruptedException {  
             String childName = new String();// 孩子的名称存储  
             String parentName = new String();// 父亲的名称存储  
             String relationType = new String();// 左表右表的标识1，表示左表，2表示右表  
             String line = value.toString();// 输入的每一行转化为字符串  
               
             int i = 0;  
             //当检索line的第i个字符不为tab制表符使，进行++，也就是为了下边的切割  
             while (line.charAt(i) != '  ') {  
                 i++;  
             }  
               
             //将child和parent放到数组当中，利用上边找到的分割符  
             String[] values = { line.substring(0, i), line.substring(i + 1) };  
   
             //当不是第一行标题的时候  
             if (values[0].compareTo("child") != 0) {  
                 //获取child和parent  
                 childName = values[0];  
                 parentName = values[1];  
                 // 左表，输出，左右表的key是一对父子关系  
                 relationType = "1";  
                 context.write(new Text(values[1]), new Text(relationType + "+"  
                         + childName + "+" + parentName));  
   
                 // 右表，输出  
                 relationType = "2";  
                 context.write(new Text(values[0]), new Text(relationType + "+"  
                         + childName + "+" + parentName));  
             }  
         }  
     }  
   
     /* 
      * 2,reduce:拿到map输出的左右表是一对负责关系表 
      * 左表：{tom,[{1,lucy,tom},{1,lili,tom}] 
      * 右表：{tom,[{2,tom,hali},{2,tom,karry}] 
      * 其实应该是合并好的：{tom,[{1,lucy,tom},{1,lili,tom},{2,tom,hali},{2,tom,karry}] 
      * 所有，我们取tom的父亲，和tom的孩子，就得到了爷孙关系 
      */  
     public static class Reduce extends Reducer<Text, Text, Text, Text> {  
           
         public void reduce(Text key, Iterable<Text> values, Context context)  
                 throws IOException, InterruptedException {  
             //如果是第一行，则输出表头信息  
             if (time == 0) {  
                 context.write(new Text("grandchild"), new Text("grandparent"));  
                 time++;  
             }  
   
             int grandchildnum = 0;//表示孙子的个数  
             String grandchild[] = new String[10];//存放孙子的数组  
             int grandparentnum = 0;//表示爷爷的个数  
             String grandparent[] = new String[10];//存放爷爷的数组  
             Iterator ite = values.iterator();//每一个儿子对应的父亲列表  
               
             //如果有父亲  
             while (ite.hasNext()) {  
                 //取出父亲的集合  
                 String record = ite.next().toString();  
                 //父亲集合的长度  
                 int len = record.length();  
                 int i = 2;  
                 if (len == 0) continue;//结束本次循环  
                   
                 char relationType = record.charAt(0);//取出是父表还是子表  
                 String childname = new String();  
                 String parentname = new String();  
                 //获取value-list中的value的child  
                 while (record.charAt(i) != '+') {  
                     childname = childname + record.charAt(i);  
                     i++;  
                 }  
                 i = i + 1;  
   
                 // 获取value-list中的value的parent  
                 while (i < len) {  
                     parentname = parentname + record.charAt(i);  
                     i++;  
                 }  
                 //如果是父表，取出孩子，如果是子表，取出父亲,组成了爷孙关系  
                 if (relationType == '1') {  
                     grandchild[grandchildnum] = childname;  
                     grandchildnum++;  
                 } else {  
                     grandparent[grandparentnum] = parentname;  
                     grandparentnum++;  
                 }  
             }  
             //爷孙求笛卡尔积，看有多少对组合  
             if (grandparentnum != 0 && grandchildnum != 0) {  
                 for (int m = 0; m < grandchildnum; m++) {  
                     for (int n = 0; n < grandparentnum; n++) {  
                         context.write(new Text(grandchild[m]), new Text(  
                                 grandparent[n]));  
                     }  
                 }  
             }  
         }  
     }  
   
     /* 
      * 3,main方法 
      */  
     public static void main(String[] args) throws Exception {  
         Configuration conf = new Configuration();  
         // 获取输入文件和输出文件的地址  
         String[] otherArgs = new GenericOptionsParser(conf, args)  
                 .getRemainingArgs();  
         if (otherArgs.length != 2) {  
             System.err.println("Usage:  in and out");  
             System.exit(2);  
         }  
   
         Job job = new Job(conf, "single table join");  
         job.setJarByClass(STJoin.class);  
         job.setMapperClass(Map.class);  
         // job.setCombinerClass(Reduce.class);  
         job.setReducerClass(Reduce.class);  
         job.setOutputKeyClass(Text.class);  
         job.setOutputValueClass(Text.class);  
         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  
         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  
         System.exit(job.waitForCompletion(true) ? 0 : 1);  
   
     }  
 }  
 </span>  

四，多表关联：其实上边的单表关联，我们就是将其转化为两张表进行关联的，其中的关键就是将两表中公用的字段设置为key，我们就能得到关于这个key的集合，其中有左表的数据，有右表的数据，然后我们取出来进行笛卡尔积即可。这里不再给出代码，大家好好思考一下，其实多表关联，比单表关联还简单，因为多表不需要我们进行抽象了，单表还需要我们抽象出两张表。

好，这篇简单讲述了map-reduce的几个例子，虽然简单，但却是不得不走的路程。慢慢熟悉编写map-reduce的基本知识，然后结合场景，运用各种算法和灵活的头脑，就能写出在实际场景中有价值的map-reduce程序来。

AlbenXie

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Hadoop(五)——核心编程MapReduce（下）

上篇博客最后我们讲述了WordCount的Hadoop官方源码，主要看map类的编写规则，入参（从文件）出参（经过shuffle，combiner过程给reduce），reduce的编写规则，入参（从map类中获取），出参（想要的结果输出到文件中）。下边我们再进一步通过几个例子（在hadoop实战中摘取），来加深map-reduce的编程规则，至于具体到map，reduce内部，如何处理数据，则涉
复制链接

扫一扫

专栏目录