mapreduce join 算法

一、概述

    对于RDBMS中的join操作大伙一定非常熟悉,写sql的时候要十分注意细节,稍有差池就会耗时巨久造成很大的性能瓶颈,而在Hadoop中使用MapReduce框架进行join的操作时同样耗时,但是由于hadoop的分布式设计理念的特殊性,因此对于这种join操作同样也具备了一定的特殊性。本文主要对MapReduce框架对表之间的join操作的几种实现方式进行详细分析,并且根据我在实际开发过程中遇到的实际例子来进行进一步的说明。

二、实现原理

1、在Reduce端进行连接。

   在Reduce端进行连接是MapReduce框架进行表之间join操作最为常见的模式,其具体的实现原理如下:

Map端的主要工作:为来自不同表(文件)的key/value对打标签以区别不同来源的记录。然后用连接字段作为key,其余部分和新加的标志作为value,最后进行输出。

reduce端的主要工作:在reduce端以连接字段作为key的分组已经完成,我们只需要在每一个分组当中将那些来源于不同文件的记录(在map阶段已经打标志)分开,最后进行笛卡尔积就ok了。原理非常简单,下面来看一个实例:

(1)自定义一个value返回类型:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package  com.mr.reduceSizeJoin;
import  java.io.DataInput;
import  java.io.DataOutput;
import  java.io.IOException;
import  org.apache.hadoop.io.Text;
import  org.apache.hadoop.io.WritableComparable;
public  class  CombineValues  implements  WritableComparable<CombineValues>{
     //private static final Logger logger = LoggerFactory.getLogger(CombineValues.class);
     private  Text joinKey; //链接关键字
     private  Text flag; //文件来源标志
     private  Text secondPart; //除了链接键外的其他部分
     public  void  setJoinKey(Text joinKey) {
         this .joinKey = joinKey;
     }
     public  void  setFlag(Text flag) {
         this .flag = flag;
     }
     public  void  setSecondPart(Text secondPart) {
         this .secondPart = secondPart;
     }
     public  Text getFlag() {
         return  flag;
     }
     public  Text getSecondPart() {
         return  secondPart;
     }
     public  Text getJoinKey() {
         return  joinKey;
     }
     public  CombineValues() {
         this .joinKey =   new  Text();
         this .flag =  new  Text();
         this .secondPart =  new  Text();
     }
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
     @Override
     public  void  write(DataOutput out)  throws  IOException {
         this .joinKey.write(out);
         this .flag.write(out);
         this .secondPart.write(out);
     }
     @Override
     public  void  readFields(DataInput in)  throws  IOException {
         this .joinKey.readFields(in);
         this .flag.readFields(in);
         this .secondPart.readFields(in);
     }
     @Override
     public  int  compareTo(CombineValues o) {
         return  this .joinKey.compareTo(o.getJoinKey());
     }
     @Override
     public  String toString() {
         // TODO Auto-generated method stub
         return  "[flag=" + this .flag.toString()+ ",joinKey=" + this .joinKey.toString()+ ",secondPart=" + this .secondPart.toString()+ "]" ;
     }
}

(2)map、reduce主体代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
package  com.mr.reduceSizeJoin;
import  java.io.IOException;
import  java.util.ArrayList;
import  org.apache.hadoop.conf.Configuration;
import  org.apache.hadoop.conf.Configured;
import  org.apache.hadoop.fs.Path;
import  org.apache.hadoop.io.Text;
import  org.apache.hadoop.mapreduce.Job;
import  org.apache.hadoop.mapreduce.Mapper;
import  org.apache.hadoop.mapreduce.Reducer;
import  org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import  org.apache.hadoop.mapreduce.lib.input.FileSplit;
import  org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import  org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import  org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import  org.apache.hadoop.util.Tool;
import  org.apache.hadoop.util.ToolRunner;
import  org.slf4j.Logger;
import  org.slf4j.LoggerFactory;
/**
  * @author zengzhaozheng
  * 用途说明:
  * reudce side join中的left outer join
  * 左连接,两个文件分别代表2个表,连接字段table1的id字段和table2的cityID字段
  * table1(左表):tb_dim_city(id int,name string,orderid int,city_code,is_show)
  * tb_dim_city.dat文件内容,分隔符为"|":
  * id     name  orderid  city_code  is_show
  * 0       其他        9999     9999         0
  * 1       长春        1        901          1
  * 2       吉林        2        902          1
  * 3       四平        3        903          1
  * 4       松原        4        904          1
  * 5       通化        5        905          1
  * 6       辽源        6        906          1
  * 7       白城        7        907          1
  * 8       白山        8        908          1
  * 9       延吉        9        909          1
  * -------------------------风骚的分割线-------------------------------
  * table2(右表):tb_user_profiles(userID int,userName string,network string,double flow,cityID int)
  * tb_user_profiles.dat文件内容,分隔符为"|":
  * userID   network     flow    cityID
  * 1           2G       123      1
  * 2           3G       333      2
  * 3           3G       555      1
  * 4           2G       777      3
  * 5           3G       666      4
  *
  * -------------------------风骚的分割线-------------------------------
  *  结果:
  *  1   长春  1   901 1   1   2G  123
  *  1   长春  1   901 1   3   3G  555
  *  2   吉林  2   902 1   2   3G  333
  *  3   四平  3   903 1   4   2G  777
  *  4   松原  4   904 1   5   3G  666
  */
public  class  ReduceSideJoin_LeftOuterJoin  extends  Configured  implements  Tool{
     private  static  final  Logger logger = LoggerFactory.getLogger(ReduceSideJoin_LeftOuterJoin. class );
     public  static  class  LeftOutJoinMapper  extends  Mapper<Object, Text, Text, CombineValues> {
         private  CombineValues combineValues =  new  CombineValues();
         private  Text flag =  new  Text();
         private  Text joinKey =  new  Text();
         private  Text secondPart =  new  Text();
         @Override
         protected  void  map(Object key, Text value, Context context)
                 throws  IOException, InterruptedException {
             //获得文件输入路径
             String pathName = ((FileSplit) context.getInputSplit()).getPath().toString();
             //数据来自tb_dim_city.dat文件,标志即为"0"
             if (pathName.endsWith( "tb_dim_city.dat" )){
                 String[] valueItems = value.toString().split( "\\|" );
                 //过滤格式错误的记录
                 if (valueItems.length !=  5 ){
                     return ;
                 }
                 flag.set( "0" );
                 joinKey.set(valueItems[ 0 ]);
                 secondPart.set(valueItems[ 1 ]+ "\t" +valueItems[ 2 ]+ "\t" +valueItems[ 3 ]+ "\t" +valueItems[ 4 ]);
                 combineValues.setFlag(flag);
                 combineValues.setJoinKey(joinKey);
                 combineValues.setSecondPart(secondPart);
                 context.write(combineValues.getJoinKey(), combineValues);
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
             } //数据来自于tb_user_profiles.dat,标志即为"1"
             else  if (pathName.endsWith( "tb_user_profiles.dat" )){
                 String[] valueItems = value.toString().split( "\\|" );
                 //过滤格式错误的记录
                 if (valueItems.length !=  4 ){
                     return ;
                 }
                 flag.set( "1" );
                 joinKey.set(valueItems[ 3 ]);
                 secondPart.set(valueItems[ 0 ]+ "\t" +valueItems[ 1 ]+ "\t" +valueItems[ 2 ]);
                 combineValues.setFlag(flag);
                 combineValues.setJoinKey(joinKey);
                 combineValues.setSecondPart(secondPart);
                 context.write(combineValues.getJoinKey(), combineValues);
             }
         }
     }
     public  static  class  LeftOutJoinReducer  extends  Reducer<Text, CombineValues, Text, Text> {
         //存储一个分组中的左表信息
         private  ArrayList<Text> leftTable =  new  ArrayList<Text>();
         //存储一个分组中的右表信息
         private  ArrayList<Text> rightTable =  new  ArrayList<Text>();
         private  Text secondPar =  null ;
         private  Text output =  new  Text();
         /**
          * 一个分组调用一次reduce函数
          */
         @Override
         protected  void  reduce(Text key, Iterable<CombineValues> value, Context context)
                 throws  IOException, InterruptedException {
             leftTable.clear();
             rightTable.clear();
             /**
              * 将分组中的元素按照文件分别进行存放
              * 这种方法要注意的问题:
              * 如果一个分组内的元素太多的话,可能会导致在reduce阶段出现OOM,
              * 在处理分布式问题之前最好先了解数据的分布情况,根据不同的分布采取最
              * 适当的处理方法,这样可以有效的防止导致OOM和数据过度倾斜问题。
              */
             for (CombineValues cv : value){
                 secondPar =  new  Text(cv.getSecondPart().toString());
                 //左表tb_dim_city
                 if ( "0" .equals(cv.getFlag().toString().trim())){
                     leftTable.add(secondPar);
                 }
                 //右表tb_user_profiles
                 else  if ( "1" .equals(cv.getFlag().toString().trim())){
                     rightTable.add(secondPar);
                 }
             }
             logger.info( "tb_dim_city:" +leftTable.toString());
             logger.info( "tb_user_profiles:" +rightTable.toString());
             for (Text leftPart : leftTable){
                 for (Text rightPart : rightTable){
                     output.set(leftPart+  "\t"  + rightPart);
                     context.write(key, output);
                 }
             }
         }
     }
     @Override
     public  int  run(String[] args)  throws  Exception {
           Configuration conf=getConf();  //获得配置文件对象
             Job job= new  Job(conf, "LeftOutJoinMR" );
             job.setJarByClass(ReduceSideJoin_LeftOuterJoin. class );
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
             FileInputFormat.addInputPath(job,  new  Path(args[ 0 ]));  //设置map输入文件路径
             FileOutputFormat.setOutputPath(job,  new  Path(args[ 1 ]));  //设置reduce输出文件路径
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
             job.setMapperClass(LeftOutJoinMapper. class );
             job.setReducerClass(LeftOutJoinReducer. class );
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
             job.setInputFormatClass(TextInputFormat. class );  //设置文件输入格式
             job.setOutputFormatClass(TextOutputFormat. class ); //使用默认的output格式
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
             //设置map的输出key和value类型
             job.setMapOutputKeyClass(Text. class );
             job.setMapOutputValueClass(CombineValues. class );
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
             //设置reduce的输出key和value类型
             job.setOutputKeyClass(Text. class );
             job.setOutputValueClass(Text. class );
             job.waitForCompletion( true );
             return  job.isSuccessful()? 0 : 1 ;
     }
     public  static  void  main(String[] args)  throws  IOException,
             ClassNotFoundException, InterruptedException {
         try  {
             int  returnCode =  ToolRunner.run( new  ReduceSideJoin_LeftOuterJoin(),args);
             System.exit(returnCode);
         catch  (Exception e) {
             // TODO Auto-generated catch block
             logger.error(e.getMessage());
         }
     }
}

其中具体的分析以及数据的输入输出请看代码中的注释,已经写得比较清楚了,这里主要分析一下reduce join的一些不足。之所以会存在reduce join这种方式,我们可以很明显地看出原因:整体数据被分割了,每个map task只处理一部分数据,不能够获取到所有需要的join字段,因此我们需要将join key作为reduce端的分组依据,把所有join key相同的记录集中起来进行处理,所以reduce join这种方式就出现了。这种方式的缺点很明显:会造成map端和reduce端之间(也就是shuffle阶段)出现大量的数据传输,效率很低。

2、在Map端进行连接。

   使用场景:一张表十分小、一张表很大。

   用法:在提交作业的时候先将小表文件放到该作业的DistributedCache中,然后从DistributedCache中取出该小表,按join key / value解析分割后放到内存中(可以放到HashMap等容器中)。然后扫描大表,看大表中每条记录的join key值是否能够在内存中找到相同join key的记录,如果有则直接输出结果。

直接上代码,比较简单:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
package  com.mr.mapSideJoin;
import  java.io.BufferedReader;
import  java.io.FileReader;
import  java.io.IOException;
import  java.util.HashMap;
import  org.apache.hadoop.conf.Configuration;
import  org.apache.hadoop.conf.Configured;
import  org.apache.hadoop.filecache.DistributedCache;
import  org.apache.hadoop.fs.Path;
import  org.apache.hadoop.io.Text;
import  org.apache.hadoop.mapreduce.Job;
import  org.apache.hadoop.mapreduce.Mapper;
import  org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import  org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import  org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import  org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import  org.apache.hadoop.util.Tool;
import  org.apache.hadoop.util.ToolRunner;
import  org.slf4j.Logger;
import  org.slf4j.LoggerFactory;
/**
  * @author zengzhaozheng
  *
  * 用途说明:
  * Map side join中的left outer join
  * 左连接,两个文件分别代表2个表,连接字段table1的id字段和table2的cityID字段
  * table1(左表):tb_dim_city(id int,name string,orderid int,city_code,is_show),
  * 假设tb_dim_city文件记录数很少,tb_dim_city.dat文件内容,分隔符为"|":
  * id     name  orderid  city_code  is_show
  * 0       其他        9999     9999         0
  * 1       长春        1        901          1
  * 2       吉林        2        902          1
  * 3       四平        3        903          1
  * 4       松原        4        904          1
  * 5       通化        5        905          1
  * 6       辽源        6        906          1
  * 7       白城        7        907          1
  * 8       白山        8        908          1
  * 9       延吉        9        909          1
  * -------------------------风骚的分割线-------------------------------
  * table2(右表):tb_user_profiles(userID int,userName string,network string,double flow,cityID int)
  * tb_user_profiles.dat文件内容,分隔符为"|":
  * userID   network     flow    cityID
  * 1           2G       123      1
  * 2           3G       333      2
  * 3           3G       555      1
  * 4           2G       777      3
  * 5           3G       666      4
  * -------------------------风骚的分割线-------------------------------
  *  结果:
  *  1   长春  1   901 1   1   2G  123
  *  1   长春  1   901 1   3   3G  555
  *  2   吉林  2   902 1   2   3G  333
  *  3   四平  3   903 1   4   2G  777
  *  4   松原  4   904 1   5   3G  666
  */
public  class  MapSideJoinMain  extends  Configured  implements  Tool{
     private  static  final  Logger logger = LoggerFactory.getLogger(MapSideJoinMain. class );
     public  static  class  LeftOutJoinMapper  extends  Mapper<Object, Text, Text, Text> {
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
         private  HashMap<String,String> city_info =  new  HashMap<String, String>();
         private  Text outPutKey =  new  Text();
         private  Text outPutValue =  new  Text();
         private  String mapInputStr =  null ;
         private  String mapInputSpit[] =  null ;
         private  String city_secondPart =  null ;
         /**
          * 此方法在每个task开始之前执行,这里主要用作从DistributedCache
          * 中取到tb_dim_city文件,并将里边记录取出放到内存中。
          */
         @Override
         protected  void  setup(Context context)
                 throws  IOException, InterruptedException {
             BufferedReader br =  null ;
             //获得当前作业的DistributedCache相关文件
             Path[] distributePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
             String cityInfo =  null ;
             for (Path p : distributePaths){
                 if (p.toString().endsWith( "tb_dim_city.dat" )){
                     //读缓存文件,并放到mem中
                     br =  new  BufferedReader( new  FileReader(p.toString()));
                     while ( null !=(cityInfo=br.readLine())){
                         String[] cityPart = cityInfo.split( "\\|" , 5 );
                         if (cityPart.length == 5 ){
                             city_info.put(cityPart[ 0 ], cityPart[ 1 ]+ "\t" +cityPart[ 2 ]+ "\t" +cityPart[ 3 ]+ "\t" +cityPart[ 4 ]);
                         }
                     }
                 }
             }
         }
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
         /**
          * Map端的实现相当简单,直接判断tb_user_profiles.dat中的
          * cityID是否存在我的map中就ok了,这样就可以实现Map Join了
          */
         @Override
         protected  void  map(Object key, Text value, Context context)
                 throws  IOException, InterruptedException {