在项目开发中,经常需要实现两个“表”的 join 操作:其中一个表数据量小,另一个表很大。这种场景在实际中非常常见,比如“订单日志” join “产品信息”,此时可以采用 map 端连接。
原理:适用于大表 + 小表(载入内存)。
map之前执行,加载文件到内存,形成map
可以大大提高join操作的并发度,加快处理速度
1、JoinMapper
package hadoop.join.map;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Map-side join mapper: loads the small table (customers) into an in-memory
 * map once per task in {@link #setup}, then joins every large-table (orders)
 * record against it in {@link #map}. No reducer is needed, which greatly
 * increases the parallelism of the join.
 */
public class JoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // customer id -> full customer record line; populated once in setup()
    private Map<String, String> customers;

    /**
     * Runs once before map(): reads the customers file (HDFS path supplied
     * via the "customers.path" configuration key) into {@link #customers}.
     *
     * @throws IOException if the customers file cannot be opened or read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        customers = new HashMap<String, String>();
        String path = context.getConfiguration().get("customers.path");
        FSDataInputStream in =
                FileSystem.get(context.getConfiguration()).open(new Path(path));
        // Close the reader when done — the original leaked both the reader
        // and the underlying stream.
        BufferedReader br = new BufferedReader(new InputStreamReader(in));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                String[] arr = line.split(",");
                // assumes the first field is the customer id — TODO confirm
                // against customers.txt format
                customers.put(arr[0], line);
            }
        } finally {
            br.close();
        }
    }

    /**
     * Joins one order record with its customer record and emits the combined
     * line. The original source was missing this method (and the class's
     * closing brace) entirely, so the mapper produced no join output.
     *
     * @param key   byte offset of the line (unused)
     * @param value one line of orders.txt
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(",");
        // NOTE(review): assumes the customer id is the LAST field of an order
        // record — verify against the actual orders.txt layout.
        String cid = arr[arr.length - 1];
        String customer = customers.get(cid);
        // Inner join: orders with no matching customer are dropped.
        if (customer != null) {
            context.write(new Text(customer + "," + line), NullWritable.get());
        }
    }
}
2.Apppackage hadoop.join.map; import com.it18zhang.hadoop.lean.key.DataLeanMapper1; import com.it18zhang.hadoop.lean.key.DataLeanMapper2; import com.it18zhang.hadoop.lean.key.DataLeanReducer1; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; /** * join:map端连接 */ public class App { public static void main(String[] args) throws Exception { args = new String[]{"d:/java/mr/join/orders.txt", "d:/java/mr/out", "d:/java/mr/join/customers.txt" } ; Configuration conf = new Configuration(); conf.set("customers.path",args[2]); FileSystem fs = FileSystem.get(conf); if(fs.exists(new Path(args[1]))){ fs.delete(new Path(args[1]),true); } Job job = Job.getInstance(conf); job.setJobName("join-map"); job.setJarByClass(App.class); job.setMapperClass(JoinMapper.class); //添加输入路径 FileInputFormat.addInputPath(job,new Path(args[0])); //设置输出路径 FileOutputFormat.setOutputPath(job,new Path(args[1])); //设置mapreduce输出 job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(NullWritable.class); job.setNumReduceTasks(0); //第一个阶段(job) job.waitForCompletion(true) ; } }