需求:将商品数据和订单数据关联起来
有如下两组数据:
01 小米
02 华为
03 格力
04 8848
05 Dell
......
-----------------------------
201901 01 1
201902 02 2
201903 03 3
201904 01 4
201905 02 5
201906 03 6
201906 04 9
201904 05 4
201905 05 5
201906 03 6
201906 04 9
......
现在需要使用MapJoin将两组数据关联起来,类似SQL的多表关联查询:
1:创建DistributedCacheMapper类
package MapJoin;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * Map-side join ("MapJoin") of the order records against the small product
 * table:
 * 1. Joins two tables without a reduce phase.
 * 2. The small table (pd.txt) is loaded fully into memory.
 * 3. Demonstrates the use of the setup() lifecycle method.
 *
 * Input records (order.txt): {@code orderId pid amount}, space-separated.
 * Output records: {@code orderId \t pname \t amount} as the key, with a
 * {@link NullWritable} value.
 */
public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    /** pid -> pname lookup table, populated once per task in setup(). */
    private final Map<String, String> pdMap = new HashMap<>();

    /** Reused output key to avoid per-record allocation. */
    private final Text k = new Text();

    /**
     * Initialization: load pd.txt into {@link #pdMap} before any map() call.
     *
     * @param context the mapper context
     * @throws IOException          if pd.txt cannot be read
     * @throws InterruptedException per the Mapper contract
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // NOTE(review): path is hard-coded for local runs. On a cluster, read
        // the file shipped via job.addCacheFile() instead (it is symlinked
        // into the task's working directory as "pd.txt").
        // try-with-resources guarantees the reader is closed even when a
        // malformed line throws (the original leaked it on exception).
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File("E:\\bigdata_code\\pd.txt")), StandardCharsets.UTF_8))) {
            String line;
            // Read to end-of-file. The original loop condition
            // StringUtils.isNotEmpty(line = reader.readLine()) silently
            // stopped at the first blank line, truncating the table;
            // here blank lines are skipped instead.
            while ((line = reader.readLine()) != null) {
                if (StringUtils.isEmpty(line)) {
                    continue;
                }
                String[] fields = line.split(" ");
                // fields: product id, product name
                pdMap.put(fields[0], fields[1]);
            }
        }
    }

    /**
     * Processes one order.txt record and emits it joined with the product name.
     *
     * @param key     byte offset of the line (unused)
     * @param value   one order line: {@code orderId pid amount}
     * @param context the mapper context
     * @throws IOException          on write failure
     * @throws InterruptedException per the Mapper contract
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the order line: order id, product id, amount.
        String[] fields = value.toString().split(" ");
        String orderId = fields[0];
        String pid = fields[1];
        String amount = fields[2];

        // Look up the product name; fall back to the raw pid when it is
        // missing from the product table, rather than emitting "null".
        String pname = pdMap.getOrDefault(pid, pid);

        // Join the fields into the output key.
        k.set(orderId + "\t" + pname + "\t" + amount);
        context.write(k, NullWritable.get());
    }
}
2:创建DistributedCacheDriver类
package MapJoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Driver for the map-side join job: configures the mapper, ships the small
 * product table via the distributed cache, and runs with zero reducers.
 */
public class DistributedCacheDriver {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // Only fall back to the hard-coded local paths when no arguments were
        // supplied (the original unconditionally clobbered command-line args).
        if (args.length < 2) {
            args = new String[]{"E:\\bigdata_code\\order.txt", "E:\\bigdata_code\\our1"};
        }

        // 1: Obtain job configuration.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2: Set the jar-containing class so the cluster can locate the code.
        job.setJarByClass(DistributedCacheDriver.class);

        // 3: Wire up the mapper and its output types.
        job.setMapperClass(DistributedCacheMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 4: Input/output paths.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Ship the small table with the job; tasks see it as "pd.txt" in
        // their working directory.
        job.addCacheFile(new URI("file:///e:/bigdata_code/pd.txt"));

        // Pure map-side join: no reduce phase needed.
        job.setNumReduceTasks(0);

        // 5: Submit and propagate success/failure through the exit code
        // (the original discarded waitForCompletion's boolean result).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
运行后输出结果如下:
201901 小米 1
201902 华为 2
201903 格力 3
201904 小米 4
201905 华为 5
201906 格力 6
201906 8848 9
201904 Dell 4
201905 Dell 5
201906 格力 6
201906 8848 9