MapReduce Knowledge Points (3)

Reduce-Side Join

Source data:

// orders table
1001,20150710,p0001,2
1002,20150710,p0002,3
// product table
p0001,小米5,1000,2000
p0002,锤子T1,1000,3000

Join the two tables on the product id; the desired result:

key			value
p0001		p0001,小米5,1000,2000	1001,20150710,p0001,2
p0002		p0002,锤子T1,1000,3000	1002,20150710,p0002,3
  1. Mapper class

    In map(), records from the two input files are transformed differently: the product id is taken from whichever field holds it in that file and used as the output key.

    package com.bigdata.mapreduce.reducejoin;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    import java.io.IOException;
    
    public class j_mapper extends
            Mapper<LongWritable, Text,Text,Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // 1. Determine which input file this record comes from
            FileSplit filesplit = (FileSplit) context.getInputSplit();
            String name = filesplit.getPath().getName();
            // 2. Extract the product id as the key and write (product id, whole line)
            if(name.equals("orders.txt")){
                Text text = new Text(value.toString().split(",")[2]);
                context.write(text,value);
            }else{
                Text text = new Text(value.toString().split(",")[0]);
                context.write(text,value);
            }
        }
    }
    
    
  2. Reducer class

    Concatenate the grouped records for each product id (a tagged-value refinement is sketched after this list).

    package com.bigdata.mapreduce.reducejoin;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    public class j_reducer extends
            Reducer<Text,Text,Text,Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String first = "";
            for (Text value : values){
                first = first + value + "\t";
            }
            context.write(key,new Text(first));
        }
    }
    
    
  3. jobmain

    package com.bigdata.mapreduce.reducejoin;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class join_jobmain extends Configured implements Tool {
        @Override
        public int run(String[] strings) throws Exception {
            // Create the job object
            Job job = Job.getInstance(super.getConf(), "join");
            // Configure the job
            // Input format and input path
            job.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(job,new Path("file:///D:\\mapreduce_demo\\double"));
            // Mapper class and K2, V2 types
            job.setMapperClass(j_mapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            // Partitioning, sorting, combining and grouping: defaults are used
    
            // Reducer class and K3, V3 types
            job.setReducerClass(j_reducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            // Output format and output path
            job.setOutputFormatClass(TextOutputFormat.class);
            TextOutputFormat.setOutputPath(job,new Path("file:///D:\\mapreduce_demo\\joinresult"));
            // Wait for the job to finish
            boolean flag = job.waitForCompletion(true);
            return flag?0:1;
        }
    
        public static void main(String[] args) throws Exception {
            Configuration entries = new Configuration();
            int run = ToolRunner.run(entries, new join_jobmain(), args);
            System.exit(run);
        }
    }
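
Note that j_reducer concatenates the grouped values in whatever order they arrive, and MapReduce does not guarantee that the product row comes before the order rows. A common refinement, shown below as a minimal sketch (not part of the original code; the class name and the "p#"/"o#" prefixes are illustrative), is to tag each value in the mapper, e.g. write new Text("p#" + value) for product rows and new Text("o#" + value) for order rows, so the reducer can always emit the product row first:

    package com.bigdata.mapreduce.reducejoin;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    // Hedged sketch: assumes the mapper prefixed product rows with "p#" and order rows with "o#"
    public class j_reducer_tagged extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String product = "";
            List<String> orders = new ArrayList<>();
            for (Text value : values) {
                String v = value.toString();
                if (v.startsWith("p#")) {
                    product = v.substring(2);    // the single product row for this key
                } else {
                    orders.add(v.substring(2));  // one or more order rows
                }
            }
            // Emit one joined line per order, with the product columns first
            for (String order : orders) {
                context.write(key, new Text(product + "\t" + order));
            }
        }
    }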
    
    

Map-Side Join

  1. Perform the join in the mapper by overriding setup() and map() (a Hadoop-free sketch of the same idea follows this list)

    package com.bigdata.mapreduce.mapjoin;
    
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URI;
    import java.util.HashMap;
    
    public class mj_mapper extends
            Mapper<LongWritable,Text,Text, Text> {
        private HashMap<String, String> map = new HashMap<>();
        // 1. setup(): read the small table from the distributed cache into an in-memory map
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // 1. Get the list of distributed cache files
            URI[] cacheFiles = context.getCacheFiles();
            // 2. Get the file system of the cached file
            FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
            // 3. Open an input stream on the file
            FSDataInputStream open = fileSystem.open(new Path(cacheFiles[0]));
            // 4. Read the file line by line and store it in the map, keyed by product id
            //    (wrap the byte stream in a buffered character reader)
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open));
            String line = null;
            while((line = bufferedReader.readLine()) != null) {
                String[] split = line.split(",");
                map.put(split[0],line);
            }
            // 5. Close the streams
            bufferedReader.close();
            fileSystem.close();
        }
        // 2. map(): process the big table and join each record against the cached small table
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split the big-table (orders) record; index 2 is the product id
            String[] split = value.toString().split(",");
            String s = split[2];
            // Join: look up the cached product row for this product id
            String lineData = map.get(s);
            String data = lineData + "\t" + value.toString();
            context.write(new Text(s),new Text(data));
    
        }
    }
    
  2. jobmain

    package com.bigdata.mapreduce.mapjoin;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    import java.net.URI;
    
    public class mj_jobmain extends Configured implements Tool {
        @Override
        public int run(String[] strings) throws Exception {
            // Create the job object
            Job job = Job.getInstance(super.getConf(), "mapjoin");
            // Configure the job
            // Put the small table in the distributed cache
            job.addCacheFile(new URI("hdfs://"));
            // Input format and input path (the big table only)
            job.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(job, new Path("file:///D:\\mapreduce_demo\\double\\orders.txt"));
            // Mapper class and K2, V2 types
            job.setMapperClass(mj_mapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            // Partitioning, sorting, combining, grouping and the reducer are not needed:
            // the join is completed entirely in the mapper
    
            // Output format and output path
            job.setOutputFormatClass(TextOutputFormat.class);
            TextOutputFormat.setOutputPath(job,new Path("file:///D:\\mapreduce_demo\\mapjoin"));
            // Wait for the job to finish
            boolean flag = job.waitForCompletion(true);
            return flag?0:1;
    
        }
    
        public static void main(String[] args) throws Exception {
            Configuration entries = new Configuration();
            int run = ToolRunner.run(entries, new mj_jobmain(), args);
            System.exit(run);
        }
    }
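
The core of this map-side (broadcast hash) join can be seen without any Hadoop machinery: load the small table into an in-memory map, then stream the large table and join each record with a single lookup. The following is a minimal, self-contained sketch using the sample rows from this article (the class and variable names are illustrative only, not part of the original code):

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    public class MapJoinSketch {
        public static void main(String[] args) {
            // Small table (product) and large table (orders), as in the example data
            List<String> product = Arrays.asList(
                    "p0001,小米5,1000,2000",
                    "p0002,锤子T1,1000,3000");
            List<String> orders = Arrays.asList(
                    "1001,20150710,p0001,2",
                    "1002,20150710,p0002,3");
    
            // setup(): cache the small table in memory, keyed by product id
            Map<String, String> cache = new HashMap<>();
            for (String line : product) {
                cache.put(line.split(",")[0], line);
            }
    
            // map(): join each order against the cached product row
            for (String order : orders) {
                String pid = order.split(",")[2];
                String productLine = cache.get(pid);
                if (productLine != null) {            // skip orders with no matching product
                    System.out.println(pid + "\t" + productLine + "\t" + order);
                }
            }
        }
    }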
    
    

Case Study

Requirement: find the common friends of every pair of people.

Source data (each line is a person, a colon, and that person's friend list):

A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

Approach: use two MapReduce jobs run one after the other (a chaining sketch follows the code listings).

Job 1, mapper: the person on the left of the colon becomes V2, and every friend on the right of the colon becomes a K2, so each line is turned into (friend, person) pairs.

Job 1, reducer: all people sharing the same K2 (i.e. everyone who lists that friend) are grouped; the group is joined with "-", the joined string becomes K3, and the original K2 (the friend) becomes V3.

Job 2, mapper: read the output of job 1, split each line on \t, sort the people in job 1's K3 so every pair appears in a canonical order, combine them pairwise to form K2, and use job 1's V3 (the shared friend) as V2.

Job 2, reducer: K2 (the pair of people) becomes K3; iterate over the grouped V2 values and join them with "-" to produce that pair's list of common friends.
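
As a hedged illustration using the sample data (the order in which grouped values arrive is not guaranteed, so real output may list names in a different order): the first mapper turns A:B,C,D,F,E,O into the pairs (B, A), (C, A), (D, A), (F, A), (E, A), (O, A). After grouping, the key C collects the people A, B, E, F, G, H, K, so job 1 outputs a line like A-B-E-F-G-H-K as the key with C as the value (separated by a tab). The second mapper splits and sorts that list and emits every pair with C as the value, e.g. (A-B, C), (A-E, C), and so on; the second reducer finally concatenates all common friends of each pair, e.g. A-B with value C-E.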

Code:

  1. mapper1

    package com.bigdata.mapreduce.firends;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    public class setpMapper1 extends
            Mapper<LongWritable, Text,Text,Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split on the colon; index 0 (the person) is V2
            String[] split = value.toString().split(":");
            // Split index 1 on commas; each friend becomes a K2
            for (String i : split[1].split(",")){
                context.write(new Text(i),new Text(split[0]));
            }
        }
    }
    
  2. reducer1

    package com.bigdata.mapreduce.firends;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    public class setpReducer1 extends
            Reducer<Text,Text,Text,Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder builder = new StringBuilder();
            for (Text value : values){
                builder.append(value).append("-");
            }
            int length = builder.toString().length()-1;
            String k3 = builder.toString().substring(0, length);
            context.write(new Text(k3),key);
        }
    }
    
    
  3. jobmain1

    package com.bigdata.mapreduce.firends;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class septjobmain1 extends Configured implements Tool {
        @Override
        public int run(String[] strings) throws Exception {
            Job job = Job.getInstance(super.getConf(), "sept1");
            job.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(job,new Path("file:///D:\\mapreduce_demo\\input\\firends.txt"));
            job.setMapperClass(setpMapper1.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(setpReducer1.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            TextOutputFormat.setOutputPath(job,new Path("file:///D:\\mapreduce_demo\\firends"));
            boolean flag = job.waitForCompletion(true);
            return flag?0:1;
        }
    
        public static void main(String[] args) throws Exception {
            Configuration entries = new Configuration();
            int run = ToolRunner.run(entries, new septjobmain1(), args);
            System.exit(run);
        }
    }
    
    
  4. mapper2

    package com.bigdata.mapreduce.firends;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    import java.util.Arrays;
    
    public class septmapper2 extends
            Mapper<LongWritable, Text,Text,Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split the line on \t, then split the people list on "-" and sort it
            String[] split1 = value.toString().split("\t");
            String[] split = split1[0].split("-");
            Arrays.sort(split);
            // Combine the people pairwise and write each pair with the shared friend
            for (int i = 0; i < split.length ; i++) {
                for (int j = i+1; j < split.length ; j++) {
                    String k2 = split[i] + "-" + split[j];
                    context.write(new Text(k2),new Text(split1[1]));
                }
            }
        }
    }
    
    
  5. reducer2

    package com.bigdata.mapreduce.firends;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    public class septreducer2 extends
            Reducer<Text,Text,Text,Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder builder = new StringBuilder();
            for (Text value : values){
                builder.append(value).append("-");
            }
            int length = builder.toString().length()-1;
            String k3 = builder.toString().substring(0, length);
            context.write(key,new Text(k3));
        }
    }
    
    
  6. jobmain2

    package com.bigdata.mapreduce.firends;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class septjobmain2 extends Configured implements Tool {
    
        @Override
        public int run(String[] strings) throws Exception {
            Job job = Job.getInstance(super.getConf(), "sept2");
            job.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(job,new Path("file:///D:\\mapreduce_demo\\firends\\part-r-00000"));
            job.setMapperClass(septmapper2.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(septreducer2.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            TextOutputFormat.setOutputPath(job,new Path("file:///D:\\mapreduce_demo\\firends2"));
            boolean flag = job.waitForCompletion(true);
            return flag?0:1;
        }
    
        public static void main(String[] args) throws Exception {
            Configuration entries = new Configuration();
            int run = ToolRunner.run(entries, new septjobmain2(), args);
            System.exit(run);
        }
    }
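
Because job 2 reads the output directory of job 1, the second job must only start after the first one has finished successfully. A minimal chaining sketch (an assumption, not part of the original article; it reuses the two driver classes above, and the class name FriendsDriver is illustrative):

    package com.bigdata.mapreduce.firends;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;
    
    public class FriendsDriver {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // Step 1: (friend, person) pairs grouped into "people list -> friend"
            int step1 = ToolRunner.run(conf, new septjobmain1(), args);
            if (step1 != 0) {
                System.exit(step1);   // stop if the first job failed
            }
            // Step 2: pairwise combinations grouped into "pair -> common friends"
            int step2 = ToolRunner.run(conf, new septjobmain2(), args);
            System.exit(step2);
        }
    }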
    
    

