Big Data Study Notes - 2020-09-19 - Three MapReduce Cases (Inverted Index, Top 10, Common Friends)

Case 1: Inverted Index (multiple jobs chained together)

  • Requirement

    There is a large amount of text, and a search index needs to be built over it.

    • Input data

      • File 1

        E:\work\test\input\II\a.txt

        Remilya Scarlet jiejie
        Frandre Scarlet meimei
        Scarlet
        
      • File 2

        E:\work\test\input\II\b.txt

        Remilya Scarlet
        weiyan weiyan weiyan weiyan
        weiyan weiyan weiyan weiyan
        weiyan weiyan weiyan weiyan
        jiejie jiejie jiejie
        
      • File 3

        E:\work\test\input\II\c.txt

        Frandre Scarlet meimei
        meimei meimei meimei
        Scarlet
        Scarlet
        
    • Expected output

      • Each word is the output key; its count in each file is joined with that file's name, and all of these pieces are concatenated together as the value

        Frandre	a.txt---1,c.txt---1,
        Remilya	a.txt---1,b.txt---1,
        Scarlet	c.txt---3,b.txt---1,a.txt---3,
        jiejie	b.txt---3,a.txt---1,
        meimei	a.txt---1,c.txt---4,
        weiyan	b.txt---12,c.txt---1,a.txt---2,
        
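  • Approach

    • Two jobs are chained. Job 1 is essentially a word count keyed on "word-filename", so its intermediate output lines look roughly like:

      Remilya-a.txt	1
      Scarlet-a.txt	3

    • Job 2 then reads those lines with KeyValueTextInputFormat, splitting each line at the first "-": the word becomes the key and the rest ("a.txt" plus a tab and the count) becomes the value. Its mapper rewrites that value into the "a.txt---1" form, and its reducer concatenates all values of the same word.
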
  • Implementation

    mapper1:

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    import java.io.IOException;
    
    /**
     * Essentially a word count with a small twist
     */
    public class Mapper1 extends
            Mapper<LongWritable, Text,Text, IntWritable> {
        private String fileName;
    
        /**
         * Get the name of the file this split comes from
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            fileName = split.getPath().getName();
        }
    
        private Text k_out = new Text();
        private IntWritable v_out = new IntWritable(1);
    
        /**
         * Split the input line and append the file name to each word; the result is the output key
         */
        @Override
        protected void map(LongWritable key,
                           Text value, Context context)
                throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String word : words) {
                k_out.set(word+"-"+fileName);
                context.write(k_out,v_out);
            }
        }
    }
    

    reducer1

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    /**
     * Sum the counts for each key
     */
    public class Reducer1 extends Reducer<Text, IntWritable,Text,IntWritable> {
        private IntWritable v_out = new IntWritable();
        /**
         * Simply accumulate the values
         */
        @Override
        protected void reduce(Text key,
                              Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            v_out.set(sum);
            context.write(key,v_out);
        }
    }
    

    mapper2

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    /**
     * Uses KeyValueTextInputFormat; the key and value are separated by "-"
     */
    public class Mapper2 extends Mapper<Text, Text, Text, Text> {
        /**
         * The identity map inherited from Mapper would also work here;
         * we only change the value's separator, which does not affect the logic
         */
        private Text v_out = new Text();
        @Override
        protected void map(Text key,
                           Text value, Context context)
                throws IOException, InterruptedException {
            // change the value's separator
            String[] v = value.toString().split("\t");
            v_out.set(v[0]+"---"+v[1]);
            context.write(key,v_out);
        }
    }
    

    reducer2

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    /**
     * Concatenate all values of the same key
     * to produce the expected output
     */
    public class Reducer2 extends Reducer<Text, Text, Text, Text> {
        /**
         * Concatenate the values
         */
        private Text v_out = new Text();
        @Override
        protected void reduce(Text key,
                              Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                sb.append(value.toString()).append(",");
            }
            v_out.set(sb.toString());
            context.write(key,v_out);
        }
    }
    

    driver

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
    import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
    import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    
    /**
     * The driver wires the two jobs together
     */
    public class MyDriver {
        public static void main(String[] args) throws IOException, InterruptedException {
            // input and output paths
            Path input = new Path("e:/work/test/input/II");
            Path output1 = new Path("e:/work/test/output1");
            Path output2 = new Path("e:/work/test/output2");
    
            // two separate configurations, one per job
            Configuration conf1 = new Configuration();
            Configuration conf2 = new Configuration();
            // set the key-value separator for job2's KeyValueTextInputFormat
            conf2.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator","-");
    
            // delete the output paths if they already exist
            FileSystem fs = FileSystem.get(conf1);
            if(fs.exists(output1)){
                fs.delete(output1,true);
            }
            if(fs.exists(output2)){
                fs.delete(output2,true);
            }
    
            // create the two jobs
            Job job1 = Job.getInstance(conf1);
            Job job2 = Job.getInstance(conf2);
    
            /**************************************************/
            // configure job1
            job1.setMapperClass(Mapper1.class);
            job1.setReducerClass(Reducer1.class);
    
            job1.setOutputKeyClass(Text.class);
            job1.setOutputValueClass(IntWritable.class);
    
            FileInputFormat.setInputPaths(job1,input);
            FileOutputFormat.setOutputPath(job1,output1);
            /**************************************************/
            // configure job2
            job2.setMapperClass(Mapper2.class);
            job2.setReducerClass(Reducer2.class);
    
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(Text.class);
    
            job2.setInputFormatClass(KeyValueTextInputFormat.class);
    
            FileInputFormat.setInputPaths(job2,output1);
            FileOutputFormat.setOutputPath(job2,output2);
            /**************************************************/
    
            // set the job names
            job1.setJobName("job1");
            job2.setJobName("job2");
    
            // tie the two jobs together
            JobControl jobControl = new JobControl("jobs");
    
            // wrap each job in a ControlledJob so their order can be controlled
            ControlledJob controlledJob1 = new ControlledJob(job1.getConfiguration());
            ControlledJob controlledJob2 = new ControlledJob(job2.getConfiguration());
    
            // make controlledJob2 depend on controlledJob1 so it runs after it
            controlledJob2.addDependingJob(controlledJob1);
    
            jobControl.addJob(controlledJob1);
            jobControl.addJob(controlledJob2);
    
            // run the JobControl in its own thread
            Thread jobControlThread = new Thread(jobControl);
            // mark it as a daemon thread so it will not keep the JVM alive
            jobControlThread.setDaemon(true);
            jobControlThread.start();
            // poll until both jobs have finished, sleeping to avoid a busy loop
            while (!jobControl.allFinished()) {
                Thread.sleep(500);
            }
            jobControl.stop();
        }
    }
    

Case 2: Top-N

  • Requirement

    • Input data

      E:\work\test\input\top10\top10inputfile.txt

      13470253144	180	180	360
      13509468723	7335	110349	117684
      13560439638	918	4938	5856
      13568436656	3597	25635	29232
      13590439668	1116	954	2070
      13630577991	6960	690	7650
      13682846555	1938	2910	4848
      13729199489	240	0	240
      13736230513	2481	24681	27162
      13768778790	120	120	240
      13846544121	264	0	264
      13956435636	132	1512	1644
      13966251146	240	0	240
      13975057813	11058	48243	59301
      13992314666	3008	3720	6728
      15043685818	3659	3538	7197
      15910133277	3156	2936	6092
      15959002129	1938	180	2118
      18271575951	1527	2106	3633
      18390173782	9531	2412	11943
      84188413	4116	1432	5548
      

      The columns are, in order: phone number, upstream traffic, downstream traffic, and total traffic.

      The task is to find the top 10 phone numbers by traffic used.

    • Sort in descending order of total traffic, and use a counter on output so that only the first ten records are written.

  • Code

    FlowBean (the generated setters and getters are omitted; a sketch follows after the class)

    import org.apache.hadoop.io.WritableComparable;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    /**
     * FlowBean implements WritableComparable so it can be used as a sortable key
     */
    public class FlowBean implements WritableComparable<FlowBean> {
        private long upFlow;
        private long downFlow;
        private long sumFlow;

        /**
         * compareTo sorts by total traffic in descending order
         */
        public int compareTo(FlowBean bean) {
            if(sumFlow > bean.getSumFlow()){
                return -1;
            }else if (sumFlow == bean.getSumFlow()){
                return 0;
            }else{
                return 1;
            }
        }
    
        public void write(DataOutput out) throws IOException {
            out.writeLong(upFlow);
            out.writeLong(downFlow);
            out.writeLong(sumFlow);
        }
    
        public void readFields(DataInput in) throws IOException {
            upFlow = in.readLong();
            downFlow = in.readLong();
            sumFlow = in.readLong();
        }
    
        @Override
        public String toString() {
            return upFlow+"\t"+downFlow+"\t"+sumFlow;
        }
    }
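
    The generated getters and setters are omitted above; a minimal sketch of them (inside the class), noting that FlowBean also keeps the implicit no-argument constructor that Hadoop needs when deserializing the bean:

        public long getUpFlow() { return upFlow; }
        public void setUpFlow(long upFlow) { this.upFlow = upFlow; }
        public long getDownFlow() { return downFlow; }
        public void setDownFlow(long downFlow) { this.downFlow = downFlow; }
        public long getSumFlow() { return sumFlow; }
        public void setSumFlow(long sumFlow) { this.sumFlow = sumFlow; }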
    

    mapper

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    /**
     * Output the sortable FlowBean directly as the key,
     * with the phone number as the value
     */
    public class TopMapper extends
            Mapper<LongWritable, Text,FlowBean, Text> {
        private FlowBean k_out = new FlowBean();
        private Text v_out = new Text();
        /**
         * map packs the traffic fields into a FlowBean and writes it out as the key,
         * with the phone number as the value;
         * because FlowBean implements compareTo, the shuffle sorts the records by total traffic
         */
        @Override
        protected void map(
                LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] line = value.toString().split("\t");
            v_out.set(line[0]);
            k_out.setUpFlow(Long.parseLong(line[1]));
            k_out.setDownFlow(Long.parseLong(line[2]));
            k_out.setSumFlow(Long.parseLong(line[3]));
            context.write(k_out,v_out);
        }
    }
    
    

    reducer

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    /**
     * The reducer keeps a member variable count and stops writing once ten records have been output.
     * If ties need to be handled differently, adjust where count is incremented relative to the loop and the if statement.
     */
    public class TopReducer extends
            Reducer<FlowBean, Text,Text,FlowBean> {
        private int count;
    
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            count = 0;
        }
    
        @Override
        protected void reduce(
                FlowBean key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // iterate over the values in case several phones share the same total traffic
            for (Text value : values) {
                if(count < 10){
                    context.write(value,key);
                    count++;
                }
            }
        }
    }
    

    driver setup omitted (a minimal sketch follows)
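
    The original notes omit the driver, so here is a minimal sketch. It assumes local paths like the other cases; the class name TopDriver and the output path are illustrative, not from the original:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    import java.io.IOException;

    /**
     * Minimal driver sketch for the top-10 job (class name and paths are illustrative)
     */
    public class TopDriver {
        public static void main(String[] args)
                throws IOException, InterruptedException, ClassNotFoundException {
            Job job = Job.getInstance(new Configuration());
            job.setJarByClass(TopDriver.class);

            job.setMapperClass(TopMapper.class);
            job.setReducerClass(TopReducer.class);

            // map output types (FlowBean, Text) differ from the final output types (Text, FlowBean)
            job.setMapOutputKeyClass(FlowBean.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(FlowBean.class);

            // a single reducer so the top 10 is global
            job.setNumReduceTasks(1);

            FileInputFormat.setInputPaths(job, new Path("e:/work/test/input/top10"));
            FileOutputFormat.setOutputPath(job, new Path("e:/work/test/output"));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }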

  • Run result:

    13509468723	7335	110349	117684
    13975057813	11058	48243	59301
    13568436656	3597	25635	29232
    13736230513	2481	24681	27162
    18390173782	9531	2412	11943
    13630577991	6960	690	7650
    15043685818	3659	3538	7197
    13992314666	3008	3720	6728
    15910133277	3156	2936	6092
    13560439638	918	4938	5856
    

    This matches the expectation; case complete.

Case 3: Computing Common Friends

  • Requirement

    • In the friendship data below, the part before the colon is a user and the part after it is that user's friend list. Friendship here is one-directional.

    • Input data

      E:\work\test\input\friends\friends.txt

      A:B,C,D,F,E,O
      B:A,C,E,K
      C:F,A,D,I
      D:A,E,F,L
      E:B,C,D,M,L
      F:A,B,C,D,E,O,M
      G:A,C,D,E,F
      H:A,C,D,E,O
      I:A,O
      J:B,O
      K:A,C,D
      L:D,E,F
      M:E,F,G
      O:A,H,I,J
      

      For every pair of users, find their common friends; the expected output is:

      A-B	E C 
      A-C	D F 
      A-D	E F 
      A-E	D B C 
      A-F	O B C D E 
      A-G	F E C D 
      A-H	E C D O 
      A-I	O 
      A-J	O B 
      A-K	D C 
      A-L	F E D 
      A-M	E F 
      B-C	A 
      B-D	A E 
      B-E	C 
      B-F	E A C 
      B-G	C E A 
      B-H	A E C 
      B-I	A 
      B-K	C A 
      B-L	E 
      B-M	E 
      B-O	A 
      C-D	A F 
      C-E	D 
      C-F	D A 
      C-G	D F A 
      C-H	D A 
      C-I	A 
      C-K	A D 
      C-L	D F 
      C-M	F 
      C-O	I A 
      D-E	L 
      D-F	A E 
      D-G	E A F 
      D-H	A E 
      D-I	A 
      D-K	A 
      D-L	E F 
      D-M	F E 
      D-O	A 
      E-F	D M C B 
      E-G	C D 
      E-H	C D 
      E-J	B 
      E-K	C D 
      E-L	D 
      F-G	D C A E 
      F-H	A D O E C 
      F-I	O A 
      F-J	B O 
      F-K	D C A 
      F-L	E D 
      F-M	E 
      F-O	A 
      G-H	D C E A 
      G-I	A 
      G-K	D A C 
      G-L	D F E 
      G-M	E F 
      G-O	A 
      H-I	O A 
      H-J	O 
      H-K	A C D 
      H-L	D E 
      H-M	E 
      H-O	A 
      I-J	O 
      I-K	A 
      I-O	A 
      K-L	D 
      K-O	A 
      L-M	E F
      
  • Approach

    • First split each input line and build friend→user k-v pairs, then merge them so that every friend is followed by all users who list it (job 1)
    • Then pair up, two at a time, the users that appear in each value list and write out (user pair, friend) (job 2 map)
    • Finally merge the friends of every pair (job 2 reduce); a short trace of the data flow is shown below
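    • For example, the line "I:A,O" makes job 1's mapper emit (A, I) and (O, I); job 1's reducer then produces a line like "L	D,E", because D and E are the only users who list L; job 2's mapper turns that line into (D-E, L), and job 2's reducer merges all such values per pair, giving the final line "D-E L".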
  • Code

    mapper1

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    /**
     * mapper1 splits each user:friends line
     * and emits k-v pairs in the form (friend, user)
     */
    public class Mapper1 extends
            Mapper<LongWritable, Text, Text, Text> {
        private Text k_out = new Text();
        private Text v_out = new Text();
    
        @Override
        protected void map(
                LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // split the line into the user and the friend list
            String[] line = value.toString().split(":");
            // split the friend list
            String[] friends = line[1].split(",");
            // the user becomes the value
            v_out.set(line[0]);
            for (String friend : friends) {
                // each friend becomes the key
                k_out.set(friend);
                context.write(k_out,v_out);
            }
        }
    }
    

    reducer1

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    /**
     * reducer1 merges the pairs from mapper1: for each friend it concatenates all users who list that friend
     */
    public class Reducer1 extends
            Reducer<Text, Text, Text, Text> {
        private Text v_out = new Text();
        @Override
        protected void reduce(
                Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String temp = "";
            // concatenate all users that correspond to this friend
            for (Text value : values) {
                temp += value.toString() + ",";
            }
            v_out.set(temp);
            // write out
            context.write(key,v_out);
        }
    }
    

    mapper2

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    import java.util.Arrays;
    
    /**
     * Reads job 1's output with the default TextInputFormat:
     * each line is "friend <tab> user1,user2,...", which the mapper splits on the tab itself.
     * This step pairs up, two at a time, all the users of each friend.
     */
    public class Mapper2 extends
            Mapper<LongWritable, Text, Text, Text> {
        private Text k_out = new Text();
        private Text v_out = new Text();
        @Override
        protected void map(
                LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // split the line into the friend and the user list
            String[] line = value.toString().split("\t");
            // the friend becomes the value
            v_out.set(line[0]);
            String[] users = line[1].split(",");
            // sort the users so that every pair is always emitted in the same order, never reversed
            Arrays.sort(users);
            for (int i = 0; i < users.length; i++) {
                for (int j = i+1; j < users.length; j++) {
                    // join the two users as the key
                    k_out.set(users[i]+"-"+users[j]);
                    context.write(k_out,v_out);
                }
            }
        }
    }
    

    reducer2

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    /**
     * reducer2 concatenates the common friends of each user pair and writes the whole line out as the key (the value is NullWritable)
     */
    public class Reducer2 extends
            Reducer<Text, Text, Text, NullWritable> {
        private Text k_out = new Text();
        @Override
        protected void reduce(
                Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String temp = key.toString() + " ";
            for (Text value : values) {
                temp += value.toString() + " ";
            }
            k_out.set(temp);
            context.write(k_out,NullWritable.get());
        }
    }
    

    driver

    *omitted; a minimal sketch follows
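
    The original notes omit this driver as well. A minimal sketch that simply runs the two jobs back to back with waitForCompletion instead of JobControl; the class name FriendsDriver, the intermediate path, and the output path are illustrative:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    import java.io.IOException;

    /**
     * Minimal driver sketch: job1 builds friend -> users lines, job2 pairs the users
     * (class name and paths are illustrative)
     */
    public class FriendsDriver {
        public static void main(String[] args)
                throws IOException, InterruptedException, ClassNotFoundException {
            Path input = new Path("e:/work/test/input/friends");
            Path middle = new Path("e:/work/test/friends_tmp");
            Path output = new Path("e:/work/test/friends_out");

            Job job1 = Job.getInstance(new Configuration());
            job1.setJarByClass(FriendsDriver.class);
            job1.setMapperClass(Mapper1.class);
            job1.setReducerClass(Reducer1.class);
            job1.setOutputKeyClass(Text.class);
            job1.setOutputValueClass(Text.class);
            FileInputFormat.setInputPaths(job1, input);
            FileOutputFormat.setOutputPath(job1, middle);

            // only start job2 if job1 succeeded
            if (!job1.waitForCompletion(true)) {
                System.exit(1);
            }

            Job job2 = Job.getInstance(new Configuration());
            job2.setJarByClass(FriendsDriver.class);
            job2.setMapperClass(Mapper2.class);
            job2.setReducerClass(Reducer2.class);
            // map output types differ from the final output types
            job2.setMapOutputKeyClass(Text.class);
            job2.setMapOutputValueClass(Text.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(NullWritable.class);
            FileInputFormat.setInputPaths(job2, middle);
            FileOutputFormat.setOutputPath(job2, output);

            System.exit(job2.waitForCompletion(true) ? 0 : 1);
        }
    }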

  • Output:

    The job writes one line per user pair followed by that pair's common friends, matching the expected output listed in the requirement above (the ordering of friends within a line may vary with shuffle order).

    This matches the expectation; case complete.
