MapReduce Knowledge Points (4)

OutputFormat

Custom OutputFormat

  1. Extend FileOutputFormat and override getRecordWriter

  2. Define a custom RecordWriter and override its write and close methods

  3. The MyOutputFormat class (a more portable variant is sketched after this list)

    package com.bigdata.mapreduce.outputformat;
    
    import org.apache.hadoop.fs.FSDataOutputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    
    public class MyOutputFormat extends FileOutputFormat<Text, NullWritable> {
    
        @Override
        public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
            // 1. Open the two target file output streams (one per review category)
            FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());
            FSDataOutputStream goodoutput = fileSystem.create(new Path("file:///D:/mapreduce_demo/good_comments/goods.txt"));
            FSDataOutputStream badoutput = fileSystem.create(new Path("file:///D:/mapreduce_demo/bad_comments/bads.txt"));
    
            // 2. Hand both streams to MyRecordWriter
            MyRecordWriter myRecordWriter = new MyRecordWriter(goodoutput, badoutput);
            return myRecordWriter;
        }
    }
    
    
  4. The MyRecordWriter class

    package com.bigdata.mapreduce.outputformat;
    
    import org.apache.hadoop.fs.FSDataOutputStream;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    
    
    import java.io.IOException;
    
    public class MyRecordWriter extends RecordWriter<Text, NullWritable> {
    
        private FSDataOutputStream goodoutput;
        private FSDataOutputStream badoutput;
    
        public MyRecordWriter(FSDataOutputStream goodoutput, FSDataOutputStream badoutput) {
            this.goodoutput = goodoutput;
            this.badoutput = badoutput;
        }
    
        /**
         *
         * @param text one line of input text
         * @param nullWritable placeholder value
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
            // 1. Grab the field at index 9 of the raw record: the review rating
            String[] split = text.toString().split("\t");
            String numStr = split[9];
            // 2. Route the record to a different file depending on the rating (good/neutral vs. bad)
            if (Integer.parseInt(numStr) <= 1){
                // good or neutral review
                goodoutput.write(text.toString().getBytes());
                goodoutput.write("\r\n".getBytes());
            }else{
                // bad review
                badoutput.write(text.toString().getBytes());
                badoutput.write("\r\n".getBytes());
            }
    
        }
    
        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
            goodoutput.close();
            badoutput.close();
    
        }
    
    }
    
    
  5. mapper

    package com.bigdata.mapreduce.outputformat;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    public class outputmapper extends
            Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Pass each input line through unchanged; MyRecordWriter does the routing at write time
            context.write(value, NullWritable.get());
        }
    }
    
    
  6. jobmain

    package com.bigdata.mapreduce.outputformat;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class outputmain extends Configured implements Tool {
        @Override
        public int run(String[] strings) throws Exception {
            Job job = Job.getInstance(super.getConf(), "outputmain");
            // Input format and input path
            job.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(job,new Path("file:///"));
            // Mapper and the output key/value types
            job.setMapperClass(outputmapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            // Use the custom OutputFormat
            job.setOutputFormatClass(MyOutputFormat.class);
            // Still required: FileOutputFormat writes job bookkeeping (e.g. the _SUCCESS marker) here
            MyOutputFormat.setOutputPath(job,new Path("file:///D:/mapreduce_demo/out"));
            boolean flag = job.waitForCompletion(true);
            return flag ? 0 : 1;
        }
    
        public static void main(String[] args) throws Exception {
            Configuration entries = new Configuration();
            int run = ToolRunner.run(entries, new outputmain(), args);
            System.exit(run);
        }
    }
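
The getRecordWriter above hard-codes two absolute local paths, so every task writes to the same files and the job is tied to one machine. As a minimal sketch of a more portable variant (my own illustration, not part of the original notes; the class name, subdirectory layout, and per-task file naming are assumptions), the two streams can instead be derived from the directory the driver passes to setOutputPath:

    package com.bigdata.mapreduce.outputformat;
    
    import org.apache.hadoop.fs.FSDataOutputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    
    // Sketch: same idea as MyOutputFormat, but file locations come from the job's output directory
    public class PortableOutputFormat extends FileOutputFormat<Text, NullWritable> {
    
        @Override
        public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
            // The directory the driver passed to setOutputPath
            Path outDir = FileOutputFormat.getOutputPath(context);
            FileSystem fs = outDir.getFileSystem(context.getConfiguration());
            // Hypothetical layout: one subdirectory per category, one file per task to avoid collisions
            String task = context.getTaskAttemptID().getTaskID().toString();
            FSDataOutputStream good = fs.create(new Path(outDir, "good_comments/" + task));
            FSDataOutputStream bad = fs.create(new Path(outDir, "bad_comments/" + task));
            return new MyRecordWriter(good, bad);
        }
    }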
    
    

Grouping

Requirement: find the Top-1 (highest-amount) purchase in each order.

order001    pat01    222.8
order001    pat05    25.8
order002    pat03    522.8
order002    pat04    112.4
order002    pat05    722.4
order003    pat01    222.8
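
Given this input, and with the descending sort plus grouping built in the steps below, the expected Top-1 output is the full highest-amount line of each order:

order001    pat01    222.8
order002    pat05    722.4
order003    pat01    222.8
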
  1. Define an orderBean with two member fields, the order ID and the purchase amount; implement WritableComparable and override compareTo so that records with the same order ID sort by amount in descending order

    package com.bigdata.mapreduce.group;
    
    import org.apache.hadoop.io.WritableComparable;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    public class orderBean implements WritableComparable<orderBean> {
    
        private String pid;    // order ID
        private Double price;  // purchase amount
    
        @Override
        public int compareTo(orderBean o) {
            // Compare order IDs; if they match, sort by amount in descending order
            int i = this.pid.compareTo(o.pid);
            if(i == 0){
                i = o.price.compareTo(this.price);
            }
            return i;
        }
    
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeUTF(pid);
            dataOutput.writeDouble(price);
        }
    
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            this.pid = dataInput.readUTF();
            this.price = dataInput.readDouble();
        }
    
        public String getPid() {
            return pid;
        }
    
        public void setPid(String pid) {
            this.pid = pid;
        }
    
        public Double getPrice() {
            return price;
        }
    
        public void setPrice(Double price) {
            this.price = price;
        }
    
        @Override
        public String toString() {
            return  pid + '\t'  + price ;
        }
    }
    
    
  2. Define the mapper class: wrap the order ID and amount into an orderBean as K2, and pass the original line (V1) through as V2

    package com.bigdata.mapreduce.group;
    
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    public class orderMapper extends
            Mapper<LongWritable, Text,orderBean,Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split the line of text into fields
            String[] split = value.toString().split("\t");
            // Create an orderBean and set the order ID and amount
            orderBean orderBean = new orderBean();
            orderBean.setPid(split[0]);
            orderBean.setPrice(Double.valueOf(split[2]));
            context.write(orderBean,value);
        }
    }
    
    
  3. Create a partitioner class that extends Partitioner and overrides getPartition, so records with the same order ID go to the same partition

    package com.bigdata.mapreduce.group;
    
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;
    
    import java.util.HashMap;
    
    public class orderPartitions extends
            Partitioner<orderBean, Text> {
    //    private HashMap<String,Integer> hmap = new HashMap<>();
    //    private int num = 0;
        @Override
        public int getPartition(orderBean orderBean, Text text, int i) {
            // Option 1 (commented out): number each order ID as it is first seen.
            // Caveat: every map task keeps its own HashMap, so the numbering is not consistent across tasks.
    //        String pid = orderBean.getPid();
    //        if(!hmap.containsKey(pid)){
    //            hmap.put(pid,num);
    //            num++;
    //        }
    //        return hmap.get(pid);
            // Option 2: hash the order ID, mask off the sign bit (2147483647 == Integer.MAX_VALUE), then mod by the reducer count
            return (orderBean.getPid().hashCode() & 2147483647) % i;
        }
    }
    
    
  4. Create a grouping class that extends WritableComparator, calls the parent's parameterized constructor, and overrides the compare method (a sanity-check sketch follows after this list)

    package com.bigdata.mapreduce.group;
    
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;
    
    /**
     * 1. Extend WritableComparator
     * 2. Call the parent's parameterized constructor
     * 3. Specify the grouping rule (override compare)
     */
    
    public class MyOrder extends WritableComparator {
        // No-arg constructor that calls the parent's parameterized constructor
        public MyOrder() {
            // Pass in orderBean's class; the second argument says whether to create orderBean instances
            super(orderBean.class,true);
        }
    
        // Specify the grouping rule
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            // Cast the parameters to orderBean
            orderBean first = (orderBean)a;
            orderBean second = (orderBean)b;
    
            // Group by order ID: objects with the same order ID fall into the same group
            return first.getPid().compareTo(second.getPid());
        }
    }
    
    
  5. The Reducer class iterates over each group and emits the first n records (n = 1 here)

    package com.bigdata.mapreduce.group;
    
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    
    public class orderReducer extends
            Reducer<orderBean, Text,Text, NullWritable> {
        @Override
        protected void reduce(orderBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Values in each group arrive sorted by amount (descending), so the first record is the Top-1
            int i = 0;
            for (Text value : values){
                context.write(value,NullWritable.get());
                i++;
                if(i >= 1){
                    break;
                }
            }
        }
    }
    
    
  6. jobmain

    package com.bigdata.mapreduce.group;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class orderjobmain extends Configured implements Tool {
        @Override
        public int run(String[] strings) throws Exception {
            // Create the Job object
            Job job = Job.getInstance(super.getConf(), "orderjob");
            // Configure the job
            // Input format class and input path
            job.setInputFormatClass(TextInputFormat.class);
            TextInputFormat.addInputPath(job,new Path("file:///D:\\mapreduce_demo\\input\\groupdata.txt"));
            // Mapper class and the K2/V2 types
            job.setMapperClass(orderMapper.class);
            job.setMapOutputKeyClass(orderBean.class);
            job.setMapOutputValueClass(Text.class);
    
            // Partitioner class
            job.setPartitionerClass(orderPartitions.class);
            
    //        job.setNumReduceTasks(3); // if unset, the default single reducer puts all output in one file
            // Grouping class
            job.setGroupingComparatorClass(MyOrder.class);
            // Reducer class and the K3/V3 types
            job.setReducerClass(orderReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            // Output format class and output path
            job.setOutputFormatClass(TextOutputFormat.class);
            TextOutputFormat.setOutputPath(job,new Path("file:///D:\\mapreduce_demo\\orderoutput"));
            // Wait for the job to finish
            boolean flag = job.waitForCompletion(true);
            return flag ? 0 : 1;
        }
    
        public static void main(String[] args) throws Exception {
            Configuration entries = new Configuration();
            int run = ToolRunner.run(entries, new orderjobmain(), args);
            System.exit(run);
        }
    }
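
To make the sort/group/partition contract concrete, here is a minimal local sanity check (my own sketch, not part of the original notes; the class name GroupCheck is hypothetical). compareTo orders records of the same order by amount, MyOrder.compare treats them as one reduce group, and the masked hash keeps the partition index non-negative:

    package com.bigdata.mapreduce.group;
    
    public class GroupCheck {
        public static void main(String[] args) {
            orderBean a = new orderBean();
            a.setPid("order001");
            a.setPrice(222.8);
            orderBean b = new orderBean();
            b.setPid("order001");
            b.setPrice(25.8);
    
            // Sort comparison: same order ID, so the higher amount sorts first (prints a negative number)
            System.out.println(a.compareTo(b));
    
            // Group comparison: only the order ID is compared, so 0 means "same reduce group"
            System.out.println(new MyOrder().compare(a, b));
    
            // Partitioning: masking the sign bit keeps the index in [0, numReduceTasks)
            System.out.println(("order001".hashCode() & 2147483647) % 3);
        }
    }

Running it prints a negative number, 0, and a partition index between 0 and 2, which is exactly what the shuffle relies on to deliver each order's records, highest amount first, to a single reduce call.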
    
    

