Reading through Hadoop MapReduce programs from scratch.
1) WordCount
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MyMapReducer {
/**
* Extending the Mapper framework class is essentially accepting a contract on the input and output types:
* input:  <LongWritable, Text>   (byte offset of the line, contents of the line)
* output: <Text, IntWritable>
* context is the framework-provided object that the map output is written to; we don't manage it ourselves.
* Under the Mapper framework, map() is called once for every input line.
*/
public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] words = line.split(" ");
for (String word : words) {
context.write(new Text(word), new IntWritable(1));
}
}
}
/**
* Extending the Reducer framework class is likewise a contract on the input and output types.
* After the map phase, the shuffle automatically groups all values that share the same key into one
* <key, value-list> pair, e.g. <a, (1, 1, 1, 1, 1)>.
* input:  <Text, IntWritable>
* output: <Text, Text>
* Under the Reducer framework, reduce() is called once for each distinct key.
*/
public static class MyReducer extends Reducer<Text, IntWritable, Text, Text> {
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count += value.get(); // get() converts the IntWritable to a plain Java int
}
context.write(key, new Text(count + "")); // appending "" turns the int into a String for the Text value
}
}
/**
* The unit that carries the whole processing logic is called a job.
* It specifies which class acts as the mapper and which as the reducer,
* then submits the job and runs it.
* @param args
*/
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MyMapReducer.class);
/**
* Configure the Mapper on the job
* and declare the map output key and value types.
*/
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
/**
* Configure the Reducer on the job
* and declare the final (reduce) output key and value types.
*/
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// location of the input data
FileInputFormat.setInputPaths(job, new Path("hdfs://localhost:9000/test"));
// location of the job output (the directory must not already exist)
FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/result"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
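The job above hard-codes the HDFS paths. Since GenericOptionsParser is already imported, a minimal sketch of reading the paths from the command line instead (assuming the input and output paths are passed as the two program arguments) could replace the two path lines in main():

String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
    System.err.println("Usage: MyMapReducer <input path> <output path>");
    System.exit(2);
}
FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));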
2) Lowest price per product
/**
* Compare three stores and pick the lowest price for each product.
* Overall approach: the Map part walks through every line of the three stores' data, so that after the
* shuffle all prices of the same product are grouped as <key, (value1, value2, ...)>.
* The Reduce part then walks through all prices of one product and keeps the minimum, repeating until
* every product has its min_price.
* From the Map part we get the Mapper type constraint <LongWritable, Text, Text, IntWritable>,
* and for the Reduce part the constraint <Text, IntWritable, Text, IntWritable>.
* One more reminder: in MapReduce, Text ~ String, IntWritable ~ int, LongWritable ~ long, NullWritable ~ null.
*/
package MapReduce;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MinPrice {
/**
* Build a Configuration, then Job.getInstance() gives us the job that carries the whole processing logic.
*/
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MinPrice.class);
job.setMapperClass(MyMapper.class);
// job.setMapOutputKeyClass(Text.class);
// job.setMapOutputValueClass(IntWritable.class);
/**
* The map output is <Text, IntWritable>. First guess: maybe the map output types don't need to be set at all.
* The guess is right: the map output type settings can be omitted here, because the types passed to
* setOutputKeyClass/setOutputValueClass also apply to the map output unless overridden, and in this job
* the two are the same.
* Second guess: can the reduce output type settings be omitted as well?
* That guess is wrong: the reduce (final) output types must be set.
*/
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
/**
* Input and output paths.
*/
FileInputFormat.addInputPath(job, new Path("hdfs://localhost:9000/input"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));
System.out.println(job.waitForCompletion(true) ? 0 : 1);
}
public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private IntWritable price = new IntWritable();
private Text good = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] strs = value.toString().split(" "); // split the line on " " into id and price
good.set(strs[0]); // set() converts the String to Text
price.set(Integer.parseInt(strs[1])); // parse the String and store it as an IntWritable
context.write(good, price);
}
}
public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
// The reduce input is <key, values>: one key can have many values, so they arrive as an Iterable<IntWritable> that we can loop over with a variable of the same type
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int min = Integer.MAX_VALUE;
for (IntWritable val : values) {
if (min > val.get()) { // get() converts the IntWritable to an int; the other wrapper types work the same way
min = val.get();
}
}
context.write(key, new IntWritable(min)); // emit the new key/value pair
}
}
}
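For reference, the rule behind the two guesses above: the types passed to setOutputKeyClass/setOutputValueClass describe the map output as well unless they are overridden, so explicit map output settings are only needed when the two differ, as in the WordCount job earlier where the map emits IntWritable values but the reduce emits Text. A minimal sketch of that case, reusing the job object from main():

// needed because the map output value type (IntWritable) differs from the final one (Text)
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// final (reduce) output types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);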
3) Deduplicating content
/**
* Overall approach: Mapper constraint <LongWritable, Text, Text, NullWritable>,
* Reducer constraint <Text, NullWritable, Text, NullWritable>.
* Emitting every token as a key with a NullWritable value lets the shuffle collapse the duplicates for us.
*/
package MapReduce;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class QuChong {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(QuChong.class);
// configure the Mapper on the job
job.setMapperClass(MyMapper.class);
// configure the Reducer on the job
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// input and output paths
FileInputFormat.setInputPaths(job, new Path("hdfs://localhost:9000/input"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));
System.out.println(job.waitForCompletion(true) ? 0 : 1);
}
public static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString()); // splits on whitespace by default; split() would work just as well
while (itr.hasMoreTokens()) {
// word.set(itr.nextToken());
context.write(new Text(itr.nextToken()), NullWritable.get());
}
}
}
public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
public void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
}
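For example, if the token spark appears on several input lines, the map emits <spark, null> once per occurrence, the shuffle groups them into a single <spark, (null, null, ...)> pair, and the reduce writes spark exactly once, so every distinct token appears once in the output.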
4) Sorting integers
/**
* Overall idea: after the map emits its key/value pairs, the shuffle runs automatically and, by default,
* sorts the keys in ascending order.
* Mapper constraint: <LongWritable, Text, IntWritable, NullWritable>
* Reducer constraint: <IntWritable, NullWritable, IntWritable, NullWritable>
*/
package MapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
public class IntSort {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IntSort.class);
/**
* Configure the Mapper
*/
job.setMapperClass(MyMapper.class);
/**
* Configure the Combiner and the Reducer.
* The reducer also works as a combiner here: it writes the key once per value, so duplicates survive.
*/
job.setCombinerClass(MyReducer.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path("hdfs://localhost:9000/input"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));
System.out.println(job.waitForCompletion(true) ? 0 : 1);
}
public static class MyMapper extends Mapper<LongWritable, Text, IntWritable, NullWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer str = new StringTokenizer(value.toString());
while (str.hasMoreTokens()) {
context.write(new IntWritable(Integer.parseInt(str.nextToken())), NullWritable.get());
}
}
}
public static class MyReducer extends Reducer<IntWritable, NullWritable, IntWritable, NullWritable> {
public void reduce(IntWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
/**
* The same integer can appear more than once, so we write the key once for every value in values.
* A quick recap (only once): during the shuffle, all pairs with the same key are grouped into one
* <key, value-list> pair. Since the map emits NullWritable values, two 10s in the input become
* <10, (null, null)>, and writing the key once per value keeps both occurrences.
* Because the values are empty, the final output is simply the integers in sorted order.
*/
for (NullWritable val : values) {
context.write(key, NullWritable.get());
}
}
}
}
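The shuffle sorts the IntWritable keys in ascending order by default, as noted above. If a descending order were wanted instead, one common approach (a sketch, not part of the original program) is to register a raw comparator that inverts IntWritable's default ordering, e.g. an extra nested class in IntSort plus one line in main():

// hypothetical addition: flips the default ascending key order to descending
public static class DescendingIntComparator extends IntWritable.Comparator {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return -super.compare(b1, s1, l1, b2, s2, l2);
    }
}
// in main(), before waitForCompletion:
// job.setSortComparatorClass(DescendingIntComparator.class);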
5) Sorting objects
Good.java
package MapReduce.ObjectSort;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Good implements WritableComparable<Good> {
private String id;
private int visitCount;
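// a public no-arg constructor is required so Hadoop can create the object during deserialization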
public Good() {
}
public Good(String id, int visitCount) {
this.id = id;
this.visitCount = visitCount;
}
// overridden so the key is printed as "id\tvisitCount" in the job output
public String toString() {
return id + "\t" + visitCount;
}
public String getId() {
return id;
}
public int getVisitCount() {return visitCount;}
/**
* When the Good (object) type is used as a Map/Reduce key type, it needs serialization and deserialization:
* serialization turns the in-memory object into bytes, deserialization turns the bytes back into an object,
* and both are needed while the data is transferred between nodes.
* Hadoop defines its own serialization framework (Writable) for this.
*
* @param out
*/
// serialization
public void write(DataOutput out) throws IOException {
out.writeUTF(id); // note: String fields are serialized with writeUTF
out.writeInt(visitCount); // int fields with writeInt
}
// deserialization: read the fields back in the same order they were written
public void readFields(DataInput in) throws IOException {
id = in.readUTF();
visitCount = in.readInt();
}
// this is the current object, another the one being compared against: sort by visitCount in descending order, break ties by id
public int compareTo(Good another) {
if (this.visitCount > another.visitCount) return -1;
if (this.visitCount < another.visitCount) return 1;
return this.id.compareTo(another.id);
}
}
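One extra point worth noting: when a custom type is the map output key and the job runs with more than one reducer, the default HashPartitioner chooses the target reducer by hashCode(). A minimal sketch of hashCode()/equals() overrides consistent with the fields above (an addition for illustration, not part of the original class):

@Override
public int hashCode() {
    return id.hashCode(); // records with the same id always land in the same reducer under HashPartitioner
}
@Override
public boolean equals(Object obj) {
    if (!(obj instanceof Good)) return false;
    Good other = (Good) obj;
    return id.equals(other.id) && visitCount == other.visitCount;
}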
ObjectSort.java
/**
* Sample input:
* id0 100
* id1 40
* id2 90
* id3 36
* Overall idea: treat the whole Good object (id + visit count) as the key and NullWritable as the value,
* so the shuffle sorts the records for us.
* Mapper constraint:  <LongWritable, Text, Good, NullWritable>
* Reducer constraint: <Good, NullWritable, Good, NullWritable>
* To sort objects of a class, the class has to implement the WritableComparable interface
* and provide three methods:
* void write(DataOutput out)    // serialization: object -> bytes
* void readFields(DataInput in) // deserialization: bytes -> object
* int compareTo(Good another)   // defines the sort order
*/
package MapReduce.ObjectSort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class ObjectSort {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ObjectSort.class);
// configure the Mapper
job.setMapperClass(MyMapper.class);
// configure the Reducer
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Good.class);
job.setOutputValueClass(NullWritable.class);
// input and output paths
FileInputFormat.addInputPath(job,new Path("hdfs://localhost:9000/input"));
FileOutputFormat.setOutputPath(job,new Path("hdfs://localhost:9000/output"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
/**
* The shuffle sorts the Good keys automatically, using the write/readFields/compareTo methods
* implemented in the Good class.
*/
public static class MyMapper extends Mapper<LongWritable, Text, Good, NullWritable> {
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] strs = value.toString().split(" ");
Good good = new Good(strs[0], Integer.parseInt(strs[1]));
context.write(good, NullWritable.get());
}
}
public static class MyReducer extends Reducer<Good, NullWritable, Good, NullWritable> {
protected void reduce(Good key,Iterable<NullWritable>values,Context context) throws IOException, InterruptedException {
context.write(key,NullWritable.get());
}
}
}
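With the sample input from the class comment (id0 100, id1 40, id2 90, id3 36), compareTo() puts larger visit counts first, so the expected output is id0 100, id2 90, id1 40 and id3 36, one per line, formatted by Good.toString().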