2022-07-13 MapReduce Applications

I. Table Joins (Join)

  1. Reduce Join
    (1) Drawback: prone to data skew. The map side only tags each record with its source, so all of the join work lands on the Reducers, and a hot key overloads a single Reducer. When skew shows up, partition more carefully according to the actual business data (see the Partitioner sketch at the end of this example).
    (2) Case:
                Scenario: order.txt stores a number of order records and product.txt stores product information; the task is to join the records of the two files.

    # Fields in every file are tab-separated
    
    # Contents of order.txt
    orderId	pid	amount
    1001	01	2
    
    # Contents of product.txt
    pid	pname
    01	小米
    
    # Result of joining order.txt and product.txt
    orderId	pname	amount
    1001	小米	2
    

                The implementation is as follows.
                Maven dependencies:

        <dependencies>
            <!-- JUnit test dependency -->
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.12</version>
            </dependency>
            <!-- logging dependency -->
            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-slf4j-impl</artifactId>
                <version>2.12.0</version>
            </dependency>
            <!-- Hadoop client dependency -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>3.1.3</version>
            </dependency>
        </dependencies>
    
    

                Create the logging configuration under resources:

    <!-- File name: log4j2.xml -->
    <?xml version="1.0" encoding="UTF-8"?>
    <Configuration status="error" strict="true" name="XMLConfig">
        <Appenders>
            <!-- Console appender; the name attribute is required -->
            <Appender type="Console" name="STDOUT">
                <!-- PatternLayout; output looks like:
                [INFO] [2018-01-22 17:34:01][org.test.Console]I'm here -->
                <Layout type="PatternLayout"
                        pattern="[%p] [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
            </Appender>
    
        </Appenders>
    
        <Loggers>
            <!-- additivity disabled for this logger -->
            <Logger name="test" level="info" additivity="false">
                <AppenderRef ref="STDOUT" />
            </Logger>
    
            <!-- root logger configuration -->
            <Root level="info">
                <AppenderRef ref="STDOUT" />
            </Root>
        </Loggers>
    </Configuration>
    
    

                The ComplexItem wrapper class:

    import org.apache.hadoop.io.Writable;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    
    /**
     * Wraps the fields of order.txt and product.txt in a single Writable object.
     */
    public class ComplexItem implements Writable {
        // which source file this object's data came from
        private String from = "";
    
        private Integer orderId = 0;
        private String pid = "";
        private String pname = "";
        private Integer amount = 0;
    
        // no-arg constructor, required so Hadoop can instantiate the Writable via reflection
        public ComplexItem() {
        }
    
        public Integer getOrderId() {
            return orderId;
        }
    
        public String getPid() {
            return pid;
        }
    
        public String getPname() {
            return pname;
        }
    
        public Integer getAmount() {
            return amount;
        }
    
        /**
         * Populates the fields coming from order.txt and marks this object as an order record.
         * @param orderId orderId
         * @param pid pid
         * @param amount amount
         */
        public void setOrderFields(Integer orderId, String pid, Integer amount){
            this.from = "order";
            this.orderId = orderId;
            this.pid = pid;
            this.amount = amount;
        }
    
        /**
         * Populates the fields coming from product.txt and marks this object as a product record.
         * @param pid pid
         * @param pname pname
         */
        public void setProductFields(String pid, String pname){
            this.from = "product";
            this.pid = pid;
            this.pname = pname;
        }
    
        /**
         * Tells whether this object's data came from order.txt.
         * @return true if it came from order.txt, false otherwise
         */
        public boolean isFromOrder(){
            return "order".equalsIgnoreCase(from);
        }
    
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(this.from);
            out.writeInt(this.orderId);
            out.writeUTF(this.pid);
            out.writeUTF(this.pname);
            out.writeInt(this.amount);
        }
    
        @Override
        public void readFields(DataInput in) throws IOException {
            this.from = in.readUTF();
            this.orderId = in.readInt();
            this.pid = in.readUTF();
            this.pname = in.readUTF();
            this.amount = in.readInt();
        }
    
        @Override
        public String toString() {
            return this.orderId + "\t" + this.pname + "\t" + this.amount;
        }
    }
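
    Because the no-arg constructor and the field-for-field symmetry between write() and readFields() are easy to get wrong, a quick JUnit round-trip test (a sketch that relies only on the junit dependency declared above) can verify the Writable without running a job:

    import org.junit.Assert;
    import org.junit.Test;
    
    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    
    public class ComplexItemTest {
        @Test
        public void writeAndReadFieldsAreSymmetric() throws IOException {
            ComplexItem original = new ComplexItem();
            original.setOrderFields(1001, "01", 2);
    
            // serialize the same way Hadoop would
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            original.write(new DataOutputStream(bytes));
    
            // deserialize into a fresh instance created through the no-arg constructor
            ComplexItem copy = new ComplexItem();
            copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    
            Assert.assertTrue(copy.isFromOrder());
            Assert.assertEquals(Integer.valueOf(1001), copy.getOrderId());
            Assert.assertEquals("01", copy.getPid());
            Assert.assertEquals(Integer.valueOf(2), copy.getAmount());
        }
    }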
    

    Mapper

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;
    
    import java.io.IOException;
    
    public class ComplexItemMapper extends Mapper<Object, Text, Text, ComplexItem> {
        // name of the file that this Mapper's input split belongs to
        private String from;
    
        private Text keyOut = new Text();
        private ComplexItem valueOut = new ComplexItem();
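        // keyOut and valueOut are reused for every map() call; context.write()
        // serializes them immediately, so a single instance per task is safe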
    
    
        /**
         * Runs only once per split, before the first call to map().
         * @param context the task context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // get the input split handled by this task
            InputSplit split = context.getInputSplit();
            // get the name of the file the split belongs to
            from = ((FileSplit)split).getPath().getName();
        }
    
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (from.contains("order")){
                valueOut.setOrderFields(Integer.parseInt(fields[0]), fields[1], Integer.parseInt(fields[2]));
            }else {
                valueOut.setProductFields(fields[0], fields[1]);
            }
            keyOut.set(valueOut.getPid());
            context.write(keyOut, valueOut);
        }
    }
    

    Reducer

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    public class ComplexItemReducer extends Reducer<Text, ComplexItem, ComplexItem, NullWritable> {
        // caches the order.txt records of the current key group
        private List<ComplexItem> orderList = new ArrayList<>();
    
        private NullWritable valueOut = NullWritable.get();
    
        @Override
        protected void reduce(Text key, Iterable<ComplexItem> values, Context context) throws IOException, InterruptedException {
            ComplexItem product = new ComplexItem();
            for (ComplexItem value : values) {
                if (value.isFromOrder()){
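                    // the framework reuses the ComplexItem instance behind the values iterator,
                    // so copy its fields into a fresh object before caching it in orderList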
                    ComplexItem order = new ComplexItem();
                    order.setOrderFields(value.getOrderId(), value.getPid(), value.getAmount());
                    orderList.add(order);
                }else {
                    product.setProductFields(value.getPid(), value.getPname());
                }
            }
    
            for (ComplexItem order : orderList) {
                order.setProductFields(product.getPid(), product.getPname());
                context.write(order, valueOut);
            }
    
            orderList.clear();
        }
    }
    

    Driver

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    import java.util.Arrays;
    
    public class ComplexItemDriver {
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(ComplexItemDriver.class);
            job.setMapperClass(ComplexItemMapper.class);
            job.setReducerClass(ComplexItemReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(ComplexItem.class);
            job.setOutputKeyClass(ComplexItem.class);
            job.setOutputValueClass(NullWritable.class);
            FileInputFormat.setInputPaths(job, Arrays.stream(args).limit(args.length - 1).map(Path::new).toArray(Path[]::new));
            FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
    
    # Run option 1: run directly from IDEA
    # Run option 2: submit the jar to the cluster
    hadoop jar xxx.jar xxx.xxx.ComplexItemDriver input-path output-path
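
    If the join key is skewed (the drawback noted at the top of this example), one remedy is a custom Partitioner that gives known hot keys a Reducer of their own. The class below is only a sketch: "01" stands in for a hypothetical hot pid, and the routing rule would have to be adapted to the real key distribution.

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Partitioner;
    
    public class SkewAwarePartitioner extends Partitioner<Text, ComplexItem> {
        @Override
        public int getPartition(Text key, ComplexItem value, int numPartitions) {
            if (numPartitions == 1) {
                return 0;
            }
            // send the (hypothetical) hot pid "01" to the last reducer, all by itself
            if ("01".equals(key.toString())) {
                return numPartitions - 1;
            }
            // spread every other key over the remaining reducers
            return (key.hashCode() & Integer.MAX_VALUE) % (numPartitions - 1);
        }
    }
    
    It would be registered in the Driver with job.setPartitionerClass(SkewAwarePartitioner.class) together with job.setNumReduceTasks(...) greater than 1.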
    
  2. Map Join
    (1) When to use it: one table is very small and the other is very large. The small table is shipped to every MapTask through the cache-file mechanism and loaded into memory, so the join is completed on the map side; with no shuffle and no Reducer, the data skew of the Reduce Join is avoided.
    (2) Case:
                Scenario: same as the Reduce Join case.
                Maven dependencies:

        <dependencies>
            <!-- JUnit test dependency -->
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.12</version>
            </dependency>
            <!-- logging dependency -->
            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-slf4j-impl</artifactId>
                <version>2.12.0</version>
            </dependency>
            <!-- Hadoop client dependency -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>3.1.3</version>
            </dependency>
        </dependencies>
    
    

                Create the logging configuration under resources:

    <!-- File name: log4j2.xml -->
    <?xml version="1.0" encoding="UTF-8"?>
    <Configuration status="error" strict="true" name="XMLConfig">
        <Appenders>
            <!-- Console appender; the name attribute is required -->
            <Appender type="Console" name="STDOUT">
                <!-- PatternLayout; output looks like:
                [INFO] [2018-01-22 17:34:01][org.test.Console]I'm here -->
                <Layout type="PatternLayout"
                        pattern="[%p] [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
            </Appender>
    
        </Appenders>
    
        <Loggers>
            <!-- additivity disabled for this logger -->
            <Logger name="test" level="info" additivity="false">
                <AppenderRef ref="STDOUT" />
            </Logger>
    
            <!-- root logger configuration -->
            <Root level="info">
                <AppenderRef ref="STDOUT" />
            </Root>
        </Loggers>
    </Configuration>
    
    

                The OrderDetail wrapper class:

    /**
     * Wraps the joined fields of order.txt and product.txt. Because this job has no
     * Reducer (and therefore no shuffle), the class does not need to implement
     * Writable; the output format only calls toString().
     */
    public class OrderDetail {
        private Integer orderId = 0;
        private String pname = "";
        private Integer amount = 0;
    
        public OrderDetail() {
        }
    
        public OrderDetail(Integer orderId, String pname, Integer amount) {
            this.orderId = orderId;
            this.pname = pname;
            this.amount = amount;
        }
    
        public Integer getOrderId() {
            return orderId;
        }
    
        public void setOrderId(Integer orderId) {
            this.orderId = orderId;
        }
    
        public String getPname() {
            return pname;
        }
    
        public void setPname(String pname) {
            this.pname = pname;
        }
    
        public Integer getAmount() {
            return amount;
        }
    
        public void setAmount(Integer amount) {
            this.amount = amount;
        }
    
        public void setAll(Integer orderId, String pname, Integer amount){
            setOrderId(orderId);
            setPname(pname);
            setAmount(amount);
        }
    
        @Override
        public String toString() {
            return orderId + "\t" + pname + "\t" + amount;
        }
    }
    

                Mapper

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.URI;
    import java.nio.charset.StandardCharsets;
    import java.util.HashMap;
    import java.util.Map;
    
    public class OrderDetailMapper extends Mapper<Object, Text, OrderDetail, NullWritable> {
        // maps pid -> pname, loaded from the cached product file
        private Map<String, String> productMapper = new HashMap<>();
    
        private OrderDetail keyOut = new OrderDetail();
        private NullWritable valueOut = NullWritable.get();
    
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // get the files registered in the distributed cache
            URI[] cacheFiles = context.getCacheFiles();
            if (cacheFiles.length > 0){
                URI productURI = cacheFiles[0];
                FileSystem fs = FileSystem.get(context.getConfiguration());
                BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(productURI)), StandardCharsets.UTF_8));
                // read product.txt line by line and fill the pid -> pname map
                String line = null;
                while ((line = reader.readLine()) != null){
                    String[] fields = line.split("\t");
                    productMapper.put(fields[0], fields[1]);
                }
                // release resources
                IOUtils.closeStreams(reader, fs);
            }
        }
    
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
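            // if a pid is missing from the cached product table, get() returns null and
            // the output pname becomes the string "null"; add a guard here if that matters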
            keyOut.setAll(Integer.parseInt(fields[0]), productMapper.get(fields[1]), Integer.parseInt(fields[2]));
            context.write(keyOut, valueOut);
        }
    }
    

    Driver

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    import java.net.URI;
    import java.util.Arrays;
    
    public class OrderDetailDriver {
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(OrderDetailDriver.class);
        // register the product table as a cache file so every MapTask can load it into memory;
        // file:/// points at the local file system, use hdfs:// for a file on the cluster
            job.addCacheFile(URI.create("file:///D:/moudle/hadoop_results/mapred/in/reducejoin/pd.txt"));
            job.setMapperClass(OrderDetailMapper.class);
        // no Reducer: map output goes straight to the output format
            job.setNumReduceTasks(0);
            job.setMapOutputKeyClass(OrderDetail.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputKeyClass(OrderDetail.class);
            job.setOutputValueClass(NullWritable.class);
            FileInputFormat.setInputPaths(job, Arrays.stream(args).limit(args.length -1).map(Path::new).toArray(Path[]::new));
            FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
    
        }
    }
    

II. Counters

Case: count the total number of times the map method is called.
Solution: add the following code inside the map method.

// context.getCounter(groupName, counterName).increment(delta)
// the counter values show up in the job's log output
context.getCounter("mapjoin", "mapCount").increment(1);

III. ETL Cleaning

No Reducer stage is needed.

  1. Case: a log file in which every line consists of many fields separated by whitespace (tabs or spaces); the job must filter the file and keep only the records with more than 11 fields.

  2. The code is as follows.
    Maven dependencies:

        <dependencies>
            <!-- JUnit test dependency -->
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.12</version>
            </dependency>
            <!-- logging dependency -->
            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-slf4j-impl</artifactId>
                <version>2.12.0</version>
            </dependency>
            <!-- Hadoop client dependency -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>3.1.3</version>
            </dependency>
        </dependencies>
    
    

    Create the logging configuration under resources:

    <!-- File name: log4j2.xml -->
    <?xml version="1.0" encoding="UTF-8"?>
    <Configuration status="error" strict="true" name="XMLConfig">
        <Appenders>
            <!-- Console appender; the name attribute is required -->
            <Appender type="Console" name="STDOUT">
                <!-- PatternLayout; output looks like:
                [INFO] [2018-01-22 17:34:01][org.test.Console]I'm here -->
                <Layout type="PatternLayout"
                        pattern="[%p] [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
            </Appender>
    
        </Appenders>
    
        <Loggers>
            <!-- additivity disabled for this logger -->
            <Logger name="test" level="info" additivity="false">
                <AppenderRef ref="STDOUT" />
            </Logger>
    
            <!-- root logger configuration -->
            <Root level="info">
                <AppenderRef ref="STDOUT" />
            </Root>
        </Loggers>
    </Configuration>
    
    

    Mapper

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    import java.io.IOException;
    
    public class ETLMapper extends Mapper<Object, Text, Text, NullWritable> {
        private Text keyOut = new Text();
        private NullWritable valueOut = NullWritable.get();
    
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // split on runs of whitespace (tab or space); a bare "\\s" would produce empty fields
            String[] fields = value.toString().split("\\s+");
            if (fields.length > 11){
                keyOut.set(value);
                context.write(keyOut, valueOut);
            }
        }
    }
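
    A small extension (a sketch; the "etl", "pass" and "fail" names are made up) combines the filter with the counters from Section II, so the job log reports how many lines were kept and how many were dropped:

        // ETLMapper.map() with pass/fail counters added
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\\s+");
            if (fields.length > 11) {
                context.getCounter("etl", "pass").increment(1);
                keyOut.set(value);
                context.write(keyOut, valueOut);
            } else {
                context.getCounter("etl", "fail").increment(1);
            }
        }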
    

    Driver

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import java.io.IOException;
    import java.util.Arrays;
    
    public class ETLDriver {
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(ETLDriver.class);
            job.setMapperClass(ETLMapper.class);
            job.setNumReduceTasks(0);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            FileInputFormat.setInputPaths(job, Arrays.stream(args).limit(args.length -1).map(Path::new).toArray(Path[]::new));
            FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
    