I. Create a Maven project and declare the JAR dependencies in pom.xml
pom.xml
<dependencies>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.7</version>
    </dependency>
</dependencies>
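Two notes on these dependencies: hadoop-client 2.7.7 already pulls in hadoop-common and hadoop-hdfs transitively, so the two explicit entries are redundant, though harmless while the versions match. Also, the log4j.properties below is a Log4j 1.x configuration (org.apache.log4j.*); it is read via the Log4j 1.2 that Hadoop itself ships transitively, not by the log4j-core 2.x artifact declared here.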
log4j.properties
log4j.rootLogger=INFO, stdout, logFile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logFile=org.apache.log4j.FileAppender
log4j.appender.logFile.File=target/spring.log
log4j.appender.logFile.layout=org.apache.log4j.PatternLayout
log4j.appender.logFile.layout.ConversionPattern=%d %p [%c] - %m%n
II. Usage steps
1. WordCount example
WordMapper
package com.hdfs.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text k = new Text();
    private final IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split each input line on spaces and emit (word, 1) for every word
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}
WordReducer
package com.hdfs.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the counts emitted for this word by the mappers (and the combiner)
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}
WordDriver
package com.hdfs.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Hard-coded local paths for a quick run from the IDE
        String[] args1 = new String[2];
        args1[0] = "d:/hello.txt";
        args1[1] = "d:/output10";
        Configuration conf = new Configuration();
        // Enable compression of the map-side output
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
        // 1. Get the Job instance
        Job job = Job.getInstance(conf);
        // 2. Set the JAR location
        job.setJarByClass(WordDriver.class);
        // 3. Wire up the Mapper and Reducer classes
        job.setMapperClass(WordMapper.class);
        job.setReducerClass(WordReducer.class);
        // 4. Declare the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Declare the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the Combiner; the Reducer can be reused directly when its logic is the same
        job.setCombinerClass(WordReducer.class);
        // 7. Compress the reduce-side output
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        // 8. Set the input and output paths (the output directory must not exist yet)
        FileInputFormat.setInputPaths(job, new Path(args1[0]));
        FileOutputFormat.setOutputPath(job, new Path(args1[1]));
        // 9. Submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
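For reference, with a made-up d:/hello.txt such as

hello world
hello hadoop

the job should produce bzip2-compressed part files under d:/output10 whose decompressed content is the tab-separated counts:

hadoop	1
hello	2
world	1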
2. Serialization and custom partitioner example
FlowBean
package com.hdfs.mr;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class FlowBean implements Writable {
    private long upload;
    private long download;
    private long sum;

    // A no-arg constructor is required so Hadoop can instantiate the bean via reflection
    public FlowBean() {
        super();
    }

    public FlowBean(long upload, long download) {
        super();
        this.upload = upload;
        this.download = download;
        this.sum = upload + download;
    }

    @Override
    public String toString() {
        return upload + "\t" + download + "\t" + sum;
    }

    // Serialization: the field order here must match readFields() exactly
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upload);
        dataOutput.writeLong(download);
        dataOutput.writeLong(sum);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.upload = dataInput.readLong();
        this.download = dataInput.readLong();
        this.sum = dataInput.readLong();
    }

    public long getUpload() {
        return upload;
    }

    public void setUpload(long upload) {
        this.upload = upload;
    }

    public long getDownload() {
        return download;
    }

    public void setDownload(long download) {
        this.download = download;
    }

    public long getSum() {
        return sum;
    }

    public void setSum(long a, long b) {
        this.sum = a + b;
    }
}
FlowMapper
package com.hdfs.mr;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    Text k = new Text();
    FlowBean flowBean = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Expected input columns (tab-separated): phone number, upload, download
        String line = value.toString();
        String[] fields = line.split("\t");
        flowBean.setUpload(Long.parseLong(fields[1]));
        flowBean.setDownload(Long.parseLong(fields[2]));
        flowBean.setSum(Long.parseLong(fields[1]), Long.parseLong(fields[2]));
        k.set(fields[0]);
        context.write(k, flowBean);
    }
}
FlowReducer
package com.hdfs.mr;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    FlowBean flowBean = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        // Accumulate upload/download totals across all records for this phone number
        long sumUp = 0;
        long sumDown = 0;
        for (FlowBean ff : values) {
            sumUp += ff.getUpload();
            sumDown += ff.getDownload();
        }
        flowBean.setUpload(sumUp);
        flowBean.setDownload(sumDown);
        flowBean.setSum(sumUp, sumDown);
        context.write(key, flowBean);
    }
}
FlowDriver
package com.hdfs.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Hard-coded local paths for a quick run from the IDE
        String[] args1 = new String[2];
        args1[0] = "d:/phone.txt";
        args1[1] = "d:/output4";
        Configuration conf = new Configuration();
        // 1. Get the Job instance
        Job job = Job.getInstance(conf);
        // 2. Set the JAR location
        job.setJarByClass(FlowDriver.class);
        // 3. Wire up the Mapper and Reducer classes
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        // 4. Declare the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // 5. Declare the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // 6. Optional: custom partitioner (leave unset to use the default hash partitioner)
        // job.setPartitionerClass(CustomPartitioner.class);
        // job.setNumReduceTasks(2);
        // 7. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args1[0]));
        FileOutputFormat.setOutputPath(job, new Path(args1[1]));
        // 8. Submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
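FlowMapper splits each line on tabs and reads the phone number, upload and download from the first three columns, so d:/phone.txt is expected to look like the following (made-up numbers, tab-separated):

13800000001	100	200
13912345678	300	400
13800000001	50	60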
CustomPartitioner
package com.hdfs.shuffle;
import com.hdfs.mr.FlowBean;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class CustomPartitioner extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
        // First three digits of the phone number
        String phoneNo = text.toString().substring(0, 3);
        // Partition 0 for the 138 prefix, partition 1 for everything else
        int partition = 1;
        if ("138".equals(phoneNo)) {
            partition = 0;
        }
        return partition;
    }
}
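To activate this partitioner, uncomment job.setPartitionerClass(CustomPartitioner.class) and job.setNumReduceTasks(2) in FlowDriver above. The reduce-task count must cover every partition number the partitioner can return (here 0 and 1); with fewer reducers the job fails with an "Illegal partition" error, except that a single reducer bypasses the partitioner entirely.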
3. Map Join example
1 MapJoinMapper
package com.hdfs.mapjoin;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    HashMap<String, String> pdMap = new HashMap<>();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        FileSystem fs = null;
        try {
            fs = FileSystem.get(new URI("hdfs://s201:9000"), conf, "root");
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
        // Cache the small table (product id -> product name) in memory
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();
        System.out.println("path:=============" + path);
        FSDataInputStream fin = fs.open(new Path(path));
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fin, "UTF-8"));
        // If the cached file were on the local file system, a FileInputStream would do instead:
        // BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        String line;
        // isNotEmpty also returns false for the null that readLine() yields at EOF
        while (StringUtils.isNotEmpty(line = bufferedReader.readLine())) {
            String[] fields = line.split(" ");
            pdMap.put(fields[0], fields[1]);
        }
        IOUtils.closeStream(fin);
        IOUtils.closeStream(bufferedReader);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split(" ");
        // Look up the product name by product id (the join key)
        String pid = fields[1];
        String pname = pdMap.get(pid);
        System.out.println(pid + "pid==================pname" + pname);
        String line = fields[0] + "\t" + pname + "\t" + fields[2];
        k.set(line);
        context.write(k, NullWritable.get());
    }
}
2 MapJoinDriver
package com.hdfs.mapjoin;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class MapJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        String[] args1 = new String[2];
        // With fs.defaultFS set below, these are HDFS paths, not Linux file-system paths
        args1[0] = "/root/order.txt";
        args1[1] = "/output9";
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://s201:9000");
        // 1. Get the Job instance
        Job job = Job.getInstance(conf);
        // 2. Set the JAR location
        job.setJarByClass(MapJoinDriver.class);
        // 3. Wire up the Mapper (a map-side join needs no Reducer)
        job.setMapperClass(MapJoinMapper.class);
        // job.setReducerClass(OrderCompareReducer.class);
        // 4. Mapper output types default to the final output types in a map-only job
        // job.setMapOutputKeyClass(Text.class);
        // job.setMapOutputValueClass(NullWritable.class);
        // 5. Declare the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. Optional: custom partitioner (needed for sorting within partitions;
        //    for a single global sort, do not set one)
        // job.setPartitionerClass(CustomTowPartitioner.class);
        // job.setNumReduceTasks(2);
        // 7. Optional: grouping comparator class
        // job.setGroupingComparatorClass(OrderGroupingComparator.class);
        // 8. Distribute the small table as a cache file
        job.addCacheFile(new URI("hdfs://s201:9000/pd.txt"));
        // The commented variant below would be a Linux file-system path:
        // job.addCacheFile(new Path("/root/pd.txt").toUri());
        // Zero reduce tasks makes this a map-only job
        job.setNumReduceTasks(0);
        // 9. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args1[0]));
        FileOutputFormat.setOutputPath(job, new Path(args1[1]));
        // 10. Submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3 Data files
order.txt
1001 1 1
1002 2 2
1003 3 3
1004 1 4
1005 2 5
1006 3 6
pd.txt
1 小米
2 华为
3 格力
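Since each order line is matched against the in-memory pdMap, the map-only job should write one joined line per order, in the form order id, product name, amount:

1001	小米	1
1002	华为	2
1003	格力	3
1004	小米	4
1005	华为	5
1006	格力	6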
4. Grouping comparator (secondary sort)
OrderGroupingComparator
package com.hdfs.shuffle;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class OrderGroupingComparator extends WritableComparator {
    public OrderGroupingComparator() {
        // Register the key class; true tells the parent to create instances for comparison
        super(OrderCompareBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Group by order_id only, so all records of one order reach the
        // same reduce() call regardless of the rest of the key
        OrderCompareBean aBean = (OrderCompareBean) a;
        OrderCompareBean bBean = (OrderCompareBean) b;
        int result;
        if (aBean.getOrder_id() > bBean.getOrder_id()) {
            result = 1;
        } else if (aBean.getOrder_id() < bBean.getOrder_id()) {
            result = -1;
        } else {
            result = 0;
        }
        return result;
    }
}
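OrderCompareBean itself is not listed in this section. For orientation, here is a minimal sketch of what such a key could look like, assuming it carries an order id and a price and sorts by order id ascending, then price descending (the usual secondary-sort layout). Everything beyond getOrder_id(), which the comparator above relies on, is an assumption:

package com.hdfs.shuffle;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Hypothetical sketch: only getOrder_id() is confirmed by the comparator above;
// the price field and the chosen sort order are assumptions for illustration.
public class OrderCompareBean implements WritableComparable<OrderCompareBean> {
    private long order_id;
    private double price;

    // No-arg constructor required for reflection-based deserialization
    public OrderCompareBean() {
    }

    public OrderCompareBean(long order_id, double price) {
        this.order_id = order_id;
        this.price = price;
    }

    public long getOrder_id() {
        return order_id;
    }

    public double getPrice() {
        return price;
    }

    // Primary sort: order_id ascending; secondary sort: price descending
    @Override
    public int compareTo(OrderCompareBean o) {
        int cmp = Long.compare(this.order_id, o.order_id);
        if (cmp == 0) {
            cmp = -Double.compare(this.price, o.price);
        }
        return cmp;
    }

    // Field order in write() must match readFields() exactly
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(order_id);
        out.writeDouble(price);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readLong();
        this.price = in.readDouble();
    }

    @Override
    public String toString() {
        return order_id + "\t" + price;
    }
}

With a key like this, the grouping comparator makes all records of one order arrive in a single reduce() call, and under the assumed sort the first value of that call would be the order's highest-priced record.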