大数据系列（三）以日志为例，利用hadoop原生hdfs和MapReduce分别进行统计

最新推荐文章于 2022-04-04 18:07:57 发布

codemperor

最新推荐文章于 2022-04-04 18:07:57 发布

阅读量525

点赞数

分类专栏：大数据文章标签：大数据 hadoop

本文链接：https://blog.csdn.net/zl592886931/article/details/89789507

版权

大数据专栏收录该内容

13 篇文章 2 订阅

订阅专栏

使用java操作hdfs的api

我们来写几个demo，测试一下api：

package zl.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Progressable;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;

public class HDFSApp {
    private Configuration configuration = null;
    private FileSystem fileSystem = null;

    /**
     * 配置
     *
     * @throws IOException
     * @throws URISyntaxException
     */
    @Before
    public void config() throws IOException, URISyntaxException {
        System.out.println("hdfs config before");
        configuration = new Configuration();
        //这里可以设置副本系数
        configuration.set("dfs.replication", "1");
        //这里针对外网链接测试，设置datanode的hostname通信
        configuration.set("dfs.client.use.datanode.hostname", "true");
        //端口号在 /hadoop/etc/hadoop/core-site.xml中配置
        URI uri = new URI("hdfs://47.96.94.82:9000");
        fileSystem = FileSystem.get(uri, configuration);
    }

    @Test
    public void mkdir() throws IOException {
        Path path = new Path("/hdfsapi/test");
        boolean result = fileSystem.mkdirs(path);
        System.out.println(result);
    }

    /**
     * 删除文件
     *
     * @throws IOException
     */
    @Test
    public void removeFile() throws IOException {
        Path path = new Path("/hdfsapi");
        //递归删除 -r
        boolean result = fileSystem.delete(path, true);
        System.out.println(result);
    }

    /**
     * 查看文件
     *
     * @throws IOException
     */
    @Test
    public void textFile() throws IOException {
        Path path = new Path("/hdfsapi/a.txt");
        FSDataInputStream fsDataInputStream = fileSystem.open(path);
        IOUtils.copyBytes(fsDataInputStream, System.out, 1024);
    }

    /**
     * 写入文件
     *
     * @throws Exception
     */
    @Test
    public void writeFile() throws Exception {
        Path path = new Path("/hdfsapi/a.txt");
        FSDataOutputStream out = fileSystem.create(path);
        out.writeUTF("Hello wahaha ");
        out.flush();
        out.close();
    }

    /**
     * 重命名
     *
     * @throws Exception
     */
    @Test
    public void renameFile() throws Exception {
        Path oldPath = new Path("/hdfsapi/a.txt");
        Path newPath = new Path("/hdfsapi/rename.txt");
        boolean result = fileSystem.rename(oldPath, newPath);
        System.out.println(result);
    }

    /**
     * 上传本地文件到hdfs，带进度条
     *
     * @throws Exception
     */
    @Test
    public void copyLocalFileToHadoop() throws Exception {

        //copy本地一个文件到hdfs，简单的
//        Path oldPath = new Path("/etc/demo/a.txt");
//        Path newPath = new Path("/hdfsapi/rename.txt");
//        fileSystem.copyFromLocalFile(oldPath, newPath);

        // 下面上传文件有进度条
        final int[] i = {0};
        FSDataOutputStream out = fileSystem.create(new Path("/hdfsapi/wuxi-app.zip"), new Progressable() {
            public void progress() {
                i[0]++;
                System.out.println("上传中: " + i[0]);
            }
        });

        InputStream in = new BufferedInputStream(new FileInputStream(new File("/Users/fuck/git/wuxi-app.zip")));

        IOUtils.copyBytes(in, out, 4096);
    }

    @After
    public void hdfsDown() {
        configuration = null;
        fileSystem = null;
    }
}

博主用的是阿里云，而博主一直在本地idea上进行单元测试，所以这里可能在读文件的时候报错：Could not obtain block，或者在写文件的时候报：could only be written to 0 of the 1 minReplication nodes

特别注意下面这行配置，因为博主环境搭建在阿里云，通信使用阿里云内网ip，而博主现在使用本地的idea跑测试，所以这样的情况下，是无法跑通的，在读取和写入文件时，会各种报错（这里坑了博主半天的时间排错）。最终博主发现，在写入或者读取的时候，namenode返回的是datanode的内网ip，你当然无法使用内网ip读取datanode的数据，所以加上下面这行代码，并且你要在你的client端（博主是mac电脑）hosts中配置节点信息，放上外网ip映射，这样，就可以跑通了（在生产环境当然不可能有外网ip啦），这里千万注意，本行配置虽然说的是hostname，但是你在本地hosts里面不是真的配置各个机器的hostname，而是你hadoop集群相互通信的域名，比如core-site.xml里面配置的"master",又比如workers里面配置的master、node1、node2（在hosts里面也配置了这三个相对应的ip），参考博主之前的搭建环境的文章。这里博主在本地hosts中设置为：
ip master
ip node1
ip node2

// Make the client reach datanodes by hostname (resolved via the local hosts file)
// instead of the internal IP that the namenode reports.
configuration.set("dfs.client.use.datanode.hostname", "true");

hdfs写入数据的流程

上面讲到外网链接hadoop会报错，这一个坑博主在网上找到了不少这样的踩坑记录，大部分都是云上搭建hadoop集群，然后本地链接上去，调用namenode，还可以实现（安全组把端口开出去），但是通过namenode来和datanode交互就变得极其困难，因为集群内，NN与DN都是内网ip通信，我们先来看一下写文件的过程：

client =>namenode=>client(block请求) =>namenode =>client(得知可写的block) =>datanode => datanode(s) =>client =>datanode
1.首先客户端请求namenode确立通信机制，然后NN返回给客户端，ok，没问题
2.客户端向NN发出写文件请求，NN获取Block以及需要写入的DN节点信息，返回给客户端
3.客户端找到相应的DN，建立通信机制（根据NN返回的DN信息，返回的是DN的内网IP，所以客户端如果在外网，通过内网IP去找DN，肯定是凉凉斯密达），然后返回给客户端，说OJBK了，来吧。
4.客户端传输数据

So，各位老爷们，一定记住啦。外网通信到内网，记得使用hadoop集群通信所配置的域名，如果做测试，加入（没必要在配置文件，在代码里临时加一下）：

// Test-only switch set in code: connect to datanodes by hostname rather than by internal IP.
configuration.set("dfs.client.use.datanode.hostname", "true");
然后你本地的hosts里面加入域名所对应的ip映射关系即可（如果还不明白，就直接加你hadoop服务器上hosts里面配置的域名与ip映射关系）

利用hdfs来实现词频统计

我们来写一段比较原始的，使用hdfs的api来统计wordcount（这里硬干，就不做任何封装了）：
这里有个文件wordcount，内容是：

test hadoop
hello hadoop
hi hdfs
hadoop hdfs
test yarn
hi yarn

我们开始统计每个单词出现的次数：

package codemperor.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;


/**
 * Word count done by hand against the HDFS client API.
 *
 * <p>Reads every file under {@code /hdfsapi/wordcount}, counts the
 * whitespace-separated words in memory, writes one {@code word:count} line per
 * word to {@code /hdfsapi/output/wordcount_result} as plain UTF-8 text, and
 * finally echoes that file to stdout.
 */
public class HDFSWordCount {

    /**
     * Runs the whole count against the hard-coded namenode.
     *
     * @param args unused
     * @throws Exception if the cluster cannot be reached or any read or write fails
     */
    public static void main(String[] args) throws Exception {
        /********** configuration ********/
        Configuration configuration = new Configuration();
        // Replication factor requested for files created by this client.
        configuration.set("dfs.replication", "1");
        // For runs from outside the cluster network: reach datanodes by hostname.
        configuration.set("dfs.client.use.datanode.hostname", "true");
        URI uri = new URI("hdfs://47.96.94.82:9000");

        try (FileSystem fs = FileSystem.get(uri, configuration)) {
            /********** in-memory counters ********/
            Map<String, Integer> counts = new HashMap<>();

            /********** read the input files ********/
            Path input = new Path("/hdfsapi/wordcount");
            RemoteIterator<LocatedFileStatus> files = fs.listFiles(input, true);
            while (files.hasNext()) {
                LocatedFileStatus file = files.next();
                // Closing the reader also closes the underlying HDFS stream.
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                        fs.open(file.getPath()), java.nio.charset.StandardCharsets.UTF_8))) {
                    countWords(reader, counts);
                }
            }

            /********** write the result ********/
            Path output = new Path("/hdfsapi/output/wordcount_result");
            try (FSDataOutputStream out = fs.create(output)) {
                for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                    String record = entry.getKey() + ":" + entry.getValue() + "\n";
                    // Raw UTF-8 bytes rather than writeUTF, which would prefix every
                    // record with a 2-byte length and break the text format.
                    out.write(record.getBytes(java.nio.charset.StandardCharsets.UTF_8));
                }
            }
            System.out.println("统计成功");

            /********** read the result file back ********/
            try (FSDataInputStream result = fs.open(output)) {
                IOUtils.copyBytes(result, System.out, 1024);
            }
        }
    }

    /**
     * Adds the words of every line read from {@code reader} to {@code counts}.
     * Words are separated by runs of whitespace; blank lines contribute nothing.
     *
     * @param reader source of text lines; not closed by this method
     * @param counts word-to-occurrences map, updated in place
     * @throws java.io.IOException if reading a line fails
     */
    private static void countWords(BufferedReader reader, Map<String, Integer> counts) throws java.io.IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            for (String word : line.trim().split("\\s+")) {
                // Splitting a blank line still yields one empty token.
                if (word.isEmpty()) {
                    continue;
                }
                Integer seen = counts.get(word);
                counts.put(word, seen == null ? 1 : seen + 1);
            }
        }
    }
}

使用MapReduce

使用MR来做wordcount

首先我们来了解一下数据到map，然后再到reduce的过程（我们以wordcount为例）：
1.提交数据后，数据经过splitting进行切分，切分成多个数据块（当然是分布式分散在不同节点啦）
2.数据块提交mapping，做KV处理，然后继续下一步
3.Mapping的数据经过Shuffling，将所有相同的key的数据放在同一个块上
4.Reducing将所有相同的key的数据进行整合计算，最终得到结果
那么可以简化为：

现在有hello、hello、hello、world、world、codemperor，经过map之后变为：
（hello，1）（hello，1）（hello，1）（world，1）（world，1）（codemperor，1）
map将数据输出到reduce端，按照相同的key，分发到同一个reduce上，数据变为：
reduce1：(hello,1) (hello,1) (hello,1) => （hello，[1,1,1]）
reduce1：(world,1) (world,1) => （world，[1,1]）
reduce1：(codemperor,1) => （codemperor，[1]）
然后计算后得到最终结果

我们先创建：WordCountMapper：

package codemperor.mapreduce.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map side of the MapReduce word count: emits {@code (word, 1)} for every
 * whitespace-separated word of an input line.
 *
 * <p>Type parameters of {@link Mapper}:
 * <ul>
 *   <li>KEYIN: offset of the line in the input, e.g. if the first line holds 20
 *       characters the first offset is 0 and the second is 20</li>
 *   <li>VALUEIN: the whole line</li>
 *   <li>KEYOUT: the output key; for word count, a single word</li>
 *   <li>VALUEOUT: the output value</li>
 * </ul>
 * The four types must be Hadoop's own serializable types, not plain Java types.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Count emitted for each word occurrence. */
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Output key, reused for every word instead of allocating a new {@link Text}
     * each time (same pattern as the WordCount mapper in the Hadoop tutorial).
     */
    private final Text word = new Text();

    /**
     * Splits one line on whitespace and writes {@code (word, 1)} per word.
     * Blank lines and repeated separators produce no output.
     *
     * @param key     offset of the line; unused
     * @param value   the line to tokenize
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String token : value.toString().trim().split("\\s+")) {
            // Splitting a blank line still yields one empty token; it is not a word.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, ONE);
        }
    }
}

然后创建：WordCountReduce：

package codemperor.mapreduce.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * Reduce side of the word count: adds up all counts received for one word
 * and emits {@code (word, total)}.
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums {@code values} and writes the total under the unchanged key.
     *
     * @param key     the word
     * @param values  the counts emitted for that word
     * @param context sink for the result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable partial : values) {
            total += partial.get();
        }

        IntWritable sum = new IntWritable(total);
        context.write(key, sum);
    }
}

最后创建WordCountDriver来执行map和reduce：

package codemperor.mapreduce.wc;

import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Configures the word count job (mapper, reducer, key/value types, paths),
 * submits it and prints whether it succeeded.
 */
@Slf4j
public class WordCountDriver {

    /**
     * Builds and runs the job, blocking until it finishes.
     *
     * @param args unused
     * @throws Exception if the job cannot be created, submitted or awaited
     */
    public static void main(String[] args) throws Exception {
        // Placeholder namenode address; both job paths below are built on top of it.
        final String nameNode = "hdfs://你的IP:9000";

        /********** configuration ********/
        Configuration conf = new Configuration();
        // For runs from outside the cluster network: reach datanodes by hostname.
        conf.set("dfs.client.use.datanode.hostname", "true");

        // Create the job.
        Job wordCount = Job.getInstance(conf, "WordCount");
        log.info("asdasd");
        wordCount.setJarByClass(WordCountDriver.class);

        // Map stage and its output types.
        wordCount.setMapperClass(WordCountMapper.class);
        wordCount.setMapOutputKeyClass(Text.class);
        wordCount.setMapOutputValueClass(IntWritable.class);

        // Reduce stage and its output types.
        wordCount.setReducerClass(WordCountReduce.class);
        wordCount.setOutputKeyClass(Text.class);
        wordCount.setOutputValueClass(IntWritable.class);

        // Input and output locations, given as full hdfs:// paths.
        Path source = new Path(nameNode + "/hdfsapi/wordcount");
        Path target = new Path(nameNode + "/wordcount/output");
        FileInputFormat.setInputPaths(wordCount, source);
        FileOutputFormat.setOutputPath(wordCount, target);

        boolean succeeded = wordCount.waitForCompletion(true);
        System.out.println(succeeded);
    }
}

最终查看一下文件结果：
在这里插入图片描述
结果是：

hadoop 3
hdfs 2
hello 1
hi 2
test 2
yarn 2

使用MR进行本地计算

根据上面代码可以得知，我们用java提交了一个作业，发给hadoop，hadoop根据作业指令开始计算，那么MapReduce必须要链接hdfs进行计算吗（Spark瞬间教你做人），当然不是，我们来稍微改一下代码，让它进行本地文件计算（当然是你机器的单节点）, 我们将WordCountDriver里面的：

 // Job input and output locations
FileInputFormat.setInputPaths(job, new Path(HDFSURI + "/hdfsapi/wordcount")); // note: the full hdfs:// path is used here
FileOutputFormat.setOutputPath(job, new Path(HDFSURI + "/wordcount/output"));// note: the full hdfs:// path is used here

修改为

// With relative paths like these the job runs locally and reads/writes local files
FileInputFormat.setInputPaths(job, new Path("hdfs-demo/input/wordcount"));
FileOutputFormat.setOutputPath(job, new Path("hdfs-demo/output/result"));

再来执行一下，发现mr计算的是你本地的文件，然后输出在本地的文件夹内。

使用MR来统计日志

我们先来看一段日志样例：
在这里插入图片描述

我们来统计一个简单的：每天请求的成功次数、失败次数
首先，先定义我们自定义bean（这里不使用自带的基本类型）：
这里需要注意：

我们类里面使用了三个Integer字段，因为要参与计数，所以都赋一个初始值0（write()里的writeInt会对字段拆箱，字段为null就会抛空指针）。博主一开始因为没有初始值，而在mapper里面也没有给所有字段初始化，导致reduce不执行，并且没有日志，所以提醒大家一点

package codemperor.mapreduce.accesslog.dailycount;

import lombok.Getter;
import lombok.Setter;
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Value type of the access-log job: counters for successful, failed and
 * unparseable requests. Serialized as three ints in a fixed order.
 */
@Getter
@Setter
public class DailyCountEntity implements Writable {
    /**
     * Number of successful requests.
     */
    private Integer success = 0;
    /**
     * Number of failed requests.
     */
    private Integer failed = 0;

    /**
     * Number of requests whose status could not be parsed.
     */
    private Integer undefined = 0;

    /**
     * Creates an entity with every counter at 0.
     */
    public DailyCountEntity() {
    }

    /**
     * Writes the three counters to the stream. Every field must be non-null
     * here because each one is unboxed for {@code writeInt}.
     *
     * @param out destination stream
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.success);
        out.writeInt(this.failed);
        out.writeInt(this.undefined);
    }

    /**
     * Reads the three counters back, in exactly the order {@link #write} emits them.
     *
     * @param in source stream
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        success = in.readInt();
        failed = in.readInt();
        undefined = in.readInt();
    }

    /**
     * Renders the counters as a JSON object with the keys
     * {@code success}, {@code failed} and {@code undefined}.
     */
    @Override
    public String toString() {
        return "{"
                + "\"success\":" + success
                + ",\"failed\":" + failed
                + ",\"undefined\":" + undefined
                + '}';
    }
}

继续上我们自定义mapper：

package codemperor.mapreduce.accesslog.dailycount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;

/**
 * Maps one access-log line to {@code (time bucket, DailyCountEntity)} where the
 * entity has exactly one counter set to 1: success for status 200, undefined
 * when the status cannot be parsed, failed otherwise.
 *
 * <p>Malformed lines never abort the task: {@link #dateFormat} falls back to the
 * raw line as key and {@link #getResult} falls back to {@code "0"}.
 */
public class DailyCountMapper extends Mapper<LongWritable, Text, Text, DailyCountEntity> {
    public Logger log = Logger.getLogger(DailyCountMapper.class);

    /**
     * Classifies one log line and emits it under its time bucket.
     *
     * @param key     offset of the line; unused
     * @param value   one access-log line
     * @param context sink for the emitted pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
        String val = value.toString();
        log.info("[map] value: " + val);
        String reduceKey = dateFormat(val);
        String methodCode = getResult(val).trim();

        DailyCountEntity dailyCountEntity = new DailyCountEntity();
        if ("0".equals(methodCode)) {
            dailyCountEntity.setUndefined(1);
        } else if ("200".equals(methodCode)) {
            dailyCountEntity.setSuccess(1);
        } else {
            dailyCountEntity.setFailed(1);
        }

        context.write(new Text(reduceKey), dailyCountEntity);
    }

    /**
     * Extracts the bracketed timestamp of a log line and reformats it as
     * {@code yyyy-MM-dd HH:00:00}, i.e. truncated to the hour.
     *
     * @param val one access-log line
     * @return the formatted bucket, or {@code val} itself when the line has no
     *         {@code [...]} section or the timestamp cannot be parsed
     */
    public String dateFormat(String val) {
        int open = val.indexOf('[');
        int close = val.indexOf(']', open + 1);
        // Without this guard substring() would throw and fail the whole task.
        if (open < 0 || close < 0) {
            log.error("[dateFormat] no [timestamp] section in: " + val);
            return val;
        }

        String date = val.substring(open + 1, close);
        SimpleDateFormat in = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss ZZZZZ", Locale.US);
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:00:00");
        try {
            String result = sdf.format(in.parse(date));
            log.info("[dateFormat] date: " + result);
            return result;
        } catch (ParseException e) {
            log.error("", e);
        }
        return val;
    }

    /**
     * Returns the status code of a log line: the first space-separated token
     * found two characters after the last double quote (the quote is followed
     * by one space, then the code).
     *
     * @param val one access-log line
     * @return the status code, or {@code "0"} when the line is null, has no
     *         double quote, or has nothing after it
     */
    public String getResult(String val) {
        // 27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
        if (val == null) {
            return "0";
        }
        int lastQuote = val.lastIndexOf('"');
        int codeStart = lastQuote + 2;
        if (lastQuote < 0 || codeStart >= val.length()) {
            return "0";
        }

        // split() returns an empty array when the remainder is only separators.
        String[] valArr = val.substring(codeStart).split(" ");
        if (valArr.length == 0 || valArr[0].isEmpty()) {
            return "0";
        }
        log.info("[getResult] method code: " + valArr[0]);
        return valArr[0];
    }
}

然后是我们自定义的reduce：

package codemperor.mapreduce.accesslog.dailycount;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce side of the access-log job: adds up the three counters of every
 * entity received for one key and emits a single entity holding the totals.
 */
public class DailyCountReduce extends Reducer<Text, DailyCountEntity, Text, DailyCountEntity> {

    /**
     * Sums the counters of {@code values} and writes the totals under the unchanged key.
     *
     * @param key     the time bucket
     * @param values  the entities emitted for that bucket
     * @param context sink for the result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<DailyCountEntity> values, Context context) throws IOException, InterruptedException {
        System.out.println("begin reduce");

        int okTotal = 0;
        int failTotal = 0;
        int unknownTotal = 0;
        for (DailyCountEntity each : values) {
            okTotal = okTotal + each.getSuccess();
            failTotal = failTotal + each.getFailed();
            unknownTotal = unknownTotal + each.getUndefined();
        }

        DailyCountEntity totals = new DailyCountEntity();
        totals.setSuccess(okTotal);
        totals.setFailed(failTotal);
        totals.setUndefined(unknownTotal);
        context.write(key, totals);
    }
}

最终我们编写自定义driver，来启动这次作业（这里博主就在本地执行了，如果需要走服务器，请参考上面的demo）：

package codemperor.mapreduce.accesslog.dailycount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;

/**
 * Configures the access-log counting job on local files, runs it and prints
 * whether it succeeded.
 */
public class DailyCountDriver {
    public static Logger log  = Logger.getLogger(DailyCountDriver.class);

    /**
     * Builds and runs the job, blocking until it finishes.
     *
     * @param args unused
     * @throws Exception if the job cannot be created, submitted or awaited
     */
    public static void main(String[] args) throws Exception {
        /********** configuration ********/
        Configuration conf = new Configuration();
        log.info("daily count go go go");

        // Create the job.
        Job dailyCount = Job.getInstance(conf, "daily count");
        dailyCount.setJarByClass(DailyCountDriver.class);

        // Map stage and its output types.
        dailyCount.setMapperClass(DailyCountMapper.class);
        dailyCount.setMapOutputKeyClass(Text.class);
        dailyCount.setMapOutputValueClass(DailyCountEntity.class);

        // Reduce stage and its output types.
        dailyCount.setReducerClass(DailyCountReduce.class);
        dailyCount.setOutputKeyClass(Text.class);
        dailyCount.setOutputValueClass(DailyCountEntity.class);

        // Relative paths: the job reads and writes local files.
        Path logFile = new Path("hdfs-demo/src/main/java/codemperor/mapreduce/accesslog/access_2013_05_30.log");
        Path resultDir = new Path("hdfs-demo/output/daily_count_result");
        FileInputFormat.setInputPaths(dailyCount, logFile);
        FileOutputFormat.setOutputPath(dailyCount, resultDir);

        boolean succeeded = dailyCount.waitForCompletion(true);
        System.out.println(succeeded);
    }
}

执行之后，我们可以看到最终结果：
在这里插入图片描述

扩展之使用NullWritable来忽略key

上面的结果看的好不爽啊，想要存到数据库，可能又要解析了，有没有一种办法，输出的结果直接就是一个json，这样读取之后解析炒鸡方便，当然可以啦，我们在DailyCountReduce中，稍微修改一下输出：

将reduce输出改为NullWritable
// Emit no key at all, so each output line consists only of the entity's toString() JSON.
context.write(NullWritable.get(), dailyCountEntity);

然后修改一下我们的bean：

    /**
     * Date of the record, carried inside the entity now that the reducer emits no key.
     * NOTE(review): the code that fills this field is not shown here.
     */
    private String date = "";
    /**
     * Number of successful requests.
     */
    private Integer success = 0;
    /**
     * Number of failed requests.
     */
    private Integer failed = 0;

    /**
     * Number of requests whose status could not be parsed.
     */
    private Integer undefined = 0;

	==记得readFields和write也要加哦。==

然后再跑一次，看到结果：
在这里插入图片描述
ok斯密达～

扩展之设置分区来写入不同文件

比如现在有个需求，要求我们成功调用的输出一个文件，失败调用的输出一个文件，剩下的输出到其他文件。那么如何做呢？这就需要我们来手动设置分区啦

创建一个DailyCountPartitioner

package codemperor.mapreduce.accesslog.dailycount;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Decides which reducer receives each map output record. Records are routed
 * by outcome rather than by key, so every outcome lands in its own reduce
 * output file. The type parameters are the map output key and value types.
 */
public class DailyCountPartitioner extends Partitioner<Text, DailyCountEntity> {

    /**
     * Picks bucket 0 for failed, 1 for successful and 2 for all other records,
     * then folds the bucket into the available partitions.
     *
     * @param text             map output key; not used for routing
     * @param dailyCountEntity map output value whose counters select the bucket
     * @param numPartitions    number of reducers configured for the job, which
     *                         is also the number of reduce output files
     * @return a partition index in {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(Text text, DailyCountEntity dailyCountEntity, int numPartitions) {
        final int bucket;
        if (dailyCountEntity.getFailed() == 1) {
            // failed requests share one file
            bucket = 0;
        } else if (dailyCountEntity.getSuccess() == 1) {
            // successful requests share one file
            bucket = 1;
        } else {
            // everything else goes to the remaining file
            bucket = 2;
        }

        // With 3 or more reducers this is the bucket itself. With fewer, the modulo
        // keeps the index in range instead of returning an invalid partition.
        return bucket % numPartitions;
    }
}

然后在我们的DailyCountDriver中加入配置：

// Register the custom partitioner and set the number of reducers (= partitions)
job.setPartitionerClass(DailyCountPartitioner.class);
job.setNumReduceTasks(3);

DailyCountDriver代码如下：

package codemperor.mapreduce.accesslog.dailycount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;

/**
 * Access-log counting job with a custom partitioner and three reducers, run
 * on local files. The reduce output has no key ({@code NullWritable}).
 */
public class DailyCountDriver {
    public static Logger log  = Logger.getLogger(DailyCountDriver.class);

    /**
     * Builds and runs the partitioned job, blocking until it finishes.
     *
     * @param args unused
     * @throws Exception if the job cannot be created, submitted or awaited
     */
    public static void main(String[] args) throws Exception {
        /********** configuration ********/
        Configuration settings = new Configuration();
        log.info("daily count go go go");

        // Create the job and wire in mapper and reducer.
        Job partitionedJob = Job.getInstance(settings, "daily count");
        partitionedJob.setJarByClass(DailyCountDriver.class);
        partitionedJob.setMapperClass(DailyCountMapper.class);
        partitionedJob.setReducerClass(DailyCountReduce.class);

        // Three reducers, fed by the custom partitioner.
        final int reducerCount = 3;
        partitionedJob.setPartitionerClass(DailyCountPartitioner.class);
        partitionedJob.setNumReduceTasks(reducerCount);

        // Map output types.
        partitionedJob.setMapOutputKeyClass(Text.class);
        partitionedJob.setMapOutputValueClass(DailyCountEntity.class);

        // Reduce output types.
        partitionedJob.setOutputKeyClass(NullWritable.class);
        partitionedJob.setOutputValueClass(DailyCountEntity.class);

        // Relative paths: the job reads and writes local files.
        final String inputLog = "hdfs-demo/src/main/java/codemperor/mapreduce/accesslog/access_2013_05_31.log";
        final String outputDir = "hdfs-demo/output/daily_count_result";
        FileInputFormat.setInputPaths(partitionedJob, new Path(inputLog));
        FileOutputFormat.setOutputPath(partitionedJob, new Path(outputDir));

        boolean finishedOk = partitionedJob.waitForCompletion(true);
        System.out.println(finishedOk);
    }
}

我们手动设置了3个分区，然后最终运行之后，我们可以看到有三份文件输出：
在这里插入图片描述
里面就是最终结果啦～～啦啦啦～～～