Hadoop MapReduce基础实例

最新推荐文章于 2024-08-17 21:28:57 发布

Biggoose1125

最新推荐文章于 2024-08-17 21:28:57 发布

阅读量1.3k

点赞数

分类专栏：学习笔记文章标签：大数据 java mapreduce hadoop

本文链接：https://blog.csdn.net/Juuunn/article/details/105175680

版权

学习笔记专栏收录该内容

20 篇文章 3 订阅

订阅专栏

本文记录Hadoop学习过程中第一个MapReduce实例

作者为新手小白，只为记录学习&交流
如任何读者有任何正面建议，欢迎留言&私信，不胜感激！
2020年3月29日11:46:51

文章目录

本文记录Hadoop学习过程中第一个MapReduce实例
后记
- 至此完成WordCount基础统计实例，欢迎大家在留言处给予建议或指出不足，感谢！

问题描述

任务目标：根据user_login.txt的数据统计用户在2016年度每个自然日的总访问次数。
user_login.txt： 在这里插入图片描述
最后生成的文件查看应该有如下效果：

开发环境

OS： Win10
IDE： IntelliJ IDEA 2019.3.3
Hadoop版本： Hadoop-3.1.2

任务过程描述

在idea导入hadoop所依赖的jar包后咱们可开始编写第一个mapreduce程序。
mapreduce作为一个框架，编程人员只需编写核心的map，reduce和自定义job相关即可完成mapreduce的wordcount功能
需知： 由于代码中加入了注释所以不在文章中加入过多描述，有问题可私聊博主
开始！
定义自己的三个类：WcMapper，WcReducer，WcDriver
在这里插入图片描述
其中：
WcMapper继承Mapper类并覆写map方法
WcReducer继承Reducer类并覆写reduce方法

Mapper类

mapper类

package com.hadoop.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper that emits {@code (field, 1)} for each input line, where {@code field} is the
 * second comma-separated column of that line.
 *
 * <p>The input key is not used. Lines that do not contain at least two comma-separated
 * fields (for example blank lines) are skipped instead of failing the task with an
 * {@link ArrayIndexOutOfBoundsException}.
 */
public class WcMapper extends Mapper<LongWritable, Text,Text, IntWritable> {

    /** Zero-based index of the comma-separated column used as the output key. */
    private static final int KEY_COLUMN = 1;

    // Writable instances are reused across map() calls instead of allocating one per record.
    private final Text word = new Text();
    private final IntWritable one = new IntWritable(1);

    /**
     * Splits one line on commas and writes {@code (second field, 1)} to the context.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   one line of input text
     * @param context sink for the emitted key/value pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Take the whole line and break it into comma-separated fields.
        String[] fields = value.toString().split(",");

        // String.split drops trailing empty fields, so "a," also yields a single element.
        if (fields.length <= KEY_COLUMN) {
            return;
        }

        // Emit the selected column with a count of one.
        word.set(fields[KEY_COLUMN]);
        context.write(word, one);
    }
}

Reducer类

package com.hadoop.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;


import java.io.IOException;

/**
 * Reducer that adds up every count received for a key and emits {@code (key, sum)}.
 */
public class WcReducer extends Reducer<Text, IntWritable,Text,IntWritable> {

    // Output holder that is overwritten on every reduce() call.
    private IntWritable result = new IntWritable();

    /**
     * Sums the values grouped under {@code key} and writes the total to the context.
     *
     * @param key     the grouping key
     * @param values  the counts emitted for this key
     * @param context sink for the emitted key/total pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        result.set(sumOf(values));
        context.write(key, result);
    }

    /** Returns the sum of all counts in {@code counts}. */
    private static int sumOf(Iterable<IntWritable> counts) {
        int accumulated = 0;
        for (IntWritable count : counts) {
            accumulated = accumulated + count.get();
        }
        return accumulated;
    }
}

Driver类（本地模式）

Driver类可以针对处理本地的文件实现本地模式的访问，也可以对hdfs上的文件进行操作

package com.hadoop.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver that configures and submits the job built from {@code WcMapper} and
 * {@code WcReducer}, reading from and writing to local file-system paths.
 *
 * <p>Usage: {@code WcDriver [inputPath [outputPath]]}. Any path that is not supplied on
 * the command line falls back to the built-in default below.
 */
public class WcDriver {

    /** Input path used when no first argument is given. */
    private static final String DEFAULT_INPUT = "E:\\input\\user_login.txt";

    /** Output directory used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "E:\\output";

    /**
     * Builds the job, waits for it to finish and exits the JVM with status 0 on success
     * or 1 on failure.
     *
     * @param args optional input path ({@code args[0]}) and output path ({@code args[1]})
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Create the job object.
        Job job = Job.getInstance(new Configuration());
        // 2. Identify the jar by a class it contains.
        job.setJarByClass(WcDriver.class);

        // 3. Register the Mapper and Reducer.
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReducer.class);

        // 4. Declare the key/value types of the map output and of the final output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Resolve input and output paths: command-line arguments win over the defaults.
        Path input = new Path(args.length > 0 ? args[0] : DEFAULT_INPUT);
        Path output = new Path(args.length > 1 ? args[1] : DEFAULT_OUTPUT);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);

        // 6. Submit the job and block until it finishes, printing progress.
        boolean succeeded = job.waitForCompletion(true);
        // Exit code 0 on success, 1 on failure.
        System.exit(succeeded ? 0 : 1);
    }
}

注意： MapReduce会自动创建输出目录来存放结果文件和相关文件，因此output路径的最后一级目录在运行之前必须不存在（由程序运行时自动新建）；如果该目录已经存在，作业会抛出FileAlreadyExistsException而失败，需要先删除该目录或更换输出路径。

Driver类（集群模式）

package com.hadoop.MapReduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver that configures and submits the job built from {@code WcMapper} and
 * {@code WcReducer} against an HDFS cluster.
 *
 * <p>Usage: {@code WcDriver [inputPath [outputPath]]}. Any path that is not supplied on
 * the command line falls back to the built-in default below.
 *
 * <p>NOTE(review): this file declares package {@code com.hadoop.MapReduce.wordcount};
 * {@code WcMapper} and {@code WcReducer} must be in the same package or imported for this
 * class to compile.
 */
public class WcDriver {

    /** Default file system URI handed to the job configuration. */
    private static final String DEFAULT_FS = "hdfs://master:9000";

    /** Input path used when no first argument is given. */
    private static final String DEFAULT_INPUT = "hdfs://master:9000/user_login.txt";

    /** Output directory used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "hdfs://master:9000/output";

    /**
     * Builds the job, waits for it to finish and exits the JVM with status 0 on success
     * or 1 on failure.
     *
     * @param args optional input path ({@code args[0]}) and output path ({@code args[1]})
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // In cluster mode, set the user name the client uses to access the cluster.
        System.setProperty("HADOOP_USER_NAME", "root");

        // Set the default file system. The original author notes port 8020 for HA and
        // 9000 for non-HA setups, and that without this setting the input/output paths
        // must be written as full URIs including the port.
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", DEFAULT_FS);

        // 1. Create the job from the configuration prepared above, so that the
        //    fs.defaultFS setting actually reaches the job.
        Job job = Job.getInstance(configuration);
        // 2. Identify the jar by a class it contains.
        job.setJarByClass(WcDriver.class);

        // 3. Register the Mapper and Reducer.
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReducer.class);

        // 4. Declare the key/value types of the map output and of the final output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Resolve input and output paths: command-line arguments win over the defaults.
        Path input = new Path(args.length > 0 ? args[0] : DEFAULT_INPUT);
        Path output = new Path(args.length > 1 ? args[1] : DEFAULT_OUTPUT);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);

        // 6. Submit the job and block until it finishes, printing progress.
        boolean succeeded = job.waitForCompletion(true);
        // Exit code 0 on success, 1 on failure.
        System.exit(succeeded ? 0 : 1);
    }
}