I. Basic environment:
This article assumes you already have some Java experience; that Java, Maven, and an IDE are installed on your machine with the relevant environment variables configured; that a usable Hadoop environment is available; that you have created a Java Maven project in IDEA; and that you have a Linux client machine from which you can run hadoop commands.
If any of the above is missing, look up setup instructions online (e.g. on Baidu) first.
II. Dependencies in pom.xml:
Add the following inside the <dependencies> element:
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.3</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.7.3</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.3</version>
</dependency>
III. Prepare and upload the file to count
1. Create a file named word_test.txt:
I have searched a thousand years,And I have cried a thousand tears。
I found everything I need,You are everything to me。
2. Upload it to Hadoop
First transfer it to the Linux client with rz, then run the commands below to put it on HDFS:
hadoop fs -mkdir /tmp/mr_test/
hadoop fs -put ./word_test.txt /tmp/mr_test/
IV. The code (official WordCount V1)
package com.yixin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCount {

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        // Split each input line into whitespace-delimited tokens and emit (word, 1) for each.
        public void map(Object key, Text value, Context context
                        ) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable result = new IntWritable();

        // Sum all the 1s emitted for a given word and write (word, total).
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
                           ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        // The reducer doubles as a combiner, so partial sums are computed map-side.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
V. Package, upload, run
1. Package the code into a jar with your IDE or with Maven (e.g. mvn clean package).
2. Upload the jar to the Linux client with rz.
3. Run the job:
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount /tmp/mr_test/word_test.txt /tmp/mr_test/output
4. View the result; each output line is a token followed by its count:
hadoop fs -cat /tmp/mr_test/output/*
VI. Official WordCount V2
The main differences from V1: the Mapper gains a setup(Context context) method, and two configuration options (wordcount.case.sensitive and wordcount.skip.patterns) control case handling and which characters to skip. The details are noted in the code comments.
In fact both map and reduce tasks have setup(Context context) and cleanup(Context context) hooks; the former is for initialization work, the latter for cleanup. They run before the first and after the last map or reduce call of a task, so use them as your business logic requires. A small sketch of the hooks follows.
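A minimal sketch of the lifecycle hooks (my own illustration, not part of the V2 code; LifecycleMapper is a made-up name, and the same imports as the V2 code below are assumed):

public static class LifecycleMapper
        extends Mapper<Object, Text, Text, IntWritable> {

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Runs once per task, before the first map() call:
        // read configuration, open connections, load caches, etc.
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Runs once per task, after the last map() call:
        // flush buffers, close connections, release resources.
    }
}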
1. The V2 code
package com.yixin;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

public class WordCount2 {

    /**
     * Fields and helper methods used by map and reduce can be declared here.
     * Note: each map or reduce task normally works on its own copy, so changes
     * made in one task are not visible to others, unless you use global
     * counters or the distributed cache.
     */
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        /**
         * Fields and helper methods used by map can be declared here.
         * The same caveat applies: each map task has its own copy.
         */
        // A global counter: it is shared across map tasks, and its increments
        // are aggregated by the framework.
        static enum CountersEnum { INPUT_WORDS }

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        private boolean caseSensitive;
        private Set<String> patternsToSkip = new HashSet<String>();

        private Configuration conf;
        private BufferedReader fis;

        /**
         * setup() can do initialization work before map() runs, such as
         * preparing fields, setting up database connections, or filtering
         * input paths. Here it reads the case-sensitivity switch and loads
         * the skip-pattern rules from the distributed cache.
         */
        @Override
        public void setup(Context context) throws IOException,
                InterruptedException {
            conf = context.getConfiguration();
            caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
            if (conf.getBoolean("wordcount.skip.patterns", true)) {
                URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
                // The null check covers runs without -skip, where no cache files exist.
                if (patternsURIs != null) {
                    for (URI patternsURI : patternsURIs) {
                        Path patternsPath = new Path(patternsURI.getPath());
                        String patternsFileName = patternsPath.getName().toString();
                        parseSkipFile(patternsFileName);
                    }
                }
            }
        }

        // Read the cached pattern file line by line; each line is one regex.
        private void parseSkipFile(String fileName) {
            try {
                fis = new BufferedReader(new FileReader(fileName));
                String pattern = null;
                while ((pattern = fis.readLine()) != null) {
                    patternsToSkip.add(pattern);
                }
            } catch (IOException ioe) {
                System.err.println("Caught exception while parsing the cached file: "
                        + StringUtils.stringifyException(ioe));
            }
        }

        @Override
        public void map(Object key, Text value, Context context
                        ) throws IOException, InterruptedException {
            // Lower-case the line unless case sensitivity is requested, then
            // strip every skip pattern before tokenizing.
            String line = (caseSensitive) ?
                    value.toString() : value.toString().toLowerCase();
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, "");
            }
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
                Counter counter = context.getCounter(CountersEnum.class.getName(),
                        CountersEnum.INPUT_WORDS.toString());
                counter.increment(1);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
                           ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionParser.getRemainingArgs();
        // Valid invocations have 2 args (<in> <out>) or 4 (<in> <out> -skip <file>).
        if (remainingArgs.length != 2 && remainingArgs.length != 4) {
            System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < remainingArgs.length; ++i) {
            if ("-skip".equals(remainingArgs[i])) {
                job.addCacheFile(new Path(remainingArgs[++i]).toUri());
                job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
            } else {
                otherArgs.add(remainingArgs[i]);
            }
        }
        FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
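A side note on the global counter: after the job finishes, the driver can read back the aggregated value. A hypothetical variation of the end of main() (my own sketch, not part of the official example):

boolean ok = job.waitForCompletion(true);
// INPUT_WORDS was incremented once per emitted word across all map tasks.
long words = job.getCounters()
        .findCounter(TokenizerMapper.CountersEnum.class.getName(),
                TokenizerMapper.CountersEnum.INPUT_WORDS.toString())
        .getValue();
System.out.println("INPUT_WORDS = " + words);
System.exit(ok ? 0 : 1);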
2. Run method A, with the same arguments as V1
Packaging and uploading work exactly as for V1, so I won't repeat them. Run:
hadoop fs -rm -r /tmp/mr_test/output
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount2 /tmp/mr_test/word_test.txt /tmp/mr_test/output
Result:
hadoop fs -cat /tmp/mr_test/output/*
3. Create patterns.txt (the skip rules) and upload it to the cluster.
Each line of this file is a Java regular expression that the mapper strips from the input, which is why the punctuation below is escaped; see the illustration after the upload step. Create, edit, and save it:
vim patterns.txt
\.
\,
\!
to
\,
\。
Upload it:
hadoop fs -put patterns.txt /tmp/mr_test/
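A standalone illustration of why the escaping matters (my own example, not part of the job): each pattern line is handed to String.replaceAll as a regex, and an unescaped dot would match every character.

public class PatternDemo {
    public static void main(String[] args) {
        String s = "end. of. line.";
        // "\\." in Java source is the two-character regex \. , exactly what a
        // patterns.txt line like "\." contains: it matches a literal dot.
        System.out.println(s.replaceAll("\\.", "")); // "end of line": dots removed
        System.out.println(s.replaceAll(".", ""));   // "": "." matches any character
    }
}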
Run method B: pass two extra arguments to skip the characters that should not be counted
hadoop fs -rm -r /tmp/mr_test/output
hadoop jar mrtest-1.0-SNAPSHOT.jar com.yixin.WordCount2 /tmp/mr_test/word_test.txt /tmp/mr_test/output -skip /tmp/mr_test/patterns.txt
Result:
hadoop fs -cat /tmp/mr_test/output/*
The listed characters have been filtered out of the counts.
VII. ToolRunner, a utility that simplifies MR configuration:
You can use this utility to handle the generic option parsing up front, which trims the boilerplate on the caller's side: the main method shrinks to about two lines. I won't work through a full example here; I recommend this article:
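Still, a minimal sketch of the idea (my own illustration, not from the recommended article; the class name WordCountTool is made up, and it reuses the V1 Mapper/Reducer):

package com.yixin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountTool extends Configured implements Tool {

    // By the time run() is called, ToolRunner has already parsed the generic
    // options (-D, -files, -libjars, ...) into the Configuration behind getConf().
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), "word count");
        job.setJarByClass(WordCountTool.class);
        job.setMapperClass(WordCount.TokenizerMapper.class);
        job.setCombinerClass(WordCount.IntSumReducer.class);
        job.setReducerClass(WordCount.IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    // The "about two lines" mentioned above: delegate everything to ToolRunner.
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new WordCountTool(), args));
    }
}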