一、创建 maven 工程并导入依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>cn.kgc</groupId>
    <artifactId>mapreduce</artifactId>
    <version>1.0</version>
    <name>mapreduce</name>
    <!-- FIXME change it to the project's website -->
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <hadoop.version>3.1.3</hadoop.version>
        <log4j.version>1.2.17</log4j.version>
        <!-- FIX: hadoop-client 3.1.3 ships slf4j-api 1.7.25; the SLF4J 2.x
             (2.0.0-alpha1) binding uses a different provider mechanism and
             would not bind against it. Pin the matching 1.7.x binding. -->
        <slf4j.version>1.7.25</slf4j.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>${log4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <!-- FIX: 2.3.2 predates Java 8 support; 3.8.1 handles source/target 1.8 -->
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Builds the fat ("uber") jar with all dependencies bundled -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.3</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <!-- Entry point recorded in the jar's MANIFEST.MF.
                             FIX: the element is <mainClass>; <main-class> is not a
                             valid archiver element, so the manifest had no Main-Class. -->
                        <manifest>
                            <mainClass>cn.kgc.mapreduce.App</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <!-- FIX: typo "make-assemply" -->
                        <id>make-assembly</id>
                        <!-- Run the assembly automatically during `mvn package` -->
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
二、编写Mapper代码
package cn.kgc.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/**
 * Map stage of WordCount: strips English punctuation from each input line,
 * splits it on whitespace, and emits one (word, 1) pair per token.
 *
 * Input:  (byte offset of the line, line text)
 * Output: (word, 1)
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // FIX: compile the regexes once instead of re-parsing them on every map() call
    // (String.replaceAll recompiles the pattern each time).
    // Punctuation to strip, for English text: ! ; , . ? '
    private static final Pattern PUNCTUATION = Pattern.compile("!|;|,|\\.|\\?|'");
    // Runs of two or more spaces, collapsed to a single space.
    private static final Pattern MULTI_SPACE = Pattern.compile(" {2,}");

    // Reused across map() calls to avoid one allocation per record.
    private final Text keyOut = new Text();
    private final IntWritable valueOut = new IntWritable(1);

    // Context aggregates output from the framework's map tasks.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Remove punctuation, then squeeze repeated spaces down to one.
        String cleaned = MULTI_SPACE.matcher(
                PUNCTUATION.matcher(value.toString()).replaceAll("")).replaceAll(" ");
        // StringTokenizer's default delimiter set is whitespace.
        StringTokenizer tokens = new StringTokenizer(cleaned);
        while (tokens.hasMoreTokens()) { // FIX: pair hasMoreTokens() with nextToken()
            keyOut.set(tokens.nextToken());
            context.write(keyOut, valueOut);
        }
    }
}
三、编写 Reducer 代码
package cn.kgc.mapreduce;

import org.apache.hadoop.io.IntWritable;
// FIX: was `import javax.xml.soap.Text;` — the SOAP Text class, not Hadoop's.
// With the wrong Text the reducer's key type does not match the mapper's output
// and the job fails at runtime with a ClassCastException.
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce stage of WordCount: sums the 1-counts emitted for each word.
 * Also usable as a Combiner, since summation is associative and its
 * input and output types are identical.
 *
 * Input:  (word, [1, 1, ...])
 * Output: (word, total count)
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused across reduce() calls to avoid one allocation per key.
    private final IntWritable valueOut = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        // Idiomatic for-each instead of a manual Iterator loop.
        for (IntWritable count : values) {
            sum += count.get();
        }
        valueOut.set(sum);
        context.write(key, valueOut);
    }
}
四、编写 Job 执行代码
package cn.kgc.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

/**
 * WordCount job driver: validates the HDFS input/output paths, then
 * configures and submits the MapReduce job, blocking until it finishes.
 *
 * Usage: App [inputPath] [outputPath]
 * (defaults to /test/data/story.txt and /test/out/ when omitted)
 */
public class App {

    public static void main(String[] args) {
        // Load Hadoop's default configuration entries.
        Configuration conf = new Configuration(true);
        // Allow submitting from a non-Linux client (e.g. a Windows IDE).
        conf.set("mapreduce.app-submission.cross-platform", "true");

        // Generalized: paths may be supplied on the command line; the original
        // hard-coded values remain the defaults, so existing usage is unchanged.
        final Path pathIn = new Path(args.length > 0 ? args[0] : "/test/data/story.txt");
        final Path pathOut = new Path(args.length > 1 ? args[1] : "/test/out/");

        try {
            // FIX: try-with-resources — the original leaked the FileSystem on both
            // early-return paths (it was only closed on the success path). The FS
            // is used solely for the existence checks, so it is released here.
            try (FileSystem fs = FileSystem.get(
                    URI.create("hdfs://192.168.131.200:9820"), conf, "root")) {
                // The input file must exist...
                if (!fs.exists(pathIn)) {
                    System.err.println(pathIn.getName() + " doesn't exist");
                    return;
                }
                // ...and the output directory must NOT (MapReduce refuses to overwrite).
                if (fs.exists(pathOut)) {
                    System.err.println(pathOut.getName() + " has existed");
                    return;
                }
            }

            Job wcJob = Job.getInstance(conf, "wc");
            // Jar to ship to the cluster (requires `mvn package` first).
            wcJob.setJar("target/mapreduce-1.0-jar-with-dependencies.jar");
            wcJob.setJarByClass(App.class);

            // Mapper and its output key/value types.
            wcJob.setMapperClass(WordCountMapper.class);
            wcJob.setMapOutputKeyClass(Text.class);
            wcJob.setMapOutputValueClass(IntWritable.class);

            // The reducer doubles as the combiner: summation is associative.
            wcJob.setCombinerClass(WordCountReducer.class);
            wcJob.setNumReduceTasks(1);

            // Reducer and the job's final output key/value types.
            wcJob.setReducerClass(WordCountReducer.class);
            wcJob.setOutputKeyClass(Text.class);
            wcJob.setOutputValueClass(IntWritable.class);

            FileInputFormat.addInputPath(wcJob, pathIn);
            FileOutputFormat.setOutputPath(wcJob, pathOut);

            // true => stream progress to the console while waiting.
            // FIX: propagate success/failure via the exit code instead of
            // discarding waitForCompletion's boolean result.
            System.exit(wcJob.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}