MapReduce Basics: A Hand-Written Local WordCount Example

This article shows how to implement a Hadoop MapReduce WordCount job in Java. It walks through the Mapper and Reducer classes with commented code, explains how the Driver class configures and submits the job, and closes with the project's pom.xml and the test data. The Mapper splits each line of text into words; the Reducer sums the occurrences of each word.

I. Source Code

1. WordCountMapper class

package org.example.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reusable output key object (the word)
    private Text text = new Text();
    // Reusable output value object, fixed at 1 (one occurrence per word)
    private IntWritable intWritable = new IntWritable(1);

    /**
     * Override of the map method.
     * @param key byte offset of the current line within the input file
     * @param value the line of text
     * @param context context object used to emit map output
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // Get one line of the input split, e.g.
        // mysql mysql value value value
        String line = value.toString();

        // Split the line into words on the space delimiter
        // (consecutive spaces would yield empty tokens; the sample data uses single spaces)
        String[] words = line.split(" ");

        // Emit one key-value pair per word
        for (String word : words) {

            // Set the output key
            text.set(word);

            // Emit the map output
            // igeek igeek -> <igeek,1> <igeek,1>
            context.write(text, intWritable);
        }
    }
}
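To make the map step concrete, here is a minimal standalone sketch (plain Java, no Hadoop, written for this article as an illustration) that replays the same split-and-emit logic on the first line of the test data:

public class MapLogicDemo {
    public static void main(String[] args) {
        // Same split-and-emit logic as WordCountMapper.map, on one sample line
        String line = "mysql mysql value value value";
        for (String word : line.split(" ")) {
            // Each word becomes a <word,1> pair
            System.out.println("<" + word + ",1>");
        }
    }
}

Running it prints <mysql,1> twice and <value,1> three times, which is exactly what the Mapper hands to the shuffle.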

2. WordCountReducer class

package org.example.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reusable output value object (the total count)
    private IntWritable valueOut = new IntWritable();

    /**
     * Override of the reduce method.
     * @param key the word
     * @param values the collection of counts emitted for this word
     * @param context context object used to emit reduce output
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        // Running total for this word
        int sum = 0;

        // e.g. <igeek,(1,1)>
        for (IntWritable value : values) {
            // Accumulate the occurrences
            sum += value.get();
        }

        // Wrap the total in the output value object
        valueOut.set(sum);

        // Emit the reduce output
        context.write(key, valueOut);
    }
}
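Between map and reduce, the framework groups the emitted pairs by key and sorts them, so reduce sees, for example, <igeek,(1,1)> and sums it to 2. Here is a minimal plain-Java sketch of that group-and-sum behavior (a TreeMap stands in for the sorted shuffle; this is an illustration only, not part of the job):

import java.util.Map;
import java.util.TreeMap;

public class ShuffleReduceDemo {
    public static void main(String[] args) {
        String[] lines = {"mysql mysql value value value", "igeek igeek"};
        // TreeMap keeps keys sorted, mimicking the reducer's input order
        Map<String, Integer> counts = new TreeMap<>();
        for (String line : lines) {
            for (String word : line.split(" ")) {
                // Group by key and sum, as reduce() does over (1,1,...)
                counts.merge(word, 1, Integer::sum);
            }
        }
        counts.forEach((word, sum) -> System.out.println(word + "\t" + sum));
    }
}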

3. WordCountDriver class

package org.example.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Acts as the client of the MapReduce job: configures and submits it.
 */

public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Get the configuration and create the Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Associate the jar that contains this Driver
        job.setJarByClass(WordCountDriver.class);

        // 3. Associate the Mapper and Reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6. Set the input and output paths
        // setInputPaths accepts multiple input files; on a cluster you would
        // usually take the paths from the command line instead:
        // FileInputFormat.setInputPaths(job, new Path(args[0]));
        // FileOutputFormat.setOutputPath(job, new Path(args[1]));

        Path inputPath = new Path("D:\\Documentation\\Hello.txt");
        FileInputFormat.setInputPaths(job, inputPath);

        // The output directory must not already exist, or the job will fail
        Path outputPath = new Path("D:\\Documentation\\outHello");
        FileOutputFormat.setOutputPath(job, outputPath);

        // 7. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.out.println(result ? "Job completed successfully" : "Job failed");
    }

}
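One optional tweak, not in the original code: because the summing in WordCountReducer is associative and commutative, the same class can also be registered as a combiner, so map-side pre-aggregation shrinks the data shuffled to the reducers. The standard Job API call would go alongside step 3:

// Optional: run the reducer as a combiner on the map side
job.setCombinerClass(WordCountReducer.class);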

4. pom.xml
<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>mapreduce_demo</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>mapreduce_demo</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>3.1.3</version>
    </dependency>

  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
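With this pom.xml, mvn clean package builds target/mapreduce_demo-1.0-SNAPSHOT.jar (Maven's default naming from artifactId and version). Since hadoop-client 3.1.3 is the only dependency, the Driver above can also be run straight from the IDE in Hadoop's local mode against the D:\ paths used in the code; note that local mode on Windows typically also requires HADOOP_HOME and winutils.exe to be configured.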

5. Hello.txt test data

mysql mysql value value value
igeek igeek
mysql hive port property
mysql value value value property
mysql value port property
mysql hive port property
mysql value hive hive hive property
mysql hive port property
mysql port property
mysql value hive hive port property
value property
value hive
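
Counting by hand, the twelve lines above contain 47 words in total. With the default TextOutputFormat (a tab between key and value, keys in sorted order), the part-r-00000 file in the outHello directory should therefore read:

hive	9
igeek	2
mysql	10
port	6
property	9
value	11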


II. Run Screenshots

[Screenshots of the job run and its output]

Companion article:
Big Data Assignment 4 (includes a WordCount example on HDFS)
https://blog.csdn.net/m0_48170265/article/details/130029532?spm=1001.2014.3001.5501
