POM文件依赖与插件部分
<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.8.5</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-jar-plugin</artifactId>
            <version>2.4</version>
            <configuration>
                <archive>
                    <manifest>
                        <!-- Tell maven-jar-plugin to add a Class-Path entry to MANIFEST.MF -->
                        <addClasspath>true</addClasspath>
                        <!-- All dependencies are expected under a "lib/" folder relative to the jar -->
                        <classpathPrefix>lib/</classpathPrefix>
                        <!-- Main class executed when the jar is run directly -->
                        <mainClass>hp.wordsCount.WordCountDriver</mainClass>
                    </manifest>
                </archive>
            </configuration>
        </plugin>
        <plugin>
            <artifactId>maven-resources-plugin</artifactId>
            <version>3.0.2</version>
        </plugin>
        <plugin>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.8.0</version>
        </plugin>
        <plugin>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.22.1</version>
        </plugin>
        <plugin>
            <artifactId>maven-install-plugin</artifactId>
            <version>2.5.2</version>
        </plugin>
        <plugin>
            <artifactId>maven-deploy-plugin</artifactId>
            <version>2.8.2</version>
        </plugin>
        <plugin>
            <!-- Fixed: the artifactId previously contained surrounding spaces,
                 which prevents Maven from resolving the plugin. A version is
                 pinned for reproducible builds. -->
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.1.1</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
    <!-- Fixed: removed a stray </pluginManagement> closing tag that had no
         matching opening tag and made the POM ill-formed XML. -->
</build>
</project>
Map阶段
package hp.wordsCount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map phase of the word-count job.
 *
 * Input  key:   byte offset of the line within the split ({@link LongWritable})
 * Input  value: one line of input text ({@link Text})
 * Output key:   a single word ({@link Text})
 * Output value: the constant count 1 ({@link IntWritable})
 */
public class WordsCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output objects: the framework serializes the contents on each
    // context.write(), so a single instance can be reused instead of
    // allocating two fresh objects per emitted word.
    private final Text outWord = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Invoked once for every {@code <key, value>} pair delivered by the input
     * format (i.e. once per input line).
     *
     * @param key     byte offset of the current line in the input split
     * @param value   the text of the current line
     * @param context MR context used to emit the mapper's output, which becomes
     *                the reducer's input
     * @throws IOException          on serialization or I/O failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Splits on a single space, matching the original behavior; runs of
        // multiple spaces therefore yield empty tokens that are emitted as-is.
        for (String word : line.split(" ")) {
            outWord.set(word);
            context.write(outWord, ONE);
        }
    }
}
Reduce阶段
package hp.wordsCount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce phase of the word-count job.
 *
 * Input  key:   a word, matching the mapper's output key ({@link Text})
 * Input  value: per-occurrence counts from the mapper ({@link IntWritable})
 * Output key:   the word ({@link Text})
 * Output value: the total number of occurrences ({@link IntWritable})
 *
 * The framework groups all mapper output by key (sorted lexicographically)
 * before invoking {@link #reduce}.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value: avoids allocating one IntWritable per distinct key,
    // since the framework serializes the value on each context.write().
    private final IntWritable total = new IntWritable();

    /**
     * Sums the occurrence counts for a single word.
     *
     * @param key     the word being aggregated
     * @param values  all counts emitted for this word by the map phase
     * @param context MR context used to emit the final (word, total) pair
     * @throws IOException          on serialization or I/O failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable v : values) {
            count += v.get();
        }
        total.set(count);
        context.write(key, total);
    }
}
Driver运行主类
package hp.wordsCount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver (main) class for the word-count MR job: assembles the run-time
 * configuration — which mapper/reducer to use, the output key/value types,
 * and the input/output paths — then submits the job and waits for it.
 */
public class WordCountDriver {

    /** Default HDFS paths, used when no command-line arguments are given. */
    private static final String DEFAULT_INPUT = "/wordcount/input";
    private static final String DEFAULT_OUTPUT = "/wordcount/output";

    /**
     * @param args optional: args[0] = input path, args[1] = output path.
     *             Falls back to the original hard-coded defaults, so existing
     *             invocations without arguments behave exactly as before.
     */
    public static void main(String[] args) throws Exception {
        // The Job object encapsulates all information for this MR run.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Jar containing the job classes, located via this driver class.
        job.setJarByClass(WordCountDriver.class);
        // Mapper / reducer implementations for this job.
        job.setMapperClass(WordsCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Run the reducer as a combiner on each mapper's local output: safe
        // because summing counts is associative and commutative, and it cuts
        // shuffle traffic significantly.
        job.setCombinerClass(WordCountReducer.class);
        // Mapper output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final (reducer) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input data path and output directory (output dir must not exist yet).
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, new Path(output));
        // Submit the job, print progress, and exit 0 on success / 1 on failure.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
打包时注意使用 assembly 插件将依赖一同打包进 jar;否则运行时会报 Exception in thread "main" java.lang.ClassNotFoundException 错误。
本地运行模式
在 Driver 中为 conf 添加如下配置即可本地运行:
conf.set("mapreduce.framework.name","local");