大数据hadoop3.1.3——WordCount案例操作--Windows本地提交集群以及打jar包到集群上执行

WordCount案例实操

1．需求
在给定的文本文件中统计输出每一个单词出现的总次数
（1）输入数据到*.txt文件，预计出现次数如下：
banzhang 1
cls 2
hadoop 1
jiao 1
ss 2
xue 1
2．需求分析

按照MapReduce编程规范，分别编写Mapper，Reducer，Driver

WordCount需求分析

3．环境准备

（1）创建maven工程

（2）在pom.xml文件中添加如下依赖

<dependencies>   
  <!-- JUnit 4 unit-testing framework -->
  <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <version>4.12</version>   
  </dependency>   
  <!-- Log4j 2 binding for SLF4J; configured through log4j2.xml on the classpath -->
  <dependency>       
       <groupId>org.apache.logging.log4j</groupId>
       <artifactId>log4j-slf4j-impl</artifactId>
       <version>2.12.0</version>   
  </dependency>   
  <!-- Hadoop client libraries; NOTE(review): version presumably should match the cluster (3.1.3) - confirm -->
  <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-client</artifactId>       
       <version>3.1.3</version>   
   </dependency>
</dependencies>

（3）在项目的src/main/resources目录下，新建一个文件，命名为“log4j2.xml”，并在文件中填入以下内容。

  <?xml version="1.0"  encoding="UTF-8"?>
  <!-- Log4j 2 configuration: sends all log output at level info and above to the console. -->
  <!-- NOTE(review): in the real file the XML declaration above presumably must start at the very first column of line 1 - confirm when copying. -->
  <Configuration  status="error" strict="true" name="XMLConfig">
       <Appenders>
           <!-- Appender of type Console; the name attribute is required -->
           <Appender type="Console" name="STDOUT">
              <!-- Uses a PatternLayout; output looks like: [INFO]  [2018-01-22 17:34:01][org.test.Console]I'm here -->
              <Layout type="PatternLayout"  pattern="[%p]  [%d{yyyy-MM-dd HH:mm:ss}][%c{10}]%m%n" />
           </Appender>   
       </Appenders>   
       <Loggers>
           <!-- Logger named "test" with additivity set to false -->
           <Logger name="test" level="info"  additivity="false">
              <AppenderRef  ref="STDOUT" />
           </Logger>   
           <!-- Root logger configuration -->
           <Root level="info">
              <AppenderRef  ref="STDOUT" />
           </Root>
       </Loggers>   
  </Configuration>

4．编写程序
（1）编写Mapper类

package com.caron.mr.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper of the WordCount job: turns every line of input text into
 * {@code (word, 1)} pairs.
 *
 * <p>The four type parameters describe two key/value pairs, one for the input
 * and one for the output of the map phase.
 *
 * <p>Input pair:
 * <ul>
 *   <li>KEYIN   : {@link LongWritable} - the offset at which the line was read from the file.</li>
 *   <li>VALUEIN : {@link Text} - one line read from the file.</li>
 * </ul>
 *
 * <p>Output pair:
 * <ul>
 *   <li>KEYOUT  : {@link Text} - a single word.</li>
 *   <li>VALUEOUT: {@link IntWritable} - the constant 1, i.e. "this word occurred once".</li>
 * </ul>
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable> {

    // Output value, reused for every record: each emitted word counts once.
    IntWritable outV = new IntWritable(1);

    // Output key, reused for every record: overwritten with the current word before each write.
    Text outK = new Text();

    /**
     * Splits one input line into words and emits {@code (word, 1)} for each of them.
     *
     * <p>The line is split on runs of whitespace (spaces, tabs, ...), and empty
     * tokens are skipped, so blank lines and leading or repeated separators do
     * not produce an empty-string "word" in the result.
     *
     * @param key     the input key (offset of the line); not used
     * @param value   the input value, i.e. one line read from the file
     * @param context the task context used to write the output pairs
     * @throws IOException          if writing an output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Convert the input line to a String, e.g. "atguigu atguigu".
        String line = value.toString();
        // 2. Split on any run of whitespace rather than on a single space.
        String[] splits = line.split("\\s+");
        // 3. Emit every non-empty word as (word, 1).
        for (String word : splits) {
            if (word.isEmpty()) {
                // Produced by a blank line or by leading whitespace; not a real word.
                continue;
            }
            outK.set(word);
            context.write(outK, outV);
        }
    }
}

（2）编写Reducer类

package com.caron.mr.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer of the WordCount job: adds up all the counts emitted for one word.
 *
 * <p>The four type parameters describe two key/value pairs, one for the input
 * and one for the output of the reduce phase.
 *
 * <p>Input pair:
 * <ul>
 *   <li>KEYIN   : {@link Text} - a word, matching the key written by the mapper.</li>
 *   <li>VALUEIN : {@link IntWritable} - a count, matching the value written by the mapper.</li>
 * </ul>
 *
 * <p>Output pair:
 * <ul>
 *   <li>KEYOUT  : {@link Text} - the word.</li>
 *   <li>VALUEOUT: {@link IntWritable} - the total number of occurrences of that word.</li>
 * </ul>
 */
public class WordCountReducer extends Reducer<Text, IntWritable,Text,IntWritable> {

    // Output value, reused across calls and overwritten with each word's total.
    IntWritable outV = new IntWritable();

    /**
     * Sums every value received for {@code key} and writes {@code (key, total)}.
     *
     * @param key     the input key, i.e. one word
     * @param values  all values that were emitted for this key
     * @param context the task context used to write the output pair
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Wrap the total in the reusable writable, then emit it under the same key.
        outV.set(totalOf(values));
        context.write(key, outV);
    }

    /** Returns the sum of all counts in {@code counts}. */
    private static int totalOf(Iterable<IntWritable> counts) {
        int total = 0;
        for (IntWritable count : counts) {
            total = total + count.get();
        }
        return total;
    }
}

（3）编写Driver驱动类

package com.caron.mr.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

/**
 * Driver of the WordCount job: configures the job, submits it and waits for
 * it to finish.
 *
 * <p>Usage: {@code WordCountDriver <input path> <output path>}. For a local
 * run, pass the paths as program arguments (for example
 * {@code F:/input/inputWord.txt F:/output}).
 */
public class WordCountDriver {

    /**
     * Configures and runs the job, then terminates the JVM with exit code 0 if
     * the job succeeded, 1 if it failed, or 2 if the arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a class configured on the job cannot be found
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCountDriver <input path> <output path>");
            System.exit(2);
        }
        //1. Create the Job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        //2. Associate the jar that contains this driver
        job.setJarByClass(WordCountDriver.class);
        //3. Associate the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        //4. Set the key and value types of the Mapper output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        //5. Set the key and value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //6. Set the input and output paths.
        //   The output path must not exist yet; the job reports an exception if it does.
        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));
        //7. Submit the job and report its outcome through the process exit code
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

5．本地测试

（1）需要首先配置好HADOOP_HOME环境变量以及Windows运行依赖
（2）在Eclipse/Idea上运行程序

6．集群上测试

（0）用maven打jar包，如果有需要一并打进去的依赖，需要添加打包插件

注意：标记加粗的部分需要替换为自己工程主类

<build>
  <plugins>
     <!-- Compile the sources as Java 8 -->
     <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
           <source>1.8</source>
           <target>1.8</target>
        </configuration>
     </plugin>
     <!-- Additionally build a "jar-with-dependencies" archive during the package phase -->
     <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
           <descriptorRefs>
              <descriptorRef>jar-with-dependencies</descriptorRef>
           </descriptorRefs>
           <archive>
              <manifest>
                 <!-- Fully qualified main class; replace with the main class of your own project -->
                 <mainClass>com.caron.mr.wordcount.WordCountDriver</mainClass>
              </manifest>
           </archive>
        </configuration>
        <executions>
           <execution>
              <id>make-assembly</id>
              <phase>package</phase>
              <goals>
                 <goal>single</goal>
              </goals>
           </execution>
        </executions>
     </plugin>
  </plugins>
</build>

注意：如果工程上显示红叉。在项目上右键->maven->update project即可。

（1）将程序打成jar包，然后拷贝到Hadoop集群中

步骤详情：Maven ->lifecycle->install。等待编译完成就会在项目的target文件夹中生成jar包。如果看不到。在项目上右键 -> Refresh，即可看到。修改不带依赖的jar包名称为wc.jar，并拷贝该jar包到Hadoop集群。

（2）启动Hadoop集群

（3）执行WordCount程序
hadoop jar wc.jar com.caron.mr.wordcount.WordCountDriver \
/user/caron/input /user/caron/output

7．在Windows上向集群提交任务
（1）添加必要配置信息

/**
 * Driver that submits the WordCount job from a Windows machine to a remote
 * cluster, locating the job jar through {@code setJarByClass}.
 *
 * <p>Usage: {@code WordcountDriver <input path> <output path>}.
 */
public class WordcountDriver {

  /**
   * Configures the connection to the cluster, runs the job and terminates the
   * JVM with exit code 0 if the job succeeded, 1 if it failed, or 2 if the
   * arguments are missing.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} the output path
   * @throws IOException            if the job cannot be created or submitted
   * @throws ClassNotFoundException if a class configured on the job cannot be found
   * @throws InterruptedException   if the thread is interrupted while waiting for the job
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
      // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
      if (args.length < 2) {
          System.err.println("Usage: WordcountDriver <input path> <output path>");
          System.exit(2);
      }
      //1 Build the configuration and wrap it in a job
      Configuration configuration = new Configuration();
      // Address of the HDFS NameNode
      configuration.set("fs.defaultFS","hdfs://hadoop101:9820");
      // Run MapReduce on YARN
      configuration.set("mapreduce.framework.name","yarn");
      // Allow the job to be submitted to a remote cluster from another platform
      configuration.set("mapreduce.app-submission.cross-platform","true");
      // Host of the YARN ResourceManager
      configuration.set("yarn.resourcemanager.hostname","hadoop102");
      Job job = Job.getInstance(configuration);
      //2 Set the jar by the class it contains
      job.setJarByClass(WordcountDriver.class);
      //3 Set the map and reduce classes (the WordCountMapper / WordCountReducer written for this job)
      job.setMapperClass(WordCountMapper.class);
      job.setReducerClass(WordCountReducer.class);
      //4 Set the map output key/value types
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(IntWritable.class);
      //5 Set the final output key/value types
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);
      //6 Set the input and output paths
      FileInputFormat.setInputPaths(job,new Path(args[0]));
      FileOutputFormat.setOutputPath(job,new Path(args[1]));
      //7 Submit the job and report its outcome through the process exit code
      boolean result = job.waitForCompletion(true);
      System.exit(result? 0 : 1);
  }
}

（2）先进行打包，并将Jar包设置到Driver中，集群中运行需要指定jar包

/**
 * Driver that submits the WordCount job from a Windows machine to a remote
 * cluster, pointing the job at an already packaged jar through {@code setJar}.
 *
 * <p>Usage: {@code WordcountDriver <input path> <output path>}.
 */
public class WordcountDriver {

  /**
   * Configures the connection to the cluster, runs the job and terminates the
   * JVM with exit code 0 if the job succeeded, 1 if it failed, or 2 if the
   * arguments are missing.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} the output path
   * @throws IOException            if the job cannot be created or submitted
   * @throws ClassNotFoundException if a class configured on the job cannot be found
   * @throws InterruptedException   if the thread is interrupted while waiting for the job
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException,InterruptedException {
      // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
      if (args.length < 2) {
          System.err.println("Usage: WordcountDriver <input path> <output path>");
          System.exit(2);
      }
      //1 Build the configuration (NameNode address, YARN as framework,
      //  cross-platform submission, ResourceManager host) and wrap it in a job
      Configuration configuration = new Configuration();
      configuration.set("fs.defaultFS","hdfs://hadoop101:9820");
      configuration.set("mapreduce.framework.name","yarn");
      configuration.set("mapreduce.app-submission.cross-platform","true");
      configuration.set("yarn.resourcemanager.hostname","hadoop102");
      Job job = Job.getInstance(configuration);
      //2 Set the job jar explicitly; replace the path with the jar built from your own project
      job.setJar("D:\\input\\MapReduce-1.0-SNAPSHOT.jar");
      //3 Set the map and reduce classes (the WordCountMapper / WordCountReducer written for this job)
      job.setMapperClass(WordCountMapper.class);
      job.setReducerClass(WordCountReducer.class);
      //4 Set the map output key/value types
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(IntWritable.class);
      //5 Set the final output key/value types
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);
      //6 Set the input and output paths
      FileInputFormat.setInputPaths(job,new Path(args[0]));
      FileOutputFormat.setOutputPath(job,new Path(args[1]));
      //7 Submit the job and report its outcome through the process exit code
      boolean result = job.waitForCompletion(true);
      System.exit(result? 0 : 1);
  }
}