POM文件依赖与插件部分
<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.8.5</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-jar-plugin</artifactId>
            <version>2.4</version>
            <configuration>
                <archive>
                    <manifest>
                        <!-- Tell maven-jar-plugin to add a Class-Path entry to MANIFEST.MF -->
                        <addClasspath>true</addClasspath>
                        <!-- All dependencies are expected under a "lib/" folder relative to the jar -->
                        <classpathPrefix>lib/</classpathPrefix>
                        <!-- Main class executed when the jar is run directly -->
                        <mainClass>hp.wordsCount.WordCountDriver</mainClass>
                    </manifest>
                </archive>
            </configuration>
        </plugin>
        <plugin>
            <artifactId>maven-resources-plugin</artifactId>
            <version>3.0.2</version>
        </plugin>
        <plugin>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.8.0</version>
        </plugin>
        <plugin>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.22.1</version>
        </plugin>
        <plugin>
            <artifactId>maven-install-plugin</artifactId>
            <version>2.5.2</version>
        </plugin>
        <plugin>
            <artifactId>maven-deploy-plugin</artifactId>
            <version>2.8.2</version>
        </plugin>
        <plugin>
            <!-- Fixed: the artifactId previously contained surrounding spaces,
                 which prevents Maven from resolving the plugin. A version is
                 pinned for reproducible builds. -->
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.1.1</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
    <!-- Fixed: removed a stray </pluginManagement> closing tag that had no
         matching opening tag and made the POM ill-formed XML. -->
</build>
</project>
Map阶段
package hp.wordsCount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map phase of the word-count job.
 *
 * Input  key:   byte offset of the line within the split ({@link LongWritable})
 * Input  value: one line of input text ({@link Text})
 * Output key:   a single word ({@link Text})
 * Output value: the constant count 1 ({@link IntWritable})
 */
public class WordsCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused output objects: the framework serializes the contents on each
    // context.write(), so a single instance can be reused instead of
    // allocating two fresh objects per emitted word.
    private final Text outWord = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Invoked once for every {@code <key, value>} pair delivered by the input
     * format (i.e. once per input line).
     *
     * @param key     byte offset of the current line in the input split
     * @param value   the text of the current line
     * @param context MR context used to emit the mapper's output, which becomes
     *                the reducer's input
     * @throws IOException          on serialization or I/O failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Splits on a single space, matching the original behavior; runs of
        // multiple spaces therefore yield empty tokens that are emitted as-is.
        for (String word : line.split(" ")) {
            outWord.set(word);
            context.write(outWord, ONE);
        }
    }
}
Reduce阶段
package hp.wordsCount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce phase of the word-count job.
 *
 * Input  key:   a word, matching the mapper's output key ({@link Text})
 * Input  value: per-occurrence counts from the mapper ({@link IntWritable})
 * Output key:   the word ({@link Text})
 * Output value: the total number of occurrences ({@link IntWritable})
 *
 * The framework groups all mapper output by key (sorted lexicographically)
 * before invoking {@link #reduce}.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value: avoids allocating one IntWritable per distinct key,
    // since the framework serializes the value on each context.write().
    private final IntWritable total = new IntWritable();

    /**
     * Sums the occurrence counts for a single word.
     *
     * @param key     the word being aggregated
     * @param values  all counts emitted for this word by the map phase
     * @param context MR context used to emit the final (word, total) pair
     * @throws IOException          on serialization or I/O failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable v : values) {
            count += v.get();
        }
        total.set(count);
        context.write(key, total);
    }
}
Driver运行主类
package hp.wordsCount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver (main) class for the word-count MR job: assembles the run-time
 * configuration — which mapper/reducer to use, the output key/value types,
 * and the input/output paths — then submits the job and waits for it.
 */
public class WordCountDriver {

    /** Default HDFS paths, used when no command-line arguments are given. */
    private static final String DEFAULT_INPUT = "/wordcount/input";
    private static final String DEFAULT_OUTPUT = "/wordcount/output";

    /**
     * @param args optional: args[0] = input path, args[1] = output path.
     *             Falls back to the original hard-coded defaults, so existing
     *             invocations without arguments behave exactly as before.
     */
    public static void main(String[] args) throws Exception {
        // The Job object encapsulates all information for this MR run.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Jar containing the job classes, located via this driver class.
        job.setJarByClass(WordCountDriver.class);
        // Mapper / reducer implementations for this job.
        job.setMapperClass(WordsCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Run the reducer as a combiner on each mapper's local output: safe
        // because summing counts is associative and commutative, and it cuts
        // shuffle traffic significantly.
        job.setCombinerClass(WordCountReducer.class);
        // Mapper output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final (reducer) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Input data path and output directory (output dir must not exist yet).
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, new Path(output));
        // Submit the job, print progress, and exit 0 on success / 1 on failure.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
打包时注意使用 assembly 插件将依赖一同打包进 jar;否则运行时会报 Exception in thread "main" java.lang.ClassNotFoundException 错误。
本地运行模式
在 Driver 中为 conf 添加如下配置即可本地运行:
conf.set("mapreduce.framework.name","local");