A complete MapReduce (MR) program is built in three stages.
0. Preparation
Add the required dependencies to pom.xml:
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<spring.version>4.3.13.RELEASE</spring.version>
<hadoop.version>3.1.3</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
<!-- The class that contains the main method of the code being packaged -->
<mainClass>me.test.mapreduce.mr.WordCountDriver</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</build>
1. Mapper stage
package me.test.mapreduce.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Business logic of the mapper stage of the MR program.
 * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
 * KEYIN, VALUEIN: types of the input key and value. In this example the input key is the byte offset of the current line (LongWritable) and the input value is the content of that line (Text).
 * KEYOUT, VALUEOUT: types of the output key and value. The words split out of each line are sent to the reducer, so key: Text, value: IntWritable.
 * The usual JDK types (String, Long, Integer) serialize inefficiently (they carry more than just the data), so Hadoop ships its own serializable types:
 * long    - LongWritable
 * String  - Text
 * Integer - IntWritable
 * null    - NullWritable
 * (A short sketch showing how these writable objects can be reused follows this class.)
 */
//public class WordCountMapper extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
/**
 * Concrete implementation of the mapper stage. Whether this method runs depends on the input-reading component feeding data into the MR job: it is called once for every <k,v> pair the reader produces.
 * @param key
 * @param value
 * @param context
 * @throws IOException
 * @throws InterruptedException
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//Take the incoming line and convert it to a String
String line = value.toString();
//Split the line into individual words
String [] words = line.split(" ");
//Iterate over the words and emit each one with a count of 1 (<word,1>)
for (String word: words) {
//Use the MR context to emit the mapper output; it becomes the input of the reduce stage
context.write(new Text(word), new IntWritable(1));
}
}
}
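As the class comment notes, Hadoop's writable types exist mainly to make serialization cheap. A related, widely used optimization (not part of the original code, shown only as a minimal sketch with the same package and imports as the class above) is to reuse one Text and one IntWritable instance across map() calls instead of allocating new objects for every word:

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    //Reused output objects; set() overwrites the Text contents before each write
    private final Text outKey = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            outKey.set(word);           //overwrite the reused Text with the current word
            context.write(outKey, ONE); //the framework serializes the pair at write time, so reuse is safe
        }
    }
}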
2. Reducer stage
package me.test.mapreduce.mr;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 *
 * This is the reducer stage; it receives the data sent over from the mapper stage.
 * Input: key (the word): Text, value: IntWritable
 */
public class WordCountReduce extends Reducer<Text, IntWritable,Text,IntWritable> {
/**
 * After all mapper output has been received, it is sorted by key (lexicographically) and then grouped by key; the values of each group are handed to the reduce method as one iterable.
 *
 * Rough data flow:
 * Incoming data: <hadoop,1><spark,1><hello,1><wordcount,1><hadoop,1><hadoop,1>
 * After sorting: <hadoop,1><hadoop,1><hadoop,1><hello,1><spark,1><wordcount,1>
 * One group is passed in: <hadoop,1><hadoop,1><hadoop,1>, i.e. <hadoop,[1,1,1]>
 * reduce sums the group: <hadoop,3>
 * The next group is passed in: <hello,1>
 * ......
 * (A note on reusing this class as an optional combiner follows the class below.)
 */
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
//Counter for the current key
int count = 0;
//Iterate over the values of this group
for (IntWritable value: values) {
count = count + value.get();
}
context.write(key,new IntWritable(count));
}
}
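Because this reduce logic just sums the values, it is commutative and associative, so the same class can also act as a combiner that pre-aggregates <word,1> pairs on the map side before the shuffle. This is optional and not part of the original driver; a one-line sketch of how it could be registered there, after job.setReducerClass(...):

// Optional: reuse the reducer as a combiner to pre-aggregate map output before the shuffle
job.setCombinerClass(WordCountReduce.class);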
3. Driver (main) class
The input and output paths are taken from the command-line arguments and can be edited as needed.
package me.test.mapreduce.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * This is the main class of the MR program. It assembles the information the job needs at runtime,
 * for example where the input data lives, where the output goes, and which Mapper and Reducer classes are used.
 */
public class WordCountDriver extends Configured implements Tool {
public static String INPUT = "";
public static String OUTPUT = "";
@Override
public int run(String[] strings) throws Exception {
//Configure the input and output paths
INPUT = strings[0];
OUTPUT = strings[1];
//Use the Configuration prepared by ToolRunner (Tool/Configured pattern)
Configuration conf = getConf();
//Delete the output path if it already exists
Path output = new Path(OUTPUT);
if (output.getFileSystem(conf).exists(output)){
output.getFileSystem(conf).delete(output,true);
System.out.println("Output path already existed and was deleted automatically.");
}
//A Job object encapsulates the information of this MR run
Job job = Job.getInstance(conf,WordCountDriver.class.getSimpleName());
//Specify the driver class so Hadoop can locate the jar
job.setJarByClass(WordCountDriver.class);
//Specify the Mapper and Reducer classes used by this job
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReduce.class);
//Key/value types of the map output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//Key/value types of the reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//Input and output paths for this MR job
//Note the packages used here: org.apache.hadoop.mapreduce.lib.input.FileInputFormat and org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
FileInputFormat.setInputPaths(job,INPUT);
FileOutputFormat.setOutputPath(job,output);
boolean b = job.waitForCompletion(true);
return b ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new WordCountDriver(), args);
System.exit(exitCode);
}
}
4. Package, upload, and run a test
Upload to HDFS and run the job:
[root@master ~]# hadoop jar maven-hadoop-test1-2.0.jar /abcabcabc.txt /output
Output path already existed and was deleted automatically.
------------------
[root@master ~]# hadoop fs -ls -R /output
-rw-r--r-- 1 root supergroup 0 2020-03-23 20:42 /output/_SUCCESS
-rw-r--r-- 1 root supergroup 43 2020-03-23 20:42 /output/part-r-00000
[root@master ~]# hadoop fs -cat /output/part-r-00000
2020-03-23 20:43:24,182 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
aasasadadas 1
adasda 1
as 3
asd 3
d 3
sd 1
[root@master ~]#
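There is a single part-r-00000 file because the job ran with the default of one reduce task. As a hedged sketch (not in the original code), the driver could request more reducers, and each reducer would then write its own part-r-NNNNN file:

// Optional: run with two reduce tasks, producing part-r-00000 and part-r-00001
job.setNumReduceTasks(2);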
5. Running MapReduce in local mode
Modify the run method of the Driver class as follows:
Configuration conf = getConf();
conf.set("mapreduce.framework.name","local"); //The default value is already "local", so this setting can be omitted.
The default configuration in the jars that IDEA imports already covers this: in mapred-default.xml (inside hadoop-mapreduce-client-core), mapreduce.framework.name defaults to local.
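As a minimal sketch, assuming hypothetical local paths that you create beforehand, the same driver can then be launched straight from IDEA against the local file system:

package me.test.mapreduce.mr;

import org.apache.hadoop.util.ToolRunner;

//Hypothetical helper class, not part of the original code: runs the job locally from the IDE
public class WordCountLocalRunner {
    public static void main(String[] args) throws Exception {
        //Assumed local input/output paths; create the input directory and put a text file in it first
        String[] localArgs = {"file:///tmp/wordcount/input", "file:///tmp/wordcount/output"};
        System.exit(ToolRunner.run(new WordCountDriver(), localArgs));
    }
}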