1. Preparation
1.1 Remote file preparation
(1) Create the data file
[hadoop@hadoop181 ~]$ mkdir mapreduce
[hadoop@hadoop181 ~]$ cd mapreduce/
# Create the file
[hadoop@hadoop181 mapreduce]$ vim wordcount.txt
# The newly added file content
[hadoop@hadoop181 mapreduce]$ cat wordcount.txt
hello,world,hadoop
hive,sqoop,flume,hello
kitty,tom,jerry,world
hadoop
[hadoop@hadoop181 mapreduce]$
(2) Upload the data file to HDFS
[hadoop@hadoop181 mapreduce]$ hdfs dfs -mkdir -p /wordcount/
[hadoop@hadoop181 mapreduce]$ hdfs dfs -put wordcount.txt /wordcount/
1.2 Local file preparation
(1) Create the file under the project directory
(2) File content
hello,world,hadoop
hive,sqoop,flume,hello
kitty,tom,jerry,world
hadoop
2. Create the module
(1) Create the module mapreduce-core; the pom dependency configuration is as follows (versions for the unversioned dependencies are presumably managed by a parent pom):
<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-slf4j-impl</artifactId>
        <version>2.12.0</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <minimizeJar>true</minimizeJar>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
(2) Create three class files: JobMain (the job launcher class), a Mapper implementation class, and a Reducer implementation class.
3. Class implementations
3.1 Mapper implementation
package cn.learn.bigdata.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper implementation for the WordCount example. This class overrides the map method,
 * whose main job is to convert (k1, v1) pairs into (k2, v2) pairs.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
        // Split the line into words
        String[] split = value.toString().split(",");
        for (String word : split) {
            text.set(word);
            longWritable.set(1);
            context.write(text, longWritable);
        }
    }
}
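One optional refinement worth noting: the Text and LongWritable objects above are allocated on every map() call. Hadoop allows writable instances to be reused across calls, so a common pattern is to hoist them into fields. Below is a minimal sketch of that variant (the class name WordCountMapperV2 is made up for illustration; behavior is otherwise the same as the original class):

package cn.learn.bigdata.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WordCountMapperV2 extends Mapper<LongWritable, Text, Text, LongWritable> {
    // Reuse the same writable instances across map() calls instead of allocating per record
    private final Text word = new Text();
    private final LongWritable one = new LongWritable(1);
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String token : value.toString().split(",")) {
            word.set(token);
            context.write(word, one);
        }
    }
}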
3.2 Reducer implementation
package cn.learn.bigdata.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer implementation for the WordCount example.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    /**
     * @param key     the new K2 (a word)
     * @param values  the new V2 (the collection of counts for that word)
     * @param context the task context object
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0L;
        for (LongWritable value : values) {
            count += value.get();
        }
        context.write(key, new LongWritable(count));
    }
}
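Because this reduce function is a plain sum (associative and commutative), the same class can also be used as a combiner to pre-aggregate map output before the shuffle. This is optional and is not part of the original code; if you want it, it is a one-line addition in the job setup:

// Optional: reuse the reducer as a combiner to cut down shuffle traffic
job.setCombinerClass(WordCountReducer.class);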
3.3 Launcher class implementation
There are two ways to launch the job: either call ToolRunner.run(new Configuration(), new JobMain(), args) with the JobMain class I wrote below, or define a Driver class that launches the job directly from its own main method.
(1) Option 1: JobMain implementation
package cn.learn.bigdata.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Main class that launches the job.
 */
public class JobMain extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        /* Get the job object */
        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf, "wordCount");
        // Needed for the cluster; optional when running locally, but mandatory for cluster execution
        job.setJarByClass(JobMain.class);
        /* Configure the job */
        // 1. Specify the input format and input path; new Path(...) can also be a remote hdfs:// path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("input/wordcount.txt"));
        // 2. Specify the Map-phase processing (K2, V2)
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 3. Specify the Reduce-phase processing (K3, V3)
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 4. Specify the output format and output path; new Path(...) can also be a remote hdfs:// path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("output"));
        /* Submit the job and wait for it to finish */
        return job.waitForCompletion(true) ? 0 : 1;
    }
    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new Configuration(), new JobMain(), args);
        System.exit(run);
    }
}
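One thing to watch out for: TextOutputFormat refuses to run if the output path already exists, so re-running the job against the same path fails with a FileAlreadyExistsException. If you want the job to be re-runnable, one option is to delete the old output directory inside run() before setting the output path. This is a sketch of mine, not part of the original JobMain, and it needs one extra import (org.apache.hadoop.fs.FileSystem):

// Optional: remove a previous output directory so the job can be re-run
Path outputPath = new Path("output");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);   // true = delete recursively
}
TextOutputFormat.setOutputPath(job, outputPath);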
(2) Option 2: Driver implementation
package cn.learn.bigdata.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
/**
 * Driver implementation.
 */
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        /* Get the job object */
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "wordCount");
        // Needed for the cluster; optional when running locally, but mandatory for cluster execution
        job.setJarByClass(Driver.class);
        /* Configure the job */
        // 1. Specify the input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("input/wordcount.txt"));
        // 2. Specify the Map-phase processing (K2, V2)
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 3. Specify the Reduce-phase processing (K3, V3)
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 4. Specify the output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("output2"));
        /* Submit the job and wait for it to finish */
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);   // exit code 0 on success, consistent with JobMain
    }
}
4. Local testing
4.1 Testing with the JobMain launcher
(1) Location of the generated output
(2) Computed result
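For the sample input above, with the default single reducer, the output file output/part-r-00000 should contain the following word counts (key and count separated by a tab):

flume	1
hadoop	2
hello	2
hive	1
jerry	1
kitty	1
sqoop	1
tom	1
world	2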
4.2 Testing with the Driver launcher
(1) Location of the generated output
(2) Content of the generated output
5. Remote testing
5.1 Preparation for remote testing
(1) Change the input and output paths in the JobMain class to HDFS paths
// 1. Specify the input format and input path
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job, new Path("hdfs://hadoop181:9000/wordcount/wordcount.txt"));
// 4. Specify the output format and output path
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job, new Path("hdfs://hadoop181:9000/wcoutput1"));
(2) Change the input and output paths in the Driver class to HDFS paths
// 1. Specify the input format and input path
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job, new Path("hdfs://hadoop181:9000/wordcount/wordcount.txt"));
// 4. Specify the output format and output path
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job, new Path("hdfs://hadoop181:9000/wcoutput2"));
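Editing the source and rebuilding every time the paths change gets tedious. A variation (my own, not what the code above does) is to read the paths from the program arguments, so the same jar works locally and on the cluster. In JobMain the arguments arrive as the strings parameter of run(); in Driver they are the args of main():

// Variation (assumption): take the input and output paths from the arguments,
// e.g. hadoop jar <jar> cn.learn.bigdata.mapreduce.JobMain <inputPath> <outputPath>
TextInputFormat.addInputPath(job, new Path(strings[0]));
TextOutputFormat.setOutputPath(job, new Path(strings[1]));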
(3) Build the jar and upload it to any path on the server (here it was uploaded to $USER_HOME/).
5.2 Testing the JobMain launcher
(1) Launch command
[hadoop@hadoop181 ~]$ hadoop jar original-mapreduce-core-1.0-SNAPSHOT.jar cn.learn.bigdata.mapreduce.JobMain
(2) Test result
# Check the generated files
[hadoop@hadoop181 ~]$ hdfs dfs -ls -R /wcoutput1
# Display the text file content
[hadoop@hadoop181 ~]$ hdfs dfs -text /wcoutput1/part-r-00000
(3) Result screenshot
5.3 Testing the Driver launcher
(1) Launch command
[hadoop@hadoop181 ~]$ hadoop jar original-mapreduce-core-1.0-SNAPSHOT.jar cn.learn.bigdata.mapreduce.Driver
(2) Test result
# Check the generated files
[hadoop@hadoop181 ~]$ hdfs dfs -ls -R /wcoutput2
# Display the text file content
[hadoop@hadoop181 ~]$ hdfs dfs -text /wcoutput2/part-r-00000
(3) Result screenshot