本文通过实践Java 程序,借助之前搭建的Hadoop 平台,实现单词统计的能力。
1. MR 执行流程
借老外的图,来说明一下:
- Input 为输入
- Output为最终结果输出
- 对Input按大小进行拆分,然后对拆分后的部分进行逻辑处理(这里是统计单词出现次数)
- Map处理后进行Shuffle排序操作
- 如果有Reduce任务,进行Reduce逻辑处理(这里指归类聚合)
- 最后统一输出
(input) <k1, v1> -> map -> <k2, v2> -> combine -> <k2, v2> -> reduce -> <k3, v3> (output)
2. 代码编写
按照MapReduce编程规范,分别编写Mapper,Reducer,Driver。
2.1 编写Mapper
package com.angus.hadoop.wordcount.m;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @Author anguszhu
* @Description 继承Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
* @Date 2022/4/6 下午4:48
*/
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused output key; Hadoop convention is to reuse Writable objects
    // instead of allocating a new one per record.
    private final Text k = new Text();
    /* Common Java type -> Hadoop Writable serialization type mapping:
       Boolean -> BooleanWritable,  Byte   -> ByteWritable,
       Int     -> IntWritable,      Float  -> FloatWritable,
       Long    -> LongWritable,     Double -> DoubleWritable,
       String  -> Text,             Map    -> MapWritable,
       Array   -> ArrayWritable,    Null   -> NullWritable */
    // Reused output value: every occurrence of a word is emitted with count 1.
    private final IntWritable v = new IntWritable(1);

    /**
     * Tokenizes one input line and emits a (word, 1) pair per word.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   the text of one input line
     * @param context MR context used to emit (Text, IntWritable) pairs that
     *                become the Reducer's input (K, V)
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get one line of input.
        String line = value.toString();
        // 2. Split on any run of whitespace. The previous split(" ") produced
        //    empty tokens on consecutive spaces and ignored tabs, which made
        //    empty strings show up as counted "words".
        String[] words = line.split("\\s+");
        // 3. Emit each non-empty word with a count of 1.
        for (String word : words) {
            if (word.isEmpty()) {
                // split("\\s+") yields one leading empty token when the line
                // starts with whitespace; skip it.
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}
2.2 编写Reducer
package com.angus.hadoop.wordcount.r;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @Author anguszhu
* @Date 2022/4/6 下午6:31
* @Description
*/
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reused output value; Hadoop convention is to reuse Writable objects
    // instead of allocating a new one per reduce() call.
    private final IntWritable v = new IntWritable();

    /**
     * Sums all partial counts for one word and emits (word, total).
     *
     * @param key     the Mapper's output key (a word), the input key here
     * @param values  the Mapper's output values for this key (all the 1s),
     *                the input values here
     * @param context MR context used to emit the final (Text, IntWritable)
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Accumulate the total count. A method-local accumulator replaces
        //    the original mutable instance field: it needs no cross-call reset
        //    and cannot leak state between keys.
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        // 2. Emit the aggregated result.
        v.set(sum);
        context.write(key, v);
    }
}
2.3 编写Driver
package com.angus.hadoop.wordcount.driver;
import com.angus.hadoop.wordcount.m.WordcountMapper;
import com.angus.hadoop.wordcount.r.WordcountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Job driver: wires the WordCount Mapper/Reducer together, configures the
 * key/value types and I/O paths, and submits the job to the cluster.
 *
 * Usage: hadoop jar <jar> com.angus.hadoop.wordcount.driver.WordcountDriver <input> <output>
 */
public class WordcountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordcountDriver <input path> <output path>");
            System.exit(2);
        }
        // 1. Build the configuration and the job object.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // 2. Associate this driver's jar so the cluster can locate the classes.
        job.setJarByClass(WordcountDriver.class);
        // 3. Associate the Mapper and Reducer classes.
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);
        // 4. Declare the Mapper's output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Declare the job's final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths. The output directory must not
        //    already exist, or Hadoop will fail the job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and block until completion; exit 0 on success.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
2.4 Pom 打包jar
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>p4j</artifactId>
<groupId>org.angus</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>p4j-bigdata</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.3.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<!-- 2.12.0 拉取的 log4j-core 存在 Log4Shell 漏洞(CVE-2021-44228);2.17.1 起已修复 -->
<version>2.17.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
2.5 Hadoop 集群执行
Maven install后打出jar 包:p4j-bigdata-1.0-SNAPSHOT.jar
拷贝到Hadoop 集群上
执行命令:hadoop jar p4j-bigdata-1.0-SNAPSHOT.jar com.angus.hadoop.wordcount.driver.WordcountDriver /input /output
其中:HDFS的目录/input下存放需要分析的文件、/output为结果存储目录
[hadoop@leader ~]$ hadoop jar p4j-bigdata-1.0-SNAPSHOT.jar com.angus.hadoop.wordcount.driver.WordcountDriver /input /output
2022-04-07 15:21:36,284 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at follower2/192.168.56.103:8032
2022-04-07 15:21:36,648 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2022-04-07 15:21:36,661 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1649313174092_0002
2022-04-07 15:21:36,923 INFO input.FileInputFormat: Total input files to process : 1
2022-04-07 15:21:37,010 INFO mapreduce.JobSubmitter: number of splits:1
2022-04-07 15:21:37,160 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1649313174092_0002
2022-04-07 15:21:37,160 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-04-07 15:21:37,331 INFO conf.Configuration: resource-types.xml not found
2022-04-07 15:21:37,331 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2022-04-07 15:21:37,773 INFO impl.YarnClientImpl: Submitted application application_1649313174092_0002
2022-04-07 15:21:37,817 INFO mapreduce.Job: The url to track the job: http://follower2:8088/proxy/application_1649313174092_0002/
2022-04-07 15:21:37,821 INFO mapreduce.Job: Running job: job_1649313174092_0002
2022-04-07 15:21:51,064 INFO mapreduce.Job: Job job_1649313174092_0002 running in uber mode : false
2022-04-07 15:21:51,065 INFO mapreduce.Job: map 0% reduce 0%
2022-04-07 15:21:57,229 INFO mapreduce.Job: map 100% reduce 0%
2022-04-07 15:22:04,365 INFO mapreduce.Job: map 100% reduce 100%
2022-04-07 15:22:04,376 INFO mapreduce.Job: Job job_1649313174092_0002 completed successfully
2022-04-07 15:22:04,496 INFO mapreduce.Job: Counters: 54
File System Counters
FILE: Number of bytes read=624
FILE: Number of bytes written=546411
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=414
HDFS: Number of bytes written=59
HDFS: Number of read operations=8
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
HDFS: Number of bytes read erasure-coded=0
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=5912
Total time spent by all reduces in occupied slots (ms)=9618
Total time spent by all map tasks (ms)=2956
Total time spent by all reduce tasks (ms)=4809
Total vcore-milliseconds taken by all map tasks=2956
Total vcore-milliseconds taken by all reduce tasks=4809
Total megabyte-milliseconds taken by all map tasks=3026944
Total megabyte-milliseconds taken by all reduce tasks=4924416
Map-Reduce Framework
Map input records=26
Map output records=52
Map output bytes=514
Map output materialized bytes=624
Input split bytes=108
Combine input records=0
Combine output records=0
Reduce input groups=7
Reduce shuffle bytes=624
Reduce input records=52
Reduce output records=7
Spilled Records=104
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=169
CPU time spent (ms)=880
Physical memory (bytes) snapshot=257519616
Virtual memory (bytes) snapshot=1978413056
Total committed heap usage (bytes)=137498624
Peak Map Physical memory (bytes)=178524160
Peak Map Virtual memory (bytes)=987774976
Peak Reduce Physical memory (bytes)=78995456
Peak Reduce Virtual memory (bytes)=990638080
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=306
File Output Format Counters
Bytes Written=59
2.6 执行结果
3. 本地向Hadoop 提交任务
3.1 修改Driver
package com.angus.hadoop.wordcount.driver;
import com.angus.hadoop.wordcount.m.WordcountMapper;
import com.angus.hadoop.wordcount.r.WordcountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Job driver variant that submits the WordCount job to a remote YARN cluster
 * directly from the local development machine (instead of running
 * `hadoop jar` on a cluster node).
 *
 * Usage: WordcountDriver <input path> <output path>
 */
public class WordcountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordcountDriver <input path> <output path>");
            System.exit(2);
        }
        // 1. Build the configuration and the job object.
        Configuration configuration = new Configuration();
        // --- Remote-submission settings ---
        // HDFS NameNode address.
        configuration.set("fs.defaultFS", "hdfs://leader:9820");
        // Run MapReduce on YARN (not the local runner).
        configuration.set("mapreduce.framework.name", "yarn");
        // Allow submission from a different platform (e.g. Windows -> Linux).
        configuration.set("mapreduce.app-submission.cross-platform", "true");
        // YARN ResourceManager host.
        configuration.set("yarn.resourcemanager.hostname", "follower2");
        Job job = Job.getInstance(configuration);
        // 2. Associate this driver's jar so the cluster can locate the classes.
        job.setJarByClass(WordcountDriver.class);
        // For local submission the built jar must be uploaded explicitly.
        // NOTE(review): hard-coded absolute path — consider reading it from a
        // program argument or system property so the code is portable.
        job.setJar("/opt/tech/git/p4j/p4j-bigdata/target/p4j-bigdata-1.0-SNAPSHOT.jar");
        // 3. Associate the Mapper and Reducer classes.
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);
        // 4. Declare the Mapper's output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Declare the job's final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths. The output directory must not
        //    already exist, or Hadoop will fail the job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and block until completion; exit 0 on success.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
3.2 修改启动参数
3.3 执行结果
至此完成了一个Demo,手写WordCount MR.