本文通过实践Java 程序,借助之前搭建的Hadoop 平台,实现单词统计的能力。
1. MR 执行流程
借老外的图,来说明一下:
- Input 为输入
- Output为最终结果输出
- 对Input按大小进行拆分,然后对拆分后的部分进行逻辑处理(这里是统计单词出现次数)
- Map处理后进行Shuffle排序操作
- 如果有Reduce任务,进行Reduce逻辑处理(这里指归类聚合)
- 最后统一输出
(input) <k1, v1> -> map -> <k2, v2> -> combine -> <k2, v2> -> reduce -> <k3, v3> (output)
2. 代码编写
按照MapReduce编程规范,分别编写Mapper,Reducer,Driver。
2.1 编写Mapper
package com.angus.hadoop.wordcount.m;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @Author anguszhu
* @Description 继承Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
* @Date 2022/4/6 下午4:48
*/
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused output key; Hadoop convention is to reuse Writable objects
    // instead of allocating a new one per record.
    private final Text k = new Text();
    /* Common Java type -> Hadoop Writable serialization type mapping:
       Boolean -> BooleanWritable,  Byte   -> ByteWritable,
       Int     -> IntWritable,      Float  -> FloatWritable,
       Long    -> LongWritable,     Double -> DoubleWritable,
       String  -> Text,             Map    -> MapWritable,
       Array   -> ArrayWritable,    Null   -> NullWritable */
    // Reused output value: every occurrence of a word is emitted with count 1.
    private final IntWritable v = new IntWritable(1);

    /**
     * Tokenizes one input line and emits a (word, 1) pair per word.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   the text of one input line
     * @param context MR context used to emit (Text, IntWritable) pairs that
     *                become the Reducer's input (K, V)
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get one line of input.
        String line = value.toString();
        // 2. Split on any run of whitespace. The previous split(" ") produced
        //    empty tokens on consecutive spaces and ignored tabs, which made
        //    empty strings show up as counted "words".
        String[] words = line.split("\\s+");
        // 3. Emit each non-empty word with a count of 1.
        for (String word : words) {
            if (word.isEmpty()) {
                // split("\\s+") yields one leading empty token when the line
                // starts with whitespace; skip it.
                continue;
            }
            k.set(word);
            context.write(k, v);
        }
    }
}
2.2 编写Reducer
package com.angus.hadoop.wordcount.r;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @Author anguszhu
* @Date 2022/4/6 下午6:31
* @Description
*/
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reused output value; Hadoop convention is to reuse Writable objects
    // instead of allocating a new one per reduce() call.
    private final IntWritable v = new IntWritable();

    /**
     * Sums all partial counts for one word and emits (word, total).
     *
     * @param key     the Mapper's output key (a word), the input key here
     * @param values  the Mapper's output values for this key (all the 1s),
     *                the input values here
     * @param context MR context used to emit the final (Text, IntWritable)
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Accumulate the total count. A method-local accumulator replaces
        //    the original mutable instance field: it needs no cross-call reset
        //    and cannot leak state between keys.
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        // 2. Emit the aggregated result.
        v.set(sum);
        context.write(key, v);
    }
}
2.3 编写Driver
package com.angus.hadoop.wordcount.driver;
import com.angus.hadoop.wordcount.m.WordcountMapper;
import com.angus.hadoop.wordcount.r.WordcountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Job driver: wires the WordCount Mapper/Reducer together, configures the
 * key/value types and I/O paths, and submits the job to the cluster.
 *
 * Usage: hadoop jar <jar> com.angus.hadoop.wordcount.driver.WordcountDriver <input> <output>
 */
public class WordcountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordcountDriver <input path> <output path>");
            System.exit(2);
        }
        // 1. Build the configuration and the job object.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // 2. Associate this driver's jar so the cluster can locate the classes.
        job.setJarByClass(WordcountDriver.class);
        // 3. Associate the Mapper and Reducer classes.
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);
        // 4. Declare the Mapper's output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Declare the job's final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths. The output directory must not
        //    already exist, or Hadoop will fail the job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and block until completion; exit 0 on success.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
2.4 Pom 打包jar
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>p4j</artifactId>
<groupId>org.angus</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>p4j-bigdata</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.3.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
<!-- 2.12.0 拉取的 log4j-core 存在 Log4Shell 漏洞(CVE-2021-44228);2.17.1 起已修复 -->
<version>2.17.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
2.5 Hadoop 集群执行
Maven install后打出jar 包:p4j-bigdata-1.0-SNAPSHOT.jar
拷贝到Hadoop 集群上
执行命令:hadoop jar p4j-bigdata-1.0-SNAPSHOT.jar com.angus.hadoop.wordcount.driver.WordcountDriver /input /output
其中:HDFS的目录/input下存放需要分析的文件、/output为结果存储目录
[hadoop@leader ~]$ hadoop jar p4j-bigdata-1.0-SNAPSHOT.jar com.angus.hadoop.wordcount.driver.WordcountDriver /input /output
2022-04-07 15:21:36,284 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at follower2/192.168.56.103:8032
2022-04-07 15:21:36,648 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2022-04-07 15:21:36,661 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1649313174092_0002
2022-04-07 15:21:36,923 INFO input.FileInputFormat: Total input files to process : 1
2022-04-07 15:21:37,010 INFO mapreduce.JobSubmitter: number of splits:1
2022-04-07 15:21:37,160 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1649313174092_0002
2022-04-07 15:21:37,160 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-04-07 15:21:37,331 INFO conf.Configuration: resource-types.xml not found
2022-04-07 15:21:37,331 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2022-04-07 15:21:37,773 INFO impl.YarnClientImpl: Submitted application application_1649313174092_0002
2022-04-07 15:21:37,817 INFO mapreduce.Job: The url to track the job: http://follower2:8088/proxy/application_1649313174092_0002/
2022-04-07 15:21:37,821 INFO mapreduce.Job: Running job: job_1649313174092_0002
2022-04-07 15:21:51,064 INFO mapreduce.Job: Job job_1649313174092_0002 running in uber mode : false
2022-04-07 15:21:51,065 INFO mapreduce.Job: map 0% reduce 0%
2022-04-07 15:21:57,229 INFO mapreduce.Job: map 100% reduce 0%
2022-04-07 15:22:04,365 INFO mapreduce.Job: map 100% reduce 100%
2022-04-07 15:22:04,376 INFO mapreduce.Job: Job job_1649313174092_0002 completed successfully
2022-04-07 15:22:04,496 INFO mapreduce.Job: Counters: 54
File System Counters
FILE: Number of bytes read=624
FILE: Number of bytes written=546411
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=414
HDFS: Number of bytes written=59
HDFS: Number of read operations=8
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
HDFS: Number of bytes read erasure-coded=0
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=5912
Total time spent by all reduces in occupied slots (ms)=9618
Total time spent by all map tasks (ms)=2956
Total time spent by all reduce tasks (ms)=4809
Total vcore-milliseconds taken by all map tasks=2956
Total vcore-milliseconds taken by all reduce tasks=4809
Total megabyte-milliseconds taken by all map tasks=3026944
Total megabyte-milliseconds taken by all reduce tasks=4924416
Map-Reduce Framework
Map input records=26
Map output records=52
Map output bytes=514
Map output materialized bytes=624
Input split bytes=108
Combine input records=0
Combine output records=0
Reduce input groups=7
Reduce shuffle bytes=624
Reduce input records=52
Reduce output records=7
Spilled Records=104
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=169
CPU time spent (ms)=880
Physical memory (bytes) snapshot=257519616
Virtual memory (bytes) snapshot=1978413056
Total committed heap usage (bytes)=137498624
Peak Map Physical memory (bytes)=178524160
Peak Map Virtual memory (bytes)=987774976
Peak Reduce Physical memory (bytes)=78995456
Peak Reduce Virtual memory (bytes)=990638080
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=306
File Output Format Counters
Bytes Written=59
2.6 执行结果
3. 本地向Hadoop 提交任务
3.1 修改Driver
package com.angus.hadoop.wordcount.driver;
import com.angus.hadoop.wordcount.m.WordcountMapper;
import com.angus.hadoop.wordcount.r.WordcountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Job driver variant that submits the WordCount job to a remote YARN cluster
 * directly from the local development machine (instead of running
 * `hadoop jar` on a cluster node).
 *
 * Usage: WordcountDriver <input path> <output path>
 */
public class WordcountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WordcountDriver <input path> <output path>");
            System.exit(2);
        }
        // 1. Build the configuration and the job object.
        Configuration configuration = new Configuration();
        // --- Remote-submission settings ---
        // HDFS NameNode address.
        configuration.set("fs.defaultFS", "hdfs://leader:9820");
        // Run MapReduce on YARN (not the local runner).
        configuration.set("mapreduce.framework.name", "yarn");
        // Allow submission from a different platform (e.g. Windows -> Linux).
        configuration.set("mapreduce.app-submission.cross-platform", "true");
        // YARN ResourceManager host.
        configuration.set("yarn.resourcemanager.hostname", "follower2");
        Job job = Job.getInstance(configuration);
        // 2. Associate this driver's jar so the cluster can locate the classes.
        job.setJarByClass(WordcountDriver.class);
        // For local submission the built jar must be uploaded explicitly.
        // NOTE(review): hard-coded absolute path — consider reading it from a
        // program argument or system property so the code is portable.
        job.setJar("/opt/tech/git/p4j/p4j-bigdata/target/p4j-bigdata-1.0-SNAPSHOT.jar");
        // 3. Associate the Mapper and Reducer classes.
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);
        // 4. Declare the Mapper's output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Declare the job's final output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths. The output directory must not
        //    already exist, or Hadoop will fail the job.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and block until completion; exit 0 on success.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
3.2 修改启动参数
3.3 执行结果
至此完成了一个Demo,手写WordCount MR.