Big Data Series (4): Writing a MapReduce Program by Hand

This article walks through a hands-on Java program that, running on the Hadoop cluster built earlier in this series, implements word counting.

1. MR Execution Flow

Borrowing the classic diagram to illustrate the steps:

  • Input is the raw input data
  • Output is the final result
  • The Input is split by size, and each split is processed by the map logic (here, counting word occurrences)
  • After the map step, the intermediate output goes through the Shuffle (sort) step
  • If there is a Reduce task, the reduce logic runs (here, grouping and aggregation)
  • Finally, the results are written out

(input) <k1, v1> -> map -> <k2, v2> -> combine -> <k2, v2> -> reduce -> <k3, v3> (output)
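For a concrete trace, take the single input line "hello world hello": map emits <hello, 1>, <world, 1>, <hello, 1>; the shuffle groups these into <hello, [1, 1]> and <world, [1]>; reduce sums each group and outputs <hello, 2> and <world, 1>.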

2. Writing the Code

Following the MapReduce programming conventions, we write the Mapper, the Reducer, and the Driver separately.

2.1 Writing the Mapper

package com.angus.hadoop.wordcount.m;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @Author anguszhu
 * @Description  Extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
 * @Date  2022/4/6 16:48
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // output key (the word, as Hadoop Text)
    Text k = new Text();
    /* Common data serialization types:
       Java type    Hadoop Writable type
       Boolean      BooleanWritable
       Byte         ByteWritable
       Int          IntWritable
       Float        FloatWritable
       Long         LongWritable
       Double       DoubleWritable
       String       Text
       Map          MapWritable
       Array        ArrayWritable
       Null         NullWritable */
    // output value: IntWritable is Hadoop's serialization type for int; every word counts as 1
    IntWritable v = new IntWritable(1);

    /**
      * @Author anguszhu
      * @Description Processes one input line: splits it on spaces and emits <word, 1> pairs
      * @Date  2022/4/6 18:22
      * @param key the byte offset of the line within the input split
      * @param value the content of the line
      * @param context the task context used to emit output
      * @Return void
      */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 Read one line of input
        String line = value.toString();
        // 2 Split it on spaces
        String[] words = line.split(" ");
        // 3 Emit each word as the key (Text) with a count of 1 (IntWritable); these pairs become the reduce-side input (K, V)
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}
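Note that k and v are instance fields reused across map() calls rather than allocated per record: context.write serializes their current contents immediately, so reusing Writable objects is safe and avoids creating garbage for every word.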

2.2 Writing the Reducer

package com.angus.hadoop.wordcount.r;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @Author anguszhu
 * @Date 2022/4/6 18:31
 * @Description Sums the per-word counts emitted by the Mapper
 */
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // running sum of counts for the current word
    int sum;
    // output value type
    IntWritable v = new IntWritable();

    /**
      * @Author anguszhu
      * @Date  2022/4/6 18:33
      * @param key the Mapper's output key, which is the input key here
      * @param values the Mapper's output values, grouped under this key
      * @param context the task context used to emit output
      * @Return void
      * @Description Aggregates the mapped data according to the counting logic, then writes it out
      */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        // 1 Sum the counts
        sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }

        // 2 Emit the result
        v.set(sum);
        context.write(key, v);
    }
}
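Two details are worth calling out: sum must be reset to 0 at the top of every reduce() call, because the same Reducer instance handles many keys in turn; and Hadoop reuses the object returned by the values iterator, so read each count with get() inside the loop (as above) rather than holding references to the IntWritable objects.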

2.3 Writing the Driver

package com.angus.hadoop.wordcount.driver;

import com.angus.hadoop.wordcount.m.WordcountMapper;
import com.angus.hadoop.wordcount.r.WordcountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordcountDriver {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		// 1 Get the configuration and create the Job object
		Configuration configuration = new Configuration();
		Job job = Job.getInstance(configuration);

		// 2 Associate the jar that contains this Driver
		job.setJarByClass(WordcountDriver.class);

		// 3 Set the Mapper and Reducer classes
		job.setMapperClass(WordcountMapper.class);

		job.setReducerClass(WordcountReducer.class);

		// 4 Set the Mapper output key/value types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		// 5 Set the final output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		// 6 Set the input and output paths
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// 7 Submit the job and wait for completion
		boolean result = job.waitForCompletion(true);
		System.exit(result ? 0 : 1);
	}
}
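As an optional improvement (not part of the original driver), the same WordcountReducer can also run map-side as a combiner, the combine step in the dataflow of section 1, because summing counts is associative and commutative. A one-line sketch, added anywhere before step 7:

// Optional: reuse the reducer as a map-side combiner to shrink shuffle traffic.
// Safe here because word-count aggregation is associative and commutative.
job.setCombinerClass(WordcountReducer.class);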

2.4 POM for Packaging the Jar

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>p4j</artifactId>
        <groupId>org.angus</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>p4j-bigdata</artifactId>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.3.1</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-slf4j-impl</artifactId>
            <version>2.12.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
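With this setup, mvn package produces two artifacts: the thin p4j-bigdata-1.0-SNAPSHOT.jar and a fat p4j-bigdata-1.0-SNAPSHOT-jar-with-dependencies.jar built by the assembly plugin. For hadoop jar on the cluster the thin jar is sufficient, since the cluster already provides the Hadoop classes; the fat jar is mainly useful when the classpath is not guaranteed.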

2.5 Running on the Hadoop Cluster

After mvn install, the build produces the jar: p4j-bigdata-1.0-SNAPSHOT.jar

Copy it to the Hadoop cluster.

Run the command: hadoop jar p4j-bigdata-1.0-SNAPSHOT.jar com.angus.hadoop.wordcount.driver.WordcountDriver /input /output

Here the HDFS directory /input holds the files to be analyzed (upload them beforehand, e.g. with hdfs dfs -put), and /output is the result directory; it must not exist before the job runs, or FileOutputFormat will abort the job.

[hadoop@leader ~]$ hadoop jar p4j-bigdata-1.0-SNAPSHOT.jar com.angus.hadoop.wordcount.driver.WordcountDriver /input /output
2022-04-07 15:21:36,284 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at follower2/192.168.56.103:8032
2022-04-07 15:21:36,648 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2022-04-07 15:21:36,661 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop/.staging/job_1649313174092_0002
2022-04-07 15:21:36,923 INFO input.FileInputFormat: Total input files to process : 1
2022-04-07 15:21:37,010 INFO mapreduce.JobSubmitter: number of splits:1
2022-04-07 15:21:37,160 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1649313174092_0002
2022-04-07 15:21:37,160 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-04-07 15:21:37,331 INFO conf.Configuration: resource-types.xml not found
2022-04-07 15:21:37,331 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2022-04-07 15:21:37,773 INFO impl.YarnClientImpl: Submitted application application_1649313174092_0002
2022-04-07 15:21:37,817 INFO mapreduce.Job: The url to track the job: http://follower2:8088/proxy/application_1649313174092_0002/
2022-04-07 15:21:37,821 INFO mapreduce.Job: Running job: job_1649313174092_0002
2022-04-07 15:21:51,064 INFO mapreduce.Job: Job job_1649313174092_0002 running in uber mode : false
2022-04-07 15:21:51,065 INFO mapreduce.Job:  map 0% reduce 0%
2022-04-07 15:21:57,229 INFO mapreduce.Job:  map 100% reduce 0%
2022-04-07 15:22:04,365 INFO mapreduce.Job:  map 100% reduce 100%
2022-04-07 15:22:04,376 INFO mapreduce.Job: Job job_1649313174092_0002 completed successfully
2022-04-07 15:22:04,496 INFO mapreduce.Job: Counters: 54
	File System Counters
		FILE: Number of bytes read=624
		FILE: Number of bytes written=546411
		FILE: Number of read operations=0
		FILE: Number of large read operations=0
		FILE: Number of write operations=0
		HDFS: Number of bytes read=414
		HDFS: Number of bytes written=59
		HDFS: Number of read operations=8
		HDFS: Number of large read operations=0
		HDFS: Number of write operations=2
		HDFS: Number of bytes read erasure-coded=0
	Job Counters
		Launched map tasks=1
		Launched reduce tasks=1
		Data-local map tasks=1
		Total time spent by all maps in occupied slots (ms)=5912
		Total time spent by all reduces in occupied slots (ms)=9618
		Total time spent by all map tasks (ms)=2956
		Total time spent by all reduce tasks (ms)=4809
		Total vcore-milliseconds taken by all map tasks=2956
		Total vcore-milliseconds taken by all reduce tasks=4809
		Total megabyte-milliseconds taken by all map tasks=3026944
		Total megabyte-milliseconds taken by all reduce tasks=4924416
	Map-Reduce Framework
		Map input records=26
		Map output records=52
		Map output bytes=514
		Map output materialized bytes=624
		Input split bytes=108
		Combine input records=0
		Combine output records=0
		Reduce input groups=7
		Reduce shuffle bytes=624
		Reduce input records=52
		Reduce output records=7
		Spilled Records=104
		Shuffled Maps =1
		Failed Shuffles=0
		Merged Map outputs=1
		GC time elapsed (ms)=169
		CPU time spent (ms)=880
		Physical memory (bytes) snapshot=257519616
		Virtual memory (bytes) snapshot=1978413056
		Total committed heap usage (bytes)=137498624
		Peak Map Physical memory (bytes)=178524160
		Peak Map Virtual memory (bytes)=987774976
		Peak Reduce Physical memory (bytes)=78995456
		Peak Reduce Virtual memory (bytes)=990638080
	Shuffle Errors
		BAD_ID=0
		CONNECTION=0
		IO_ERROR=0
		WRONG_LENGTH=0
		WRONG_MAP=0
		WRONG_REDUCE=0
	File Input Format Counters
		Bytes Read=306
	File Output Format Counters
		Bytes Written=59
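A quick reading of the counters: Map input records=26 means the input file had 26 lines; Map output records=52 means the map emitted 52 <word, 1> pairs; Reduce input groups=7 and Reduce output records=7 mean the file contained 7 distinct words. Combine input records=0 confirms that no combiner was configured for this run.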

2.6 Results
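Since a single reduce task ran, the counts land in one file, /output/part-r-00000, which can be viewed with hdfs dfs -cat /output/part-r-00000.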

3. Submitting the Job to Hadoop from a Local Machine

3.1 Modifying the Driver

package com.angus.hadoop.wordcount.driver;

import com.angus.hadoop.wordcount.m.WordcountMapper;
import com.angus.hadoop.wordcount.r.WordcountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordcountDriver {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		// 1 Get the configuration and create the Job object
		Configuration configuration = new Configuration();

		// Submit to the remote cluster:
		// set the HDFS NameNode address
		configuration.set("fs.defaultFS", "hdfs://leader:9820");
		// run MapReduce on YARN
		configuration.set("mapreduce.framework.name","yarn");
		// allow submission from a different platform (e.g. a local workstation)
		configuration.set("mapreduce.app-submission.cross-platform","true");
		// set the YARN ResourceManager host
		configuration.set("yarn.resourcemanager.hostname","follower2");



		Job job = Job.getInstance(configuration);

		// 2 Associate the jar that contains this Driver
		job.setJarByClass(WordcountDriver.class);
		// point the job at the locally built jar so it can be shipped to the cluster
		job.setJar("/opt/tech/git/p4j/p4j-bigdata/target/p4j-bigdata-1.0-SNAPSHOT.jar");

		// 3 Set the Mapper and Reducer classes
		job.setMapperClass(WordcountMapper.class);

		job.setReducerClass(WordcountReducer.class);

		// 4 Set the Mapper output key/value types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		// 5 Set the final output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		// 6 Set the input and output paths
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// 7 Submit the job and wait for completion
		boolean result = job.waitForCompletion(true);
		System.exit(result ? 0 : 1);
	}
}
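A common stumbling block when submitting from a local machine is an HDFS permission error, which appears when the local OS user name differs from the cluster account. A minimal fix, assuming the cluster account is hadoop as in section 2.5, is to set the Hadoop user before any Hadoop code runs:

// Assumption: the cluster account is "hadoop"; adjust to your environment.
// Must execute before the Configuration/FileSystem classes are first used.
System.setProperty("HADOOP_USER_NAME", "hadoop");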

3.2 Modifying the Launch Arguments
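In the IDE run configuration, pass the input and output paths as program arguments, for example /input and a fresh directory such as /output2 (the output path must not already exist). Because fs.defaultFS now points at the cluster, these paths resolve on HDFS.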

3.3 Results

This completes the demo: a hand-written WordCount MapReduce job.
