Hadoop_Day05
WordCount code
package org.hadoop.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WCDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCDriver.class);
        job.setJobName("xljWC");
        job.setNumReduceTasks(1);
        // Mapper and its intermediate output key/value types
        job.setMapperClass(WCMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Reducer and the job's final output key/value types
        job.setReducerClass(WCReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        Path path1 = new Path(args[0]);
        Path path2 = new Path(args[1]);
        // Delete the output directory if it already exists; otherwise the job would fail
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(path2)) {
            fileSystem.delete(path2, true);
        }
        FileInputFormat.addInputPath(job, path1);
        FileOutputFormat.setOutputPath(job, path2);
        job.waitForCompletion(true);
    }
}
package org.hadoop.wordcount;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WCMap extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // key is the byte offset of the line within the split; value is the line itself.
        // Split the comma-separated line into words and emit (word, 1) for each word.
        String s = value.toString();
        String[] sArr = s.split(",");
        for (String s1 : sArr) {
            context.write(new Text(s1), new LongWritable(1L));
        }
    }
}
package org.hadoop.wordcount;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WCReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Sum all the 1s emitted for this word and write the total count
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}
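As a worked example (the input contents are made up for illustration): if the input file holds the single line "hadoop,spark,hadoop", the mapper emits (hadoop,1), (spark,1), (hadoop,1); after the shuffle the reducer receives hadoop -> [1,1] and spark -> [1], so the final output is:
hadoop	2
spark	1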
Using a simplified WordCount to view the byte offsets
package org.hadoop.wordcount2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WCDriver1 {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCDriver1.class);
        job.setJobName("xljWC1");
        job.setNumReduceTasks(1);
        // Mapper and its intermediate output key/value types
        job.setMapperClass(WCMap1.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Reducer and the job's final output key/value types
        job.setReducerClass(WCReduce1.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        Path path1 = new Path(args[0]);
        Path path2 = new Path(args[1]);
        // Delete the output directory if it already exists; otherwise the job would fail
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(path2)) {
            fileSystem.delete(path2, true);
        }
        FileInputFormat.addInputPath(job, path1);
        FileOutputFormat.setOutputPath(job, path2);
        job.waitForCompletion(true);
    }
}
package org.hadoop.wordcount2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WCMap1 extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // key is the byte offset of the current line within the split;
        // echo it together with the line content so the offsets can be inspected in the output
        context.write(new Text("offset " + key + "\tcontent " + value), NullWritable.get());
    }
}
package org.hadoop.wordcount2;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WCReduce1 extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Nothing to aggregate: pass each "offset + content" line through unchanged
        context.write(key, NullWritable.get());
    }
}
The byte offsets differ between CRLF (the Windows default line ending, \r\n) and LF (the Linux default, \n), because each CRLF line terminator takes two bytes while LF takes only one.
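A minimal standalone sketch (plain Java, not MapReduce; the file contents are made up) that mimics how the line-start byte offsets handed to the mapper are computed, showing the difference between the two line endings:
// Print the byte offset at which each line starts, for the same two lines
// terminated with CRLF vs. LF.
public class OffsetDemo {
    public static void main(String[] args) {
        printLineStartOffsets("hello\r\nworld\r\n", "CRLF"); // lines start at 0 and 7
        printLineStartOffsets("hello\nworld\n", "LF");       // lines start at 0 and 6
    }

    static void printLineStartOffsets(String data, String label) {
        byte[] bytes = data.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        System.out.println(label + ": line starts at offset 0");
        for (int i = 0; i < bytes.length - 1; i++) {
            if (bytes[i] == '\n') { // a new line begins right after every '\n'
                System.out.println(label + ": line starts at offset " + (i + 1));
            }
        }
    }
}
With CRLF the second line starts at offset 7 ("hello" is 5 bytes plus \r\n), with LF at offset 6, which is exactly the difference WCMap1's output makes visible.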
How YARN works
- The MapReduce program is submitted from the node where the client runs. YarnRunner requests an Application from the ResourceManager, which returns the application's resource staging path and an application_id to YarnRunner. The client uploads the resources the job needs to HDFS and, once the upload is complete, requests that the MRAppMaster (MapReduce ApplicationMaster) be run (a driver-side submission sketch follows this list).
- The ResourceManager turns the user's request into a task. One NodeManager picks up the task, creates a Container, and starts the MRAppMaster in it; the Container copies the job resources from HDFS to the local node. The MRAppMaster then asks the ResourceManager for resources to run the map tasks, and the RM assigns them to two other NodeManagers, each of which picks up its task and creates a Container. The MRAppMaster sends the program start-up script to these two NodeManagers, which launch their map tasks; each map task partitions and sorts its output.
- Once all map tasks have finished, the MRAppMaster requests Containers from the ResourceManager to run the reduce tasks; each reduce task fetches its partition of the data from the map tasks. When the job completes, the MRAppMaster asks the ResourceManager to deregister it.
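The drivers above use whichever framework the local configuration selects. As a rough illustration of the client side of the submission flow described above, here is a minimal sketch; the class name WCYarnDriver and the ResourceManager hostname are made up, and in practice mapreduce.framework.name and yarn.resourcemanager.hostname are set in mapred-site.xml and yarn-site.xml rather than in code:
package org.hadoop.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class WCYarnDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Route the submission through YarnRunner instead of the local job runner
        conf.set("mapreduce.framework.name", "yarn");
        // Hypothetical ResourceManager host; normally configured in yarn-site.xml, not in code
        conf.set("yarn.resourcemanager.hostname", "hadoop01");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCYarnDriver.class);
        // ... same mapper/reducer/input/output setup as in WCDriver ...
        // waitForCompletion triggers the flow above: YarnRunner obtains an application_id,
        // the job resources are uploaded to HDFS, and the MRAppMaster then drives the
        // map and reduce containers until the job finishes.
        job.waitForCompletion(true);
    }
}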