I. Introduction
NLineInputFormat is an implementation class of FileInputFormat. In contrast to the KV case (KeyValueTextInputFormat), what NLineInputFormat redefines is the split mechanism.

NLineInputFormat split rule:

With NLineInputFormat, the InputSplit handled by each map task is no longer divided by HDFS block; instead it is divided by the number of lines N configured for NLineInputFormat. That is, number of splits = total input lines / N; if the division leaves a remainder, number of splits = quotient + 1.
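This rule can be sketched as a ceiling division (the class and method names below are hypothetical helpers for illustration, not part of Hadoop):

```java
// Hypothetical helper illustrating the N-line split-count rule:
// splits = totalLines / N, plus one more split if there is a remainder.
public class SplitCount {
    static int numSplits(long totalLines, int linesPerSplit) {
        // Equivalent to ceil(totalLines / linesPerSplit) in integer arithmetic
        return (int) ((totalLines + linesPerSplit - 1) / linesPerSplit);
    }

    public static void main(String[] args) {
        System.out.println(numSplits(10, 4)); // 10 lines, N = 4 -> prints 3
    }
}
```

So a 10-line file with N = 4 yields 3 splits (4 + 4 + 2 lines), matching the three splits shown in the local run log later in this article.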
Example:
Consider the following 4-line input:
Rich learning form
Intelligent learning engine
Learning more convenient
From the real demand for more close to the enterprise
For example, if N is 2, each input split contains two lines and 2 MapTasks are launched. One mapper receives the first two lines:
(0,Rich learning form)
(19,Intelligent learning engine)
and the other mapper receives the last two lines:
(47,Learning more convenient)
(72,From the real demand for more close to the enterprise)
The keys and values here are the same as those TextInputFormat generates: the key is the byte offset of the line's start within the file, and the value is the line itself.
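Those offsets can be reproduced with a quick standalone sketch (plain Java, not Hadoop code; class and method names are made up) by accumulating each line's length plus one byte for the newline:

```java
public class LineOffsets {
    // Returns the byte offset at which each line starts, assuming '\n' line endings
    static long[] offsets(String[] lines) {
        long[] result = new long[lines.length];
        long offset = 0;
        for (int i = 0; i < lines.length; i++) {
            result[i] = offset;
            offset += lines[i].length() + 1; // +1 for the trailing '\n'
        }
        return result;
    }

    public static void main(String[] args) {
        String[] lines = {
            "Rich learning form",
            "Intelligent learning engine",
            "Learning more convenient",
            "From the real demand for more close to the enterprise"
        };
        long[] offs = offsets(lines);
        for (int i = 0; i < lines.length; i++) {
            System.out.println("(" + offs[i] + ", " + lines[i] + ")"); // offsets 0, 19, 47, 72
        }
    }
}
```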
II. Code Walkthrough
Goal: find out how many MapTasks the following data is split into.
java java hello scala java1
baidu alib meituan scala baidu1
alib java scala java alib1
baidu alib meituan scala baidu1
alib java scala java alib1
baidu alib meituan scala baidu1
alib java scala java alib1
alib java scala java alib1
baidu alib meituan scala baidu1
alib java scala java alib
1. Mapper
package com.cjy.mr.nline;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class NLineMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Output key/value objects, i.e. the format the reducer will receive
    Text k = new Text();
    IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the current line of text
        String line = value.toString();
        // Split the line on \t
        String[] words = line.split("\t");
        // Write out each word; the shuffle gathers records with the same key for the reducer
        for (String word : words) {
            k.set(word);
            // e.g. (word, 1), (java, 1)
            context.write(k, v);
        }
    }
}
2. Reducer
package com.cjy.mr.nline;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class NLineReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    int sum;
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Sum the counts for this key
        sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        // 2. Emit the total
        v.set(sum);
        // e.g. (java, 5), (hello, 4)
        context.write(key, v);
    }
}
3. Driver
package com.cjy.mr.nline;

import java.io.IOException;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NLineDriver {

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        // Adjust the input/output paths to your own machine
        args = new String[] { "/Users/chenjunying/Downloads/wd.txt", "/Users/chenjunying/Downloads/out/" };

        // 1. Get the Job object
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 7. Put 4 lines into each InputSplit
        NLineInputFormat.setNumLinesPerSplit(job, 4);
        // 8. Use NLineInputFormat to read the records
        job.setInputFormatClass(NLineInputFormat.class);

        // 2. Set the jar location and attach the mapper and reducer
        job.setJarByClass(NLineDriver.class);
        job.setMapperClass(NLineMapper.class);
        job.setReducerClass(NLineReducer.class);

        // 3. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 4. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6. Submit the job
        job.waitForCompletion(true);
    }
}
4. Local run result: the log below shows the input was divided into three splits (10 lines / 4 lines per split leaves a remainder of 2, so 2 + 1 = 3).
2020-05-11 00:30:56,633 INFO [org.apache.hadoop.mapreduce.lib.input.FileInputFormat] - Total input paths to process : 1
2020-05-11 00:36:31,915 INFO [org.apache.hadoop.mapreduce.JobSubmitter] - number of splits:3
2020-05-11 00:36:32,029 INFO [org.apache.hadoop.mapreduce.JobSubmitter] - Submitting tokens for job: job_local850782342_0001
III. Source Code Reading
// 1. Configuration key for the number of lines per split
public static final String LINES_PER_MAP = "mapreduce.input.lineinputformat.linespermap";

// 2. getSplits is overridden to define the split rule, but the actual work is
//    delegated per file to getSplitsForFile
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList();
    int numLinesPerSplit = getNumLinesPerSplit(job);
    Iterator var4 = this.listStatus(job).iterator();

    while (var4.hasNext()) {
        FileStatus status = (FileStatus) var4.next();
        splits.addAll(getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
    }

    return splits;
}
// 3. The function that actually defines the split rule
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit) throws IOException {
    // 4. Collected splits; the number of elements is the number of splits
    List<FileSplit> splits = new ArrayList();
    // 5. Full path of the file
    Path fileName = status.getPath();
    // 6. Directories are rejected
    if (status.isDirectory()) {
        throw new IOException("Not a file: " + fileName);
    } else {
        FileSystem fs = fileName.getFileSystem(conf);
        LineReader lr = null;

        try {
            // 7. Open an input stream on the file
            FSDataInputStream in = fs.open(fileName);
            lr = new LineReader(in, conf);
            Text line = new Text();
            // 8. Bookkeeping for the current split
            int numLines = 0;    // lines read so far within the current split
            long begin = 0L;
            long length = 0L;    // byte length of the current split
            boolean var14 = true;

            int num;
            // 9. Read the file line by line
            while ((num = lr.readLine(line)) > 0) {
                // 10. One more line read
                ++numLines;
                // 11. Add this line's byte length to the current split
                length += (long) num;
                // 12. Check whether the per-split line limit has been reached
                if (numLines == numLinesPerSplit) {
                    // 13. Limit reached: record a split
                    splits.add(createFileSplit(fileName, begin, length));
                    // 14. Reset the bookkeeping for the next split
                    begin += length;
                    length = 0L;
                    numLines = 0;
                }
            }

            // 15. If the total line count is not a multiple of numLinesPerSplit,
            //     the leftover lines form the last split
            if (numLines != 0) {
                splits.add(createFileSplit(fileName, begin, length));
            }
        } finally {
            if (lr != null) {
                lr.close();
            }
        }

        return splits;
    }
}
// 16. Build a FileSplit covering the recorded begin/length byte range; splits
//     after the first are shifted back one byte to start on the preceding newline
protected static FileSplit createFileSplit(Path fileName, long begin, long length) {
    return begin == 0L ? new FileSplit(fileName, begin, length - 1L, new String[0]) : new FileSplit(fileName, begin - 1L, length, new String[0]);
}
IV. Debug Analysis
1. Run the driver and set the number of lines per split, numLinesPerSplit
2. The line-count setter is called, splitting every 4 lines
3. The configured number of lines per split is read back
4. NLineInputFormat is entered and the split function is called
The debugger screenshot showed numLinesPerSplit = 4.