实现按行数而不是按文件大小来切分文件的方法

  NLineInputFormat继承自FileInputFormat,它实现了按行数而不是按文件大小来切分文件的方法。

        重写了FileInputFormat中的getSplits()和createRecordReader()方法。因为NLineInputFormat是在旧的mapreduce框架下写的,这里写了新框架下的NLineInputFormat,代码如下:

package com.yuankang.hadoop;

 

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileStatus;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.InputSplit;

import org.apache.hadoop.mapreduce.JobContext;

import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.util.LineReader;

import org.apache.hadoop.mapreduce.TaskAttemptContext;

 

public class NLineInputFormat extends FileInputFormat<LongWritable, Text>{ 

  private int N = 1;

  

  @Override

  public RecordReader<LongWritable, Text> 

    createRecordReader(InputSplit split,

                       TaskAttemptContext context) {

        return new LineRecordReader();

  }

 

  /** 

   * Logically splits the set of input files for the job, splits N lines

   * of the input as one split.

   * 

   * @see org.apache.hadoop.mapred.FileInputFormat#getSplits(JobConf, int)

   */

  public List<InputSplit> getSplits(JobContext job

  ) throws IOException{

    List<InputSplit> splits = new ArrayList<InputSplit>();

    for (FileStatus file : listStatus(job)) {

      Path path = file.getPath();

      FileSystem fs = path.getFileSystem(job.getConfiguration());

      

      LineReader lr = null;

      try {

        FSDataInputStream in  = fs.open(path);

        Configuration conf = job.getConfiguration();

        lr = new LineReader(in, conf);

        N = conf.getInt("mapred.line.input.format.linespermap", 1);

        Text line = new Text();

        int numLines = 0;

        long begin = 0;

        long length = 0;

        int num = -1;

        while ((num = lr.readLine(line)) > 0) {

          numLines++;

          length += num;

          if (numLines == N) {

            splits.add(new FileSplit(path, begin, length, new String[]{}));

            begin += length;

            length = 0;

            numLines = 0;

          }

        }

        if (numLines != 0) {

          splits.add(new FileSplit(path, begin, length, new String[]{}));

        }

   

      } finally {

        if (lr != null) {

          lr.close();

        }

      }

    }

    System.out.println("Total # of splits: " + splits.size());

    return splits;

  }

 

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值