By default, when an input file is split, it is divided into InputSplits along HDFS block boundaries, so the number of InputSplits is determined by the file size relative to the block size. Each map task processes one InputSplit, and the map function is invoked once for every record (line) in that split.
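For example, assuming a 128 MB HDFS block size, a 300 MB input file is divided into three InputSplits (roughly 128 MB, 128 MB, and 44 MB), so three map tasks are launched.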
With NLineInputFormat, the InputSplit handled by each map task is no longer derived from block boundaries; instead, splits are cut every N lines, where N is the line count configured for NLineInputFormat. In other words, each InputSplit contains at most N records. As before, the map function is invoked once for every record in the split.
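For example, with N = 3, an input file of 10 lines produces four InputSplits holding 3, 3, 3, and 1 lines respectively, so four map tasks run, and the map function is still called ten times in total.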
Code example:
package com.bigdata.hadoop.mapred;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyNLineInputFormatApp {

    private static final String INPUT_PATH = "hdfs://hadoop1:9000/dir1/hello";
    private static final String OUTPUT_PATH = "hdfs://hadoop1:9000/dir1/out";

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Method 1: put three records into each InputSplit by setting the
        // configuration key directly. This must happen before the Job is
        // created, because the Job copies the Configuration it is given.
        configuration.setInt("mapreduce.input.lineinputformat.linespermap", 3);

        // Job.getInstance replaces the deprecated new Job(...) constructor.
        Job job = Job.getInstance(configuration, MyNLineInputFormatApp.class.getSimpleName());
        // Method 2: the equivalent type-safe helper; it sets the same key on
        // the job's configuration, so it is called after the Job exists.
        // NLineInputFormat.setNumLinesPerSplit(job, 3);

        // Delete any previous output directory so the job does not fail.
        final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), configuration);
        fileSystem.delete(new Path(OUTPUT_PATH), true);

        // Build splits from line counts instead of block boundaries.
        job.setInputFormatClass(NLineInputFormat.class);
        job.setJarByClass(MyNLineInputFormatApp.class);
        FileInputFormat.setInputPaths(job, INPUT_PATH);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        job.waitForCompletion(true);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Emit (word, 1) for every tab-separated token on the line.
            final String line = value.toString();
            final String[] splited = line.split("\t");
            for (int i = 0; i < splited.length; i++) {
                context.write(new Text(splited[i]), new LongWritable(1));
            }
        }
    }

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Sum the per-word counts emitted by the mappers.
            long count = 0L;
            for (LongWritable times : values) {
                count += times.get();
            }
            context.write(key, new LongWritable(count));
        }
    }
}
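As a rough usage sketch (the jar name and sample input below are hypothetical): suppose /dir1/hello holds ten lines of tab-separated words. Packaging the class and submitting it with

hadoop jar nline-app.jar com.bigdata.hadoop.mapred.MyNLineInputFormatApp

schedules four map tasks (3 + 3 + 3 + 1 lines) rather than the single map task such a small file would get under the default block-based splitting; /dir1/out then contains the usual word-count result, one word<TAB>count pair per line.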