In Hadoop, a single MapReduce job can consume input files in several different formats. The MultipleInputs class binds a separate InputFormat and Mapper to each input path; the example below reads plain-text and SequenceFile (binary) inputs in one job.
Mapper
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Mapper for SequenceFile input: records are <IntWritable, Text> pairs.
// The IntWritable key is ignored; each Text value is split into words.
public class WCSeqMapper extends Mapper<IntWritable, Text, Text, IntWritable> {
    @Override
    protected void map(IntWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Mapper for plain-text input: the key is the byte offset of the line
// (LongWritable) and the value is the line itself, split into words.
public class WCTextMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}
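
WCSeqMapper expects the SequenceFile records to be <IntWritable, Text> pairs, matching its declared input types. If you need sample binary input to test with, a minimal sketch along these lines can generate one; the class name, output path, and record contents here are placeholders, not part of the original example:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Writes a small <IntWritable, Text> SequenceFile that WCSeqMapper can consume.
public class SeqFileWriterDemo {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        Path path = new Path("file:///F:/hadoop/mr/seq/words.seq"); // placeholder path
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(Text.class))) {
            writer.append(new IntWritable(1), new Text("hello world"));
            writer.append(new IntWritable(2), new Text("hello hadoop"));
        }
    }
}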
Reducer
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the counts emitted by both mappers for each word.
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count += iw.get();
        }
        // Emit the total count for this word.
        context.write(key, new IntWritable(count));
    }
}
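
Because WCReducer only sums values, it is also safe to register as a combiner, which pre-aggregates counts on the map side and shrinks the shuffle. This is optional and not part of the original driver; if desired, add one line to main():

// Optional: pre-aggregate counts before the shuffle (valid because summing is associative).
job.setCombinerClass(WCReducer.class);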
Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Multiple-input word count:
 * - plain-text input handled by WCTextMapper
 * - SequenceFile (binary) input handled by WCSeqMapper
 */
public class WCAppMulti {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///"); // run against the local file system
        Job job = Job.getInstance(conf);

        // Configure job properties
        job.setJobName("WCAppMulti");          // job name
        job.setJarByClass(WCAppMulti.class);   // class used to locate the jar

        // Bind an InputFormat and a Mapper to each input path. MultipleInputs
        // installs DelegatingInputFormat internally, so there is no need to
        // call job.setInputFormatClass() here.
        MultipleInputs.addInputPath(job, new Path("file:///F:/hadoop/mr/txt"), TextInputFormat.class, WCTextMapper.class);
        MultipleInputs.addInputPath(job, new Path("file:///F:/hadoop/mr/seq"), SequenceFileInputFormat.class, WCSeqMapper.class);

        // Output path comes from the command line.
        FileOutputFormat.setOutputPath(job, new Path(args[0]));

        // Split size bounds in bytes (the tiny maximum here just forces
        // multiple splits for testing; it is not a split count).
        FileInputFormat.setMaxInputSplitSize(job, 10);
        FileInputFormat.setMinInputSplitSize(job, 1);

        job.setReducerClass(WCReducer.class);  // reducer class
        job.setNumReduceTasks(3);              // number of reducers

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
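
Since fs.defaultFS is set to file:///, the job runs against the local file system and can be tested without a cluster. A typical invocation might look like the following (the jar name is a placeholder; the single argument is the output directory, which must not already exist):

hadoop jar wc.jar WCAppMulti file:///F:/hadoop/mr/out

With setNumReduceTasks(3), the output directory will contain three part files (part-r-00000 through part-r-00002), each holding a disjoint share of the word counts.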