When a MapReduce job needs to read a SequenceFile as input, this is done by configuring the job's input format class. The complete example follows:
package hadoop;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WCseq {
    public static class Map extends Mapper<Text, Text, Text, IntWritable> {
        @Override // note: the key is Text here, not LongWritable as with plain text input
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // debug output: print each record handed to the mapper
            System.out.println("key:" + key + " " + "value:" + value);
            String[] input = value.toString().split(" ");
            for (String s : input) {
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    private static Path inputPath = new Path("/user/root/in-seqf/seq1");
    private static Path outputPath = new Path("out-seqf");

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // parse generic Hadoop options (e.g. -D settings) into conf
        new GenericOptionsParser(conf, args).getRemainingArgs();
        Job job = Job.getInstance(conf, "WCseq");
        job.setJarByClass(WCseq.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // the key setting: read SequenceFile records and hand them to the mapper as Text
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, inputPath);
        // remove any stale output directory so the job does not fail on startup
        FileSystem fs = FileSystem.get(conf);
        fs.delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
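Assuming the class above has been packaged into a jar (the name wcseq.jar below is only illustrative), the job can be submitted in the usual way:

hadoop jar wcseq.jar hadoop.WCseq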
Testing shows that the key each map call receives is the line number (i.e., the key stored in the SequenceFile). The key/value pairs received by the map program were:
key:0 value:hello world
key:1 value:bye world
key:2 value:hello hadoop
key:3 value:bye hadoop
key:4 value:hello world
key:5 value:bye world
key:6 value:hello hadoop
key:7 value:bye hadoop
key:8 value:hello world
key:9 value:bye world
key:10 value:hello hadoop
key:11 value:bye hadoop
key:12 value:hello world
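The keys 0 through 12 are simply the keys stored in the SequenceFile itself: SequenceFileAsTextInputFormat converts both key and value to Text via toString() before handing them to the mapper, which is why the mapper's key parameter is declared as Text. For reference, below is a minimal sketch of a writer that could have produced such an input file; the class name SeqWriter, the record count, and the sample lines are illustrative assumptions, not taken from the original test data.

package hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class SeqWriter {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/user/root/in-seqf/seq1");
        // sample lines; the keys written below are line numbers,
        // matching the map input observed above (assumed data)
        String[] lines = {"hello world", "bye world", "hello hadoop", "bye hadoop"};
        SequenceFile.Writer writer = SequenceFile.createWriter(
                fs, conf, path, IntWritable.class, Text.class);
        try {
            for (int i = 0; i < 13; i++) {
                writer.append(new IntWritable(i), new Text(lines[i % lines.length]));
            }
        } finally {
            writer.close();
        }
    }
}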