Avro works very well with Hadoop. A common development scenario is that the input files are non-Avro while the output files should be Avro, which calls for a non-Avro mapper paired with an Avro reducer. The following demonstrates this by rewriting the classic WordCount example.
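Concretely, the reducer will write Avro records that follow a Pair<string, int> schema, the same schema the driver below passes to AvroJob.setOutputSchema. A minimal sketch that just prints this schema as JSON (the PrintOutputSchema class name is only illustrative), handy for seeing what the output files will contain:

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.mapred.Pair;

public class PrintOutputSchema {
    public static void main(String[] args) {
        // Same Pair<string, int> schema that the driver configures for the job output.
        Schema schema = Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT));
        System.out.println(schema.toString(true)); // pretty-printed JSON schema
    }
}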
[b]Mapper[/b]
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Plain (non-Avro) mapper: reads text lines and emits Avro-wrapped (word, 1) pairs.
public class WordCountMapper extends MapReduceBase implements
        Mapper<LongWritable, Text, AvroKey<CharSequence>, AvroValue<Integer>> {

    private Text word = new Text();
    private static final AvroValue<Integer> one = new AvroValue<Integer>(1);

    public void map(LongWritable key, Text value,
            OutputCollector<AvroKey<CharSequence>, AvroValue<Integer>> output,
            Reporter reporter) throws IOException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            output.collect(new AvroKey<CharSequence>(word.toString()), one);
        }
    }
}
[b]Reducer[/b]
import java.io.IOException;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.mapred.Reporter;

// Avro reducer: sums the counts for each word and emits an Avro Pair<string, int> record.
public class WordCountReducer extends
        AvroReducer<CharSequence, Integer, Pair<CharSequence, Integer>> {

    @Override
    public void reduce(CharSequence key, Iterable<Integer> values,
            AvroCollector<Pair<CharSequence, Integer>> collector,
            Reporter reporter) throws IOException {
        int sum = 0;
        for (Integer cnt : values) {
            sum += cnt;
        }
        collector.collect(new Pair<CharSequence, Integer>(key.toString(), sum));
    }
}
[b]Driver[/b]
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AvroWordCount extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new AvroWordCount(), args);
        System.exit(exitCode);
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.printf("Usage %s [generic options] <in> <out>\n", getClass().getName());
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }

        JobConf conf = new JobConf(AvroWordCount.class);
        conf.setJobName("wordcount");
        conf.set("fs.default.name", "hdfs://node04vm01:9000"); // cluster-specific NameNode address

        // Output (and, by default, intermediate) schema: an Avro Pair<string, int>.
        AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING),
                Schema.create(Type.INT)));

        // The non-Avro mapper is set directly on the JobConf; the Avro reducer goes through AvroJob.
        conf.setMapperClass(WordCountMapper.class);
        AvroJob.setReducerClass(conf, WordCountReducer.class);
        conf.setInputFormat(TextInputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
        return 0;
    }
}
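Assuming the three classes are packaged into a jar (the file name below is only illustrative) and the Avro jars (avro, avro-mapred) are available on the task classpath, for example via the -libjars generic option, the job can be submitted in the usual way:

hadoop jar avro-wordcount.jar AvroWordCount input output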
After the job completes, the Avro output file can be converted to JSON with avro-tools to verify the result:
java -jar avro/avro-tools-1.7.5.jar tojson part-00000.avro
{"key":"But","value":3}
{"key":"By","value":2}
{"key":"Dashwood","value":12}
...
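Besides avro-tools, the result can also be read programmatically with Avro's file APIs. A minimal sketch (the DumpAvroOutput class name is only illustrative), assuming part-00000.avro has been copied to the local file system:

import java.io.File;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class DumpAvroOutput {
    public static void main(String[] args) throws Exception {
        // Open the Avro container file written by the reducer.
        DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(
                new File("part-00000.avro"), new GenericDatumReader<GenericRecord>());
        try {
            for (GenericRecord record : reader) {
                // Each record is a Pair with "key" (word) and "value" (count) fields.
                System.out.println(record.get("key") + "\t" + record.get("value"));
            }
        } finally {
            reader.close();
        }
    }
}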