hadoop的inputformat和outputformat
最好的例子vertica :虽然是在pig中实现的udf,但是就是hadoop的inputformat和outputformat,在hive里也可以照用,贴个下载的地址:http://blackproof.iteye.com/blog/1791995
再贴一个项目中,在实现hadoop join时,用的inputformat和outputformat的简单实例:
hadoop join在http://blackproof.iteye.com/blog/1757530
自定义inputformat(泛型是mapper的input key/value类型)
/**
 * Custom InputFormat for a reduce-side join: parses comma-separated lines
 * into a composite ({@link MultiKey}, {@link Employee}) pair so that records
 * from both join sides group on the same (departId, departNo) key.
 *
 * Expected line layout: {@code flag,name,departId,departNo} where flag 1
 * marks an employee record and any other value marks a department record.
 */
public class MyInputFormat extends FileInputFormat<MultiKey, Employee> {

    public MyInputFormat() {}

    @Override
    public RecordReader<MultiKey, Employee> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException,
            InterruptedException {
        return new MyRecordReader();
    }

    /**
     * Reads one comma-separated line per call to {@link #nextKeyValue()},
     * populating a reusable key/value pair (Hadoop readers conventionally
     * reuse objects to avoid per-record allocation).
     */
    public static class MyRecordReader extends RecordReader<MultiKey, Employee> {
        public LineReader in;                 // line reader over this split's file
        public MultiKey key;                  // reused composite join key
        public Employee value;                // reused record object
        public StringTokenizer token = null;  // kept for interface compatibility; unused
        public Text line;                     // reusable buffer for the current raw line

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            FileSplit fileSplit = (FileSplit) split;
            Configuration job = context.getConfiguration();
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream filein = fs.open(file);
            in = new LineReader(filein, job);
            key = new MultiKey();
            value = new Employee();
            line = new Text();
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            // Loop so blank or malformed lines (fewer than 4 fields, e.g. a
            // trailing newline at end of file) are skipped instead of throwing
            // ArrayIndexOutOfBoundsException.
            while (true) {
                int linesize = in.readLine(line);
                if (linesize == 0) {
                    return false; // 0 bytes read means end of stream
                }
                String[] pieces = line.toString().split(",");
                if (pieces.length < 4) {
                    continue; // skip unparseable line
                }
                int flag = Integer.parseInt(pieces[0]); // parseInt: no boxing
                switch (flag) {
                case 1:
                    // flag 1 = employee-side record
                    value.setEmpName(pieces[1]);
                    value.setFlag(1);
                    break;
                default:
                    // any other flag = department-side record
                    value.setDepartName(pieces[1]);
                    value.setFlag(2);
                    break;
                }
                value.setDepartId(pieces[2]);
                value.setDepartNo(pieces[3]);
                // Join key mirrors the value's (departId, departNo) so both
                // sides of the join meet in the same reduce group.
                key.setDepartId(value.getDepartId());
                key.setDepartNo(value.getDepartNo());
                return true;
            }
        }

        @Override
        public MultiKey getCurrentKey() throws IOException,
                InterruptedException {
            return key;
        }

        @Override
        public Employee getCurrentValue() throws IOException,
                InterruptedException {
            return value;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            // Known limitation: progress is not tracked (always reports 0).
            // Accurate progress would need the split's start/length and the
            // stream position, which this reader does not record.
            return 0;
        }

        @Override
        public void close() throws IOException {
            // Close the LineReader (and the underlying FSDataInputStream it
            // wraps); the original left this empty and leaked the stream.
            if (in != null) {
                in.close();
                in = null;
            }
        }
    }
}
自定义outputformat(泛型是reducer的输出key/value类型)
/**
 * Custom OutputFormat that writes each (key, value) pair as a single text
 * line: {@code key<separator>value} (separator defaults to ":"), or just the
 * value when the key is null. All text is encoded as UTF-8 explicitly —
 * the original relied on the platform default charset, so output bytes
 * could differ between JVMs/locales.
 */
public class MyOutputFormat extends FileOutputFormat<Text, Employee> {

    @Override
    public RecordWriter<Text, Employee> getRecordWriter(
            TaskAttemptContext job) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        // Task-scoped work file inside the output directory (no extension).
        Path file = getDefaultWorkFile(job, "");
        FileSystem fs = file.getFileSystem(conf);
        // overwrite=false: fail loudly if the work file already exists.
        FSDataOutputStream fileOut = fs.create(file, false);
        return new MyRecordWriter(fileOut);
    }

    /** Writes records as UTF-8 text lines to the supplied stream. */
    public static class MyRecordWriter extends RecordWriter<Text, Employee> {
        // Fully qualified to avoid depending on an import we cannot see/edit.
        private static final java.nio.charset.Charset UTF_8 =
                java.nio.charset.StandardCharsets.UTF_8;

        public static final String NEW_LINE = System.getProperty("line.separator");

        protected DataOutputStream out;
        private final byte[] keyValueSeparator;

        /** Uses the default ":" key/value separator. */
        public MyRecordWriter(DataOutputStream out) {
            this(out, ":");
        }

        /**
         * @param out               destination stream (closed by {@link #close})
         * @param keyValueSeparator text placed between key and value
         */
        public MyRecordWriter(DataOutputStream out, String keyValueSeparator) {
            this.out = out;
            this.keyValueSeparator = keyValueSeparator.getBytes(UTF_8);
        }

        @Override
        public void write(Text key, Employee value) throws IOException,
                InterruptedException {
            if (key != null) {
                out.write(key.toString().getBytes(UTF_8));
                out.write(keyValueSeparator);
            }
            out.write(value.toString().getBytes(UTF_8));
            out.write(NEW_LINE.getBytes(UTF_8));
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException,
                InterruptedException {
            // DataOutputStream.close() flushes before closing the underlying
            // FSDataOutputStream, committing the task's output bytes.
            out.close();
        }
    }
}