(3)下面实现一个自己的InputFormat,需要处理的数据为(时间:URL)
public class TimeUrlTextInputInputFormat extends FileInputFormat<Text,URLWritable>{
public RecordReader<Text,URLWritable> getRecordReader(
InputSplit input,JobConf job,Reporter reporter)throws IOException{
return new TimeUrlLineRecorderReader(job,(FileSplit)input);
} }
pulbic class URLWritable implements Writable{
protected URL url;
public URLWritable(){}
public URLWritable(URL url){
This.url=url;
}
public void write(DataOutput out) throws IOException{
Out.writeUTF(url.toString());
}
public void readFields(DataInput in) throws IOException{
url=new URL(in.readUTF());
}
public void set(String s)throws MalformedURLException{
Url=new URL(s);
}
}
class TimeUrlLineRecordReader implements RecordReader<Text,URLWritable>{
private KeyValueLineRecorderReader lineReader;
private Text lineKey,lineValue;
public TimeUrlLineRecordReader(JobConf job,FileSplit split) throws IOException{
lineRecorder=new KeyValueLineRecordReader(job,split);
lineKey=lineReader.createKey();
lineValue=lineReader.createValue();
}
public boolean next(Text key,URLWritable value) throws IOException{
if(!lineReader.next(lineKey,lineValue)){
Return false;
}
key.set(lineKey);
Value.set(lineValue.toString());
return true;
}
public Text createKey(){
Return new Text("");
}
public URLWritable createValue(){
return new URLWritable();
}
public long getPos() throws IOException{
Return lineRecorder.getPos();
}
public float getProgress() throws IOException{
Return lineReader.getProgress();
}
public void close() throws IOException{
lineReader.close();
}
}
七、输出格式 outputFormat
hadoop中实现了OutputFormat接口的类有如下几个
TextOutputFormat<K,V>:用tab键分隔输出,可以通过mapred.textoutputformat.separator
属性进行更换。
SequenceFileOutputFormat<K,V>:和SequenceFileInputFormat搭配使用
NullOutputFormat<K,V>:什么都不输出
<!--EndFragment-->