基于时间url的解析
2014-01-2517:00:00 www.baidu.com
2014-01-2517:00:00 www.google.com
现在需要把每一行解析成以时间为键、URL为值的键值对。
首先构建键的类型CalendarWriteableComparable
/**
 * Hadoop key type wrapping a {@link Calendar} timestamp.
 *
 * <p>The timestamp is serialized as epoch milliseconds, so a value written by
 * {@link #write(DataOutput)} is always readable by {@link #readFields(DataInput)}
 * regardless of locale. Ordering, equality and hashing are all based on that
 * same instant, which keeps sorting, grouping and hash partitioning consistent.
 */
class CalendarWriteableComparable implements WritableComparable<CalendarWriteableComparable> {

    /** Text layout accepted by {@link #set(String)}, e.g. {@code 2014-01-2517:00:00}. */
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-ddHH:mm:ss";

    /** Wrapped timestamp; stays {@code null} until set, read, or supplied by the constructor. */
    private Calendar calendar;

    /** Creates an empty key; the timestamp is filled in by {@link #set(String)} or {@link #readFields(DataInput)}. */
    public CalendarWriteableComparable() {
        super();
    }

    /**
     * Creates a key around an existing calendar.
     *
     * @param calendar the timestamp to wrap (stored by reference, not copied)
     */
    public CalendarWriteableComparable(Calendar calendar) {
        super();
        this.calendar = calendar;
    }

    /**
     * Parses a timestamp of the form {@code yyyy-MM-ddHH:mm:ss} and stores it.
     *
     * @param str the text to parse, e.g. {@code 2014-01-2517:00:00}
     * @throws ParseException if {@code str} does not match the pattern; the
     *         previously held timestamp is left untouched in that case
     */
    public void set(String str) throws ParseException {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        Calendar parsed = Calendar.getInstance();
        parsed.setTime(new SimpleDateFormat(TIMESTAMP_PATTERN).parse(str));
        calendar = parsed;
    }

    /**
     * Writes the timestamp as epoch milliseconds.
     *
     * @throws IOException if no timestamp has been set or the stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        if (calendar == null) {
            throw new IOException("CalendarWriteableComparable has no timestamp to write");
        }
        out.writeLong(calendar.getTimeInMillis());
    }

    /**
     * Reads a timestamp previously produced by {@link #write(DataOutput)}.
     *
     * @throws IOException if the stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        long millis = in.readLong();
        // Instances built with the no-arg constructor have no calendar yet.
        if (calendar == null) {
            calendar = Calendar.getInstance();
        }
        calendar.setTimeInMillis(millis);
    }

    /**
     * Orders keys chronologically.
     *
     * @return a negative value, zero, or a positive value when this timestamp is
     *         before, equal to, or after {@code o}'s timestamp
     */
    @Override
    public int compareTo(CalendarWriteableComparable o) {
        return this.calendar.compareTo(o.calendar);
    }

    /** Two keys are equal when they denote the same instant (or both hold no timestamp). */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof CalendarWriteableComparable)) {
            return false;
        }
        CalendarWriteableComparable other = (CalendarWriteableComparable) obj;
        if (calendar == null || other.calendar == null) {
            return calendar == other.calendar;
        }
        return calendar.getTimeInMillis() == other.calendar.getTimeInMillis();
    }

    /** Hash derived from the epoch milliseconds so that equal keys hash alike. */
    @Override
    public int hashCode() {
        if (calendar == null) {
            return 0;
        }
        long millis = calendar.getTimeInMillis();
        return (int) (millis ^ (millis >>> 32));
    }
}
然后构建值对应的类型UrlWriteable
/**
 * Hadoop value type wrapping a {@link URL}.
 *
 * <p>A missing URL ({@code null}) is serialized as the empty string and read
 * back as {@code null}, so an instance without a URL can still be written.
 */
class UrlWriteable implements Writable {

    /** Wrapped URL; {@code null} when none is set or the last input was malformed. */
    public URL url;

    /** Creates an empty value; the URL is filled in by {@link #setUrl(String)} or {@link #readFields(DataInput)}. */
    public UrlWriteable() {
    }

    /**
     * Creates a value around an existing URL.
     *
     * @param url the URL to wrap
     */
    public UrlWriteable(URL url) {
        super();
        this.url = url;
    }

    /**
     * Writes the URL's string form, or the empty string when no URL is held.
     *
     * @throws IOException if the stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(url == null ? "" : url.toString());
    }

    /**
     * Parses {@code str} and stores the resulting URL.
     *
     * <p>If {@code str} is not a valid URL the held URL is cleared to
     * {@code null} instead of keeping the previous one, so a reused instance
     * never carries a stale URL over to the next record. Note that
     * {@link URL} rejects strings without a protocol, such as
     * {@code www.baidu.com}.
     *
     * @param str the URL text to parse
     */
    public void setUrl(String str) {
        try {
            url = new URL(str);
        } catch (MalformedURLException e) {
            url = null;
            System.err.println("UrlWriteable: ignoring malformed URL '" + str + "': " + e.getMessage());
        }
    }

    /**
     * Reads a URL previously produced by {@link #write(DataOutput)}.
     *
     * @throws IOException if the stream fails or the stored text is not a valid URL
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        String text = in.readUTF();
        // The empty string is the marker write() uses for "no URL".
        url = text.isEmpty() ? null : new URL(text);
    }
}
由于键会参与排序和去重,所以键的类型必须实现Comparable接口(即WritableComparable),而值只需要实现Writable接口。
输入格式类直接继承FileInputFormat,只需重写getRecordReader方法
/**
 * File-based input format whose records are a {@code Text} key paired with a
 * {@link UrlWriteable} value, read through {@code TimeUrlLineRecordReader}.
 */
public class TimeUrlInputformat extends FileInputFormat<Text, UrlWriteable> {

    /**
     * Builds the record reader for one split.
     *
     * @param split    the split to read; cast to {@code FileSplit}
     * @param job      the job configuration handed to the reader
     * @param reporter progress reporter (not used here)
     * @return a reader producing {@code Text}/{@link UrlWriteable} pairs
     * @throws IOException declared by the overridden method
     */
    @Override
    public RecordReader<Text, UrlWriteable> getRecordReader(InputSplit split,
            JobConf job, Reporter reporter) throws IOException {
        final FileSplit fileSplit = (FileSplit) split;
        final RecordReader<Text, UrlWriteable> reader =
                new TimeUrlLineRecordReader(job, fileSplit);
        return reader;
    }
}
同时我们需要写一个负责读取和转换的类TimeUrlLineRecordReader,去实现RecordReader接口
public TimeUrlLineRecordReader(JobConf job,FileSplit fileSplit) {
try {
lineRecordReader = new KeyValueLineRecordReader(job, fileSplit);
linekey = lineRecordReader.createKey();
linevalue = lineRecordReader.createValue();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
@Override
public boolean next(Text key, UrlWriteable value) throws IOException {
// TODO Auto-generated method stub
if(!lineRecordReader.next(linekey,linevalue)){
return false;
}
key.set(linekey);
value.setUrl(linevalue.toString());
return true;
}
@Override
public long getPos() throws IOException {
// TODO Auto-generated method stub
return lineRecordReader.getPos();
}
@Override
public void close() throws IOException {
// TODO Auto-generated method stub
lineRecordReader.close();
}
@Override
public float getProgress() throws IOException {
// TODO Auto-generated method stub
return lineRecordReader.getProgress();
}
@Override
public Text createKey() {
// TODO Auto-generated method stub
return new Text("");
}
@Override
public UrlWriteable createValue() {
// TODO Auto-generated method stub
return new UrlWriteable();
}
}
主要参照其next方法:借用KeyValueLineRecordReader读取每一行,将原本为Text的键值对转换成我们需要的calendar和url类型(上面的示例代码中键仍为Text,如需按时间排序,可将键换成CalendarWriteableComparable并在next方法中调用其set方法)。这样我们就定义了一个自己的输入格式。