package org.apache.nutch;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Command-line tool that dumps every key/value pair of a Hadoop
 * {@link SequenceFile} to standard output, one object per line (key first,
 * then value).
 *
 * <p>The class also exposes a small record-reader style API
 * ({@link #nextKeyValue()}, {@link #getCurrentKey()},
 * {@link #getCurrentValue()}) that {@link #run(String[])} uses internally.
 * Instances hold mutable iteration state and are not safe for concurrent use.
 *
 * @param <K> type the keys are cast to
 * @param <V> type the values are cast to
 */
public class SequenceFileReader<K, V> extends Configured implements Tool {

  /**
   * Fallback input used by {@link #main(String[])} when no argument is given.
   * This is a developer-machine path kept only for backward compatibility;
   * pass the file to read as the first argument instead.
   */
  private static final String DEFAULT_FILE =
      "D:/serverpkg/asd/nutch-1.3/index/segments/20110902115211/parse_text/part-00000/data";

  /** {@code false} once the end of the file has been reached. */
  private boolean more = true;
  /** Most recently read key, or {@code null} before the first read / after the end. */
  private K key = null;
  /** Most recently read value, or {@code null} before the first read / after the end. */
  private V value = null;
  /** Reader for the file being dumped; {@code null} while no file is open. */
  private SequenceFile.Reader in;

  /**
   * Advances to the next record of the open file.
   *
   * @return {@code true} if a record was read and is available through
   *         {@link #getCurrentKey()} / {@link #getCurrentValue()};
   *         {@code false} at end of file or when no file is open
   * @throws IOException if the underlying reader fails
   * @throws InterruptedException never thrown by this implementation; declared
   *         so existing callers keep compiling
   */
  public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!more || in == null) {
      return false;
    }
    // The previous key/value objects are handed back to the reader for reuse.
    // The casts cannot be checked at runtime because K and V are erased.
    @SuppressWarnings("unchecked")
    K nextKey = (K) in.next(key);
    key = nextKey;
    if (key == null) {
      // A null key signals end of file.
      more = false;
      value = null;
    } else {
      @SuppressWarnings("unchecked")
      V nextValue = (V) in.getCurrentValue(value);
      value = nextValue;
    }
    return more;
  }

  /**
   * @return the key read by the last successful {@link #nextKeyValue()} call,
   *         or {@code null} if there is none
   */
  public K getCurrentKey() {
    return key;
  }

  /**
   * @return the value read by the last successful {@link #nextKeyValue()}
   *         call, or {@code null} if there is none
   */
  public V getCurrentValue() {
    return value;
  }

  /**
   * Opens the sequence file named by the first argument and prints each key
   * and value on its own line to standard output.
   *
   * @param arg0 command-line arguments; {@code arg0[0]} is the path of the
   *        sequence file to read
   * @return {@code 0} on success, {@code -1} if no path was supplied
   * @throws Exception if the file cannot be opened or read
   */
  @Override
  public int run(String[] arg0) throws Exception {
    if (arg0 == null || arg0.length == 0) {
      System.err.println("Usage: SequenceFileReader <sequence-file-path>");
      return -1;
    }
    Configuration conf = this.getConf();

    // Reset the iteration state so the same instance can be run more than once.
    more = true;
    key = null;
    value = null;

    in = new SequenceFile.Reader(FileSystem.get(conf), new Path(arg0[0]), conf);
    try {
      while (this.nextKeyValue()) {
        System.out.println(this.getCurrentKey());
        System.out.println(this.getCurrentValue());
      }
    } finally {
      // Always release the underlying file handle, even when reading fails.
      SequenceFile.Reader opened = in;
      in = null;
      opened.close();
    }
    return 0;
  }

  /**
   * Entry point. Reads the file given as the first argument, falling back to
   * {@link #DEFAULT_FILE} when no argument is supplied, and exits with the
   * tool's return code (or {@code 1} if the tool threw an exception).
   *
   * @param args command-line arguments
   */
  public static void main(String[] args) {
    int res;
    try {
      String[] effectiveArgs = args;
      if (null == effectiveArgs || effectiveArgs.length == 0) {
        effectiveArgs = new String[] {DEFAULT_FILE};
      }
      res = ToolRunner.run(
          new Configuration(), new SequenceFileReader<Text, Writable>(), effectiveArgs);
    } catch (Exception e) {
      e.printStackTrace();
      // Report failure to the calling shell instead of exiting with status 0.
      res = 1;
    }
    System.exit(res);
  }
}
Nutch：读取 Nutch 抓取的内容
最新推荐文章于 2021-11-24 20:14:43 发布