- package org.apache.nutch.util;
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.conf.Configured;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.DataOutputBuffer;
- import org.apache.hadoop.io.SequenceFile;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.io.Writable;
- import org.apache.hadoop.util.Tool;
- import org.apache.hadoop.util.ToolRunner;
- /**
- * 读取SequenceFile的信息
- */
- public class SequenceFileReader<K,V> extends Configured implements Tool{
- private boolean more = true;
- private K key = null;
- private V value = null;
- private SequenceFile.Reader in;
- public boolean nextKeyValue() throws IOException, InterruptedException {
- if (!more) {
- return false;
- }
- long pos = in.getPosition();
- key = (K) in.next(key);
- if (key == null ) {
- more = false;
- key = null;
- value = null;
- } else {
- value = (V) in.getCurrentValue(value);
- }
- return more;
- }
- public K getCurrentKey() {
- return key;
- }
- public V getCurrentValue() {
- return value;
- }
- @Override
- public int run(String[] arg0) throws Exception {
- Configuration conf = this.getConf();
- in = new SequenceFile.Reader(FileSystem.get(conf),new Path(arg0[0]),conf);
- DataOutputBuffer outBuf = new DataOutputBuffer();
- while(this.nextKeyValue()){
- System.out.println(this.getCurrentKey());
- System.out.println(this.getCurrentValue());
- }
- return 0;
- }
- public static void main(String[] args) {
- try{
- String file = "C:/nutch-1.0/crawled/segments/20100624073431/content/part-00000/data";
- if(null == args || args.length == 0)
- args = new String[]{file};
- int res = ToolRunner.run(new Configuration(), new SequenceFileReader<Text,Writable>(), args);
- System.exit(res);
- }catch(Exception e){
- e.printStackTrace();
- }
- }
- }
读取nutch爬取的数据内容
最新推荐文章于 2020-05-27 22:39:59 发布