Nutch 研究<三> 将Nutch爬取结果放入Hypertable

想把Nutch抓取的web page结果放入到Hypertable中去,目前思路主要有三个:

1. 修改Nutch源代码,让Nutch基于Hypertable工作,可以参考Hbase的实现. 由于该实现缺失Nutch好多特性,而且不易升级,考虑作罢.

2. 将Nutch抓取结果以命令导出为text的dump文件,然后用MapReduce解析该文件,将相关信息导入到Hypertable.

3. 其实和第一种一样,只不过是直接使用人家已经改好的基于Hbase的实现,然后导出一份tsv文件导入到Hypertable. 不仅保留了第一种方案的缺点,还增加了额外的麻烦. 不考虑.

好,以下代码基于第二种思想实现.


package nutchdump;

import java.io.IOException;
import java.sql.Timestamp;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.thrift.TException;
import org.apache.thrift.transport.TTransportException;
import org.hypertable.thrift.ThriftClient;
import org.hypertable.thriftgen.Cell;
import org.hypertable.thriftgen.ClientException;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;


/**
* NutchDumpReader
*
* Reads the entries produced by the Nutch dump command and writes each
* line's fields into a Hypertable database in a specific format.
* Since only the fetched page content is stored, only the "Content::"
* section of the Nutch dump output is of interest here.
*
* @author(lovejuan1314)
*/

public class NutchDumpReader extends Configured implements Tool {

    /** Where to put the job output in HDFS when we're done. */
    private static final String OUTPUT_PATH = "nutch_content_result";

    /** Where to read the Nutch segment dump from. */
    private static final String INPUT_PATH = "/shared/nutch/segdump";

    /**
     * Parses each line of the Nutch dump and emits one (fieldName, fieldValue)
     * pair per field found on the line.
     */
    static class NutchReaderMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        public NutchReaderMapper() { }

        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            NutchDumpRecord nutchDumpRecord = new NutchDumpRecord(value.toString());
            // Output key strings must match the Hypertable column-family names.
            collectIfPresent(output, "version", nutchDumpRecord.getVersion());
            collectIfPresent(output, "base", nutchDumpRecord.getBase());
            collectIfPresent(output, "ContentType", nutchDumpRecord.getContentType());
            collectIfPresent(output, "metadata", nutchDumpRecord.getMetadata());
            collectIfPresent(output, "url", nutchDumpRecord.getUrl());
            collectIfPresent(output, "content", nutchDumpRecord.getContent());
        }

        /** Emits (name, value) only when the field was present on the line. */
        private static void collectIfPresent(OutputCollector<Text, Text> output,
                String name, String fieldValue) throws IOException {
            if (fieldValue != null) {
                output.collect(new Text(name), new Text(fieldValue));
            }
        }
    }

    /**
     * Writes every (fieldName, fieldValue) pair into Hypertable and echoes the
     * field name to the job output so the job produces a visible result.
     */
    static class NutchReaderReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, NullWritable> {

        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, NullWritable> output, Reporter reporter)
                throws IOException {
            String valKey = key.toString();

            while (values.hasNext()) {
                Text val = values.next();
                // Text.toString() never returns null, so no null check is needed
                // here (the original guarded on it, which was dead code).
                writeIntoTable(valKey, val.toString());
                output.collect(key, NullWritable.get());
            }
        }
    }

    /**
     * Writes one cell into the Hypertable table "webDb".
     *
     * NOTE(review): opening a new Thrift connection per cell is expensive;
     * consider holding one client/mutator per reduce task if throughput matters.
     *
     * @param colName  column family to write to
     * @param colValue cell value
     */
    private static void writeIntoTable(String colName, String colValue) {

        try {

            ThriftClient client = ThriftClient.create("192.168.0.40", 38080);
            try {
                long mutator = client.open_mutator("webDb", 0, 0);

                Timestamp ts = new Timestamp(System.currentTimeMillis());

                try {
                    Cell cell = new Cell();
                    String sysDt = ts.toString();
                    // Row key format: system time + reversed URL.
                    // TODO(review): "com.mytest.www" is a hard-coded placeholder;
                    // it should be the reversed form of the actual page URL.
                    cell.row_key = sysDt + " " + "com.mytest.www";
                    cell.column_family = colName;
                    cell.value = colValue.getBytes();
                    client.set_cell(mutator, cell);
                } finally {
                    client.close_mutator(mutator, true);
                }
            } finally {
                // BUG FIX: the original never closed the Thrift client, leaking
                // one open socket per cell written.
                client.close();
            }

        } catch (TTransportException e) {
            e.printStackTrace();
        } catch (TException e) {
            e.printStackTrace();
        } catch (ClientException ex) {
            ex.printStackTrace();
        }

    }

    /** Driver for the actual MapReduce process. */
    private void runJob() throws IOException {
        JobConf conf = new JobConf(getConf(), NutchDumpReader.class);

        FileInputFormat.addInputPath(conf, new Path(INPUT_PATH));
        FileOutputFormat.setOutputPath(conf, new Path(OUTPUT_PATH));

        conf.setMapperClass(NutchReaderMapper.class);
        conf.setReducerClass(NutchReaderReducer.class);

        // Reducer output types.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(NullWritable.class);

        // Mapper emits Text values (keys default to the output key class).
        conf.setMapOutputValueClass(Text.class);

        JobClient.runJob(conf);
    }

    /** {@link Tool} entry point; arguments are currently unused. */
    public int run(String[] arg0) throws Exception {
        runJob();
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new NutchDumpReader(), args);
        System.exit(ret);
    }

}






package nutchdump;


/**
 * Parses one line of a Nutch segment dump (the "Content::" section) and
 * exposes the individual fields: Version, url, base, contentType, metadata
 * and Content. Each field getter returns {@code null} when that field's
 * label does not occur on the line.
 */
public class NutchDumpRecord {

    // The raw line from the dump file (never null; a null input becomes "").
    private String record;

    // Parsed field values, each without its label; null when absent.
    private String version;
    private String url;
    private String base;
    private String contentType;
    private String metadata;
    private String content;

    /**
     * @param record one line of Nutch dump output; null is treated as ""
     */
    public NutchDumpRecord(final String record) {
        if (record == null) {
            this.record = "";
        } else {
            this.record = record;
        }
        this.parse();
    }

    /** Scans the line for the known Nutch dump labels and stores each value. */
    protected void parse() {
        this.version = extract("Version:");
        this.url = extract("url:");
        this.base = extract("base:");
        this.contentType = extract("contentType:");
        // The metadata label is searched without a colon, matching the
        // original code; extract() skips a colon if one follows the label.
        this.metadata = extract("metadata");
        this.content = extract("Content:");
    }

    /**
     * Returns the trimmed text following {@code label} (skipping an optional
     * colon right after it), or null when the label is absent.
     *
     * BUG FIX: the original used substring(idx), which left the label itself
     * inside the value — e.g. getUrl() returned "url: http://..." instead of
     * the URL.
     */
    private String extract(String label) {
        int idx = this.record.indexOf(label);
        if (idx == -1) {
            return null;
        }
        String value = this.record.substring(idx + label.length());
        if (value.startsWith(":")) {
            value = value.substring(1);
        }
        return value.trim();
    }

    // getters

    /** Returns the raw record line ("" when constructed with null). */
    public String getRecord() {
        return this.record;
    }

    public String getVersion() {
        return this.version;
    }

    public String getUrl() {
        return this.url;
    }

    public String getBase() {
        return this.base;
    }

    public String getContentType() {
        return this.contentType;
    }

    public String getMetadata() {
        return this.metadata;
    }

    public String getContent() {
        return this.content;
    }
}





//这个类是Hypertable源码中提供的.

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/

/**
* Copyright (C) 2008 Luke Lu (Zvents, Inc.)
*
* This file is distributed under the Apache Software License
* (http://www.apache.org/licenses/)
*/

package nutchdump;

import org.hypertable.thriftgen.*;

import org.apache.thrift.TException;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TFramedTransport;
import org.apache.thrift.transport.TTransportException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;

/**
 * Thin convenience wrapper around the Thrift-generated HqlService.Client
 * that owns its framed transport and knows how to open and close it.
 */
public class ThriftClient extends HqlService.Client {

    private TFramedTransport transport;
    private boolean do_close = false;

    public ThriftClient(TProtocol protocol) {
        super(protocol);
    }

    /**
     * Builds a client over a framed socket transport. Java only allows
     * super(...) as the first statement of a constructor and has no multiple
     * inheritance for a base-from-member idiom, so the transport wiring is
     * done in this static factory instead of a constructor.
     *
     * @param host       Hypertable Thrift broker host
     * @param port       broker port
     * @param timeout_ms socket timeout in milliseconds
     * @param do_open    whether to open the transport before returning
     */
    public static ThriftClient create(String host, int port, int timeout_ms,
            boolean do_open) throws TTransportException, TException {
        TSocket socket = new TSocket(host, port, timeout_ms);
        TFramedTransport framed = new TFramedTransport(socket);
        ThriftClient result = new ThriftClient(new TBinaryProtocol(framed));
        result.transport = framed;
        if (do_open) {
            result.open();
        }
        return result;
    }

    /**
     * Convenience overload (Java has no default argument values): 30 second
     * timeout and an immediately opened transport.
     */
    public static ThriftClient create(String host, int port)
            throws TTransportException, TException {
        return create(host, port, 30000, true);
    }

    /** Opens the underlying transport and marks it as needing close(). */
    public void open() throws TTransportException, TException {
        transport.open();
        do_close = true;
    }

    /** Closes the transport if — and only if — this client opened it. */
    public void close() {
        if (do_close) {
            transport.close();
            do_close = false;
        }
    }
}




代码完成后直接打成jar包,在hadoop环境下运行就可以了.


Ps:仅供参考,如果大家有什么更好的方法,欢迎讨论. 另外代码里也没有严格控制数据的一致性,若要在产品上运行还得进一步修改.
Python网络爬虫与推荐算法新闻推荐平台:网络爬虫:通过Python实现新浪新闻的爬取,可爬取新闻页面上的标题、文本、图片、视频链接(保留排版) 推荐算法:权重衰减+标签推荐+区域推荐+热点推荐.zip项目工程资源经过严格测试可直接运行成功且功能正常的情况才上传,可轻松复刻,拿到资料包后可轻松复现出一样的项目,本人系统开发经验充足(全领域),有任何使用问题欢迎随时与我联系,我会及时为您解惑,提供帮助。 【资源内容】:包含完整源码+工程文件+说明(如有)等。答辩评审平均分达到96分,放心下载使用!可轻松复现,设计报告也可借鉴此项目,该资源内项目代码都经过测试运行成功,功能ok的情况下才上传的。 【提供帮助】:有任何使用问题欢迎随时与我联系,我会及时解答解惑,提供帮助 【附带帮助】:若还需要相关开发工具、学习资料等,我会提供帮助,提供资料,鼓励学习进步 【项目价值】:可用在相关项目设计中,皆可应用在项目、毕业设计、课程设计、期末/期中/大作业、工程实训、大创等学科竞赛比赛、初期项目立项、学习/练手等方面,可借鉴此优质项目实现复刻,设计报告也可借鉴此项目,也可基于此项目来扩展开发出更多功能 下载后请首先打开README文件(如有),项目工程可直接复现复刻,如果基础还行,也可在此程序基础上进行修改,以实现其它功能。供开源学习/技术交流/学习参考,勿用于商业用途。质量优质,放心下载使用。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值