Nutch 研究<三> 将Nutch爬取结果放入Hypertable

想把Nutch抓取的web page结果放入到Hypertable中去,目前思路主要有三个:

1. 修改Nutch源代码,让Nutch基于Hypertable工作,可以参考Hbase的实现. 由于该实现缺失Nutch好多特性,而且不易升级,考虑作罢.

2. 将Nutch抓取结果以命令导出为text的dump文件,然后用MapReduce解析该文件,将相关信息导入到Hypertable.

3. 其实和第一种一样,只不过是直接使用人家已经改好的基于Hbase的实现,然后导出一份tsv文件导入到Hypertable. 不仅保留了第一种方案的缺点,还增加了额外的麻烦. 不考虑.

好,以下代码基于第二种思想实现.


package nutchdump;

import java.io.IOException;
import java.sql.Timestamp;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.thrift.TException;
import org.apache.thrift.transport.TTransportException;
import org.hypertable.thrift.ThriftClient;
import org.hypertable.thriftgen.Cell;
import org.hypertable.thriftgen.ClientException;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;


/**
* NutchDumpReader
*
* Reads the entries produced by the Nutch dump command and writes each
* line's fields into a Hypertable database in a specific format.
* Since only the fetched page content is stored, only the "Content::"
* section of the Nutch dump output is of interest here.
*
* @author(lovejuan1314)
*/

public class NutchDumpReader extends Configured implements Tool {

    /** Where to put the job output in HDFS when we're done. */
    private static final String OUTPUT_PATH = "nutch_content_result";

    /** Where to read the Nutch segment dump from. */
    private static final String INPUT_PATH = "/shared/nutch/segdump";

    /**
     * Parses each line of the Nutch dump and emits one (fieldName, fieldValue)
     * pair per field found on the line.
     */
    static class NutchReaderMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        public NutchReaderMapper() { }

        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            NutchDumpRecord nutchDumpRecord = new NutchDumpRecord(value.toString());
            // Output key strings must match the Hypertable column-family names.
            collectIfPresent(output, "version", nutchDumpRecord.getVersion());
            collectIfPresent(output, "base", nutchDumpRecord.getBase());
            collectIfPresent(output, "ContentType", nutchDumpRecord.getContentType());
            collectIfPresent(output, "metadata", nutchDumpRecord.getMetadata());
            collectIfPresent(output, "url", nutchDumpRecord.getUrl());
            collectIfPresent(output, "content", nutchDumpRecord.getContent());
        }

        /** Emits (name, value) only when the field was present on the line. */
        private static void collectIfPresent(OutputCollector<Text, Text> output,
                String name, String fieldValue) throws IOException {
            if (fieldValue != null) {
                output.collect(new Text(name), new Text(fieldValue));
            }
        }
    }

    /**
     * Writes every (fieldName, fieldValue) pair into Hypertable and echoes the
     * field name to the job output so the job produces a visible result.
     */
    static class NutchReaderReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, NullWritable> {

        public void reduce(Text key, Iterator<Text> values,
                OutputCollector<Text, NullWritable> output, Reporter reporter)
                throws IOException {
            String valKey = key.toString();

            while (values.hasNext()) {
                Text val = values.next();
                // Text.toString() never returns null, so no null check is needed
                // here (the original guarded on it, which was dead code).
                writeIntoTable(valKey, val.toString());
                output.collect(key, NullWritable.get());
            }
        }
    }

    /**
     * Writes one cell into the Hypertable table "webDb".
     *
     * NOTE(review): opening a new Thrift connection per cell is expensive;
     * consider holding one client/mutator per reduce task if throughput matters.
     *
     * @param colName  column family to write to
     * @param colValue cell value
     */
    private static void writeIntoTable(String colName, String colValue) {

        try {

            ThriftClient client = ThriftClient.create("192.168.0.40", 38080);
            try {
                long mutator = client.open_mutator("webDb", 0, 0);

                Timestamp ts = new Timestamp(System.currentTimeMillis());

                try {
                    Cell cell = new Cell();
                    String sysDt = ts.toString();
                    // Row key format: system time + reversed URL.
                    // TODO(review): "com.mytest.www" is a hard-coded placeholder;
                    // it should be the reversed form of the actual page URL.
                    cell.row_key = sysDt + " " + "com.mytest.www";
                    cell.column_family = colName;
                    cell.value = colValue.getBytes();
                    client.set_cell(mutator, cell);
                } finally {
                    client.close_mutator(mutator, true);
                }
            } finally {
                // BUG FIX: the original never closed the Thrift client, leaking
                // one open socket per cell written.
                client.close();
            }

        } catch (TTransportException e) {
            e.printStackTrace();
        } catch (TException e) {
            e.printStackTrace();
        } catch (ClientException ex) {
            ex.printStackTrace();
        }

    }

    /** Driver for the actual MapReduce process. */
    private void runJob() throws IOException {
        JobConf conf = new JobConf(getConf(), NutchDumpReader.class);

        FileInputFormat.addInputPath(conf, new Path(INPUT_PATH));
        FileOutputFormat.setOutputPath(conf, new Path(OUTPUT_PATH));

        conf.setMapperClass(NutchReaderMapper.class);
        conf.setReducerClass(NutchReaderReducer.class);

        // Reducer output types.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(NullWritable.class);

        // Mapper emits Text values (keys default to the output key class).
        conf.setMapOutputValueClass(Text.class);

        JobClient.runJob(conf);
    }

    /** {@link Tool} entry point; arguments are currently unused. */
    public int run(String[] arg0) throws Exception {
        runJob();
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new NutchDumpReader(), args);
        System.exit(ret);
    }

}






package nutchdump;


/**
 * Parses one line of a Nutch segment dump (the "Content::" section) and
 * exposes the individual fields: Version, url, base, contentType, metadata
 * and Content. Each field getter returns {@code null} when that field's
 * label does not occur on the line.
 */
public class NutchDumpRecord {

    // The raw line from the dump file (never null; a null input becomes "").
    private String record;

    // Parsed field values, each without its label; null when absent.
    private String version;
    private String url;
    private String base;
    private String contentType;
    private String metadata;
    private String content;

    /**
     * @param record one line of Nutch dump output; null is treated as ""
     */
    public NutchDumpRecord(final String record) {
        if (record == null) {
            this.record = "";
        } else {
            this.record = record;
        }
        this.parse();
    }

    /** Scans the line for the known Nutch dump labels and stores each value. */
    protected void parse() {
        this.version = extract("Version:");
        this.url = extract("url:");
        this.base = extract("base:");
        this.contentType = extract("contentType:");
        // The metadata label is searched without a colon, matching the
        // original code; extract() skips a colon if one follows the label.
        this.metadata = extract("metadata");
        this.content = extract("Content:");
    }

    /**
     * Returns the trimmed text following {@code label} (skipping an optional
     * colon right after it), or null when the label is absent.
     *
     * BUG FIX: the original used substring(idx), which left the label itself
     * inside the value — e.g. getUrl() returned "url: http://..." instead of
     * the URL.
     */
    private String extract(String label) {
        int idx = this.record.indexOf(label);
        if (idx == -1) {
            return null;
        }
        String value = this.record.substring(idx + label.length());
        if (value.startsWith(":")) {
            value = value.substring(1);
        }
        return value.trim();
    }

    // getters

    /** Returns the raw record line ("" when constructed with null). */
    public String getRecord() {
        return this.record;
    }

    public String getVersion() {
        return this.version;
    }

    public String getUrl() {
        return this.url;
    }

    public String getBase() {
        return this.base;
    }

    public String getContentType() {
        return this.contentType;
    }

    public String getMetadata() {
        return this.metadata;
    }

    public String getContent() {
        return this.content;
    }
}





//这个类是Hypertable源码中提供的.

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/

/**
* Copyright (C) 2008 Luke Lu (Zvents, Inc.)
*
* This file is distributed under the Apache Software License
* (http://www.apache.org/licenses/)
*/

package nutchdump;

import org.hypertable.thriftgen.*;

import org.apache.thrift.TException;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TFramedTransport;
import org.apache.thrift.transport.TTransportException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;

/**
 * Thin convenience wrapper around the Thrift-generated HqlService.Client
 * that owns its framed transport and knows how to open and close it.
 */
public class ThriftClient extends HqlService.Client {

    private TFramedTransport transport;
    private boolean do_close = false;

    public ThriftClient(TProtocol protocol) {
        super(protocol);
    }

    /**
     * Builds a client over a framed socket transport. Java only allows
     * super(...) as the first statement of a constructor and has no multiple
     * inheritance for a base-from-member idiom, so the transport wiring is
     * done in this static factory instead of a constructor.
     *
     * @param host       Hypertable Thrift broker host
     * @param port       broker port
     * @param timeout_ms socket timeout in milliseconds
     * @param do_open    whether to open the transport before returning
     */
    public static ThriftClient create(String host, int port, int timeout_ms,
            boolean do_open) throws TTransportException, TException {
        TSocket socket = new TSocket(host, port, timeout_ms);
        TFramedTransport framed = new TFramedTransport(socket);
        ThriftClient result = new ThriftClient(new TBinaryProtocol(framed));
        result.transport = framed;
        if (do_open) {
            result.open();
        }
        return result;
    }

    /**
     * Convenience overload (Java has no default argument values): 30 second
     * timeout and an immediately opened transport.
     */
    public static ThriftClient create(String host, int port)
            throws TTransportException, TException {
        return create(host, port, 30000, true);
    }

    /** Opens the underlying transport and marks it as needing close(). */
    public void open() throws TTransportException, TException {
        transport.open();
        do_close = true;
    }

    /** Closes the transport if — and only if — this client opened it. */
    public void close() {
        if (do_close) {
            transport.close();
            do_close = false;
        }
    }
}




代码完成后直接打成jar包,在hadoop环境下运行就可以了.


Ps:仅供参考,如果大家有什么更好的方法,欢迎讨论. 另外代码里也没有严格控制数据的一致性,若要在产品上运行还得进一步修改.
Python网络爬虫与推荐算法新闻推荐平台:网络爬虫:通过Python实现新浪新闻的爬取,可爬取新闻页面上的标题、文本、图片、视频链接(保留排版) 推荐算法:权重衰减+标签推荐+区域推荐+热点推荐.zip项目工程资源经过严格测试可直接运行成功且功能正常的情况才上传,可轻松复刻,拿到资料包后可轻松复现出一样的项目,本人系统开发经验充足(全领域),有任何使用问题欢迎随时与我联系,我会及时为您解惑,提供帮助。 【资源内容】:包含完整源码+工程文件+说明(如有)等。答辩评审平均分达到96分,放心下载使用!可轻松复现,设计报告也可借鉴此项目,该资源内项目代码都经过测试运行成功,功能ok的情况下才上传的。 【提供帮助】:有任何使用问题欢迎随时与我联系,我会及时解答解惑,提供帮助 【附带帮助】:若还需要相关开发工具、学习资料等,我会提供帮助,提供资料,鼓励学习进步 【项目价值】:可用在相关项目设计中,皆可应用在项目、毕业设计、课程设计、期末/期中/大作业、工程实训、大创等学科竞赛比赛、初期项目立项、学习/练手等方面,可借鉴此优质项目实现复刻,设计报告也可借鉴此项目,也可基于此项目来扩展开发出更多功能 下载后请首先打开README文件(如有),项目工程可直接复现复刻,如果基础还行,也可在此程序基础上进行修改,以实现其它功能。供开源学习/技术交流/学习参考,勿用于商业用途。质量优质,放心下载使用。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值