《搜索引擎零距离》IRVM 已爬地址处理

[b]使用 BDB 存储已爬地址,并用 int 型 status 的各个二进制位表示不同的访问状态:遇到过、保存过、修改过等。[/b]

package com.rayeen.spider.vertical.data;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.io.MD5Hash;
import org.apache.log4j.Logger;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.persist.EntityStore;
import com.sleepycat.persist.PrimaryIndex;
import com.sleepycat.persist.SecondaryIndex;
import com.rayeen.spider.vertical.constant.MetResourceTag;
import com.rayeen.spider.vertical.constant.MsgConstant;
import com.rayeen.spider.vertical.util.ResutTree;
import org.apache.commons.lang.*;

/**
 * Berkeley DB (JE persist API) backed store of already-crawled addresses.
 *
 * <p>Each URI maps to a {@link MetResourceBE} holding a content digest and an
 * {@code int} status whose bits are the {@code MetResourceBE} flag constants.
 *
 * <p>The database environment, the indexes and the flush counter are
 * {@code static}, so they are shared by every instance of this class; the
 * instance {@code name} only selects the directory used by
 * {@link #initilize()}.
 */
public class MetResourceProtocolImpl implements MetResourceProtocol {

    static final Logger LOG = Logger.getLogger(MetResourceProtocolImpl.class);

    public static String openMode = "append";

    public static int hitCnt = 0;

    private static MyDbEnv myDbEnv = new MyDbEnv();

    static EntityStore da;

    static PrimaryIndex<String, MetResourceBE> infoMap = null;

    static SecondaryIndex<String, String, MetResourceBE> infoMapByDigest = null;

    static int flushCnt = 0;

    String name;

    /**
     * @param name sub-directory of {@code ./metResource} holding this store
     */
    public MetResourceProtocolImpl(String name) {
        this.name = name;
    }

    /**
     * Opens (or creates) the BDB environment under
     * {@code ./metResource/<name>} and binds the primary and digest indexes.
     *
     * <p>If an existing environment cannot be opened in append mode, the
     * directory is wiped and a fresh environment is created in its place.
     *
     * @throws DatabaseException if the environment or the indexes cannot be
     *         set up; the indexes are not usable in that case
     */
    public synchronized void initilize() throws DatabaseException {
        File root = new File("./metResource");
        if (!root.exists()) {
            root.mkdir();
        }

        File envHome = new File("./metResource/" + name);
        if (!envHome.exists()) {
            envHome.mkdir();
            myDbEnv.close();
            myDbEnv.setupNoTransact(envHome, false);
        } else {
            try {
                myDbEnv.setupAppend(envHome);
            } catch (Exception e) {
                LOG.error("cannot reopen " + envHome + ", recreating it: "
                        + e.getMessage(), e);
                // File.delete() refuses to remove a non-empty directory, so
                // the stale environment files have to be removed first.
                deleteRecursively(envHome);
                envHome.mkdir();
                myDbEnv.setupNoTransact(envHome, false);
            }
        }
        myDbEnv.setCacheSize(1024);

        da = myDbEnv.getEntityStore();
        infoMap = da.getPrimaryIndex(String.class, MetResourceBE.class);
        infoMapByDigest = da.getSecondaryIndex(infoMap, String.class, "digest");
    }

    /**
     * Always fails.
     *
     * @throws IOException unconditionally, with the message {@code "bobo"}
     */
    public int error() throws IOException {
        throw new IOException("bobo");
    }

    /**
     * Looks up the stored status of {@code uri} and adjusts its
     * MODIFIED/UNMODIFIED bits by comparing the stored digest with the digest
     * of {@code content} (or of the URI itself when {@code content} is null).
     * Nothing is written back to the store.
     *
     * @param uri     address to look up; {@code null} yields 0
     * @param content page content, may be {@code null}
     * @return the adjusted status bits, or 0 when the URI is unknown or the
     *         lookup fails
     */
    public int getMetResource(String uri, byte[] content) {
        if (null == uri) {
            return 0;
        }

        LOG.info("uri:" + uri);
        try {
            MetResourceBE stored = infoMap.get(uri);
            if (stored == null) {
                return 0;
            }
            boolean unchanged = StringUtils.equalsIgnoreCase(
                    stored.getDigest(), digestOf(uri, content));
            int status = withModificationFlags(stored.getStatus(), unchanged);
            LOG.info("status:" + status);
            return status;
        } catch (DatabaseException e) {
            LOG.error("lookup failed for uri:" + uri, e);
        }
        return 0;
    }

    /**
     * Records the state of {@code uri}.
     *
     * <p>With {@code MetResourceTag.COVER} the record is replaced outright
     * and the environment is synced immediately. Otherwise the given
     * {@code status} bits are OR-ed into an existing record (after its
     * MODIFIED/UNMODIFIED bits are refreshed from the digest comparison), or
     * a new record is created; a new record whose digest is already known
     * under another URI gets {@code MetResourceBE.SAME_CONTENT} instead of
     * {@code status}. In this path the environment is synced on every fifth
     * write.
     *
     * @param uri     address, must not be {@code null}
     * @param content page content; when {@code null} the URI is digested
     * @param status  status bits to record
     * @param type    overwrite ({@code COVER}) or merge
     * @return {@code MsgConstant.SUCCESS}
     * @throws DatabaseException if the store cannot be read or written
     * @throws IllegalArgumentException if {@code uri} is {@code null}
     */
    public String putMetResource(String uri, byte[] content, int status,
            MetResourceTag type) throws DatabaseException {
        if (uri == null) {
            throw new IllegalArgumentException("uri must not be null");
        }
        String hash = digestOf(uri, content);

        if (type == MetResourceTag.COVER) {
            infoMap.putNoReturn(newRecord(uri, hash, status));
            myDbEnv.sync();
            return MsgConstant.SUCCESS;
        }

        // A single get() replaces contains()+get(): one lookup, and no
        // window in which the record can vanish between the two calls.
        MetResourceBE record = infoMap.get(uri);
        if (record != null) {
            boolean unchanged = StringUtils.equalsIgnoreCase(
                    record.getDigest(), hash);
            // NOTE(review): the stored digest is left untouched here, as in
            // the original code, so a changed page keeps comparing against
            // its first digest — confirm whether that is intended.
            record.setStatus(
                    withModificationFlags(record.getStatus(), unchanged) | status);
        } else {
            // Unknown URI: flag it when the same content was already stored
            // under a different URI, otherwise record the caller's status.
            int initial = infoMapByDigest.contains(hash)
                    ? MetResourceBE.SAME_CONTENT
                    : status;
            record = newRecord(uri, hash, initial);
        }

        infoMap.putNoReturn(record);
        if (flushCnt++ % 5 == 0) {
            myDbEnv.sync();
        }
        return MsgConstant.SUCCESS;
    }

    /**
     * Same as {@link #getMetResource(String, byte[])} with no content, i.e.
     * the digest comparison uses the digest of the URI itself.
     */
    public int getMetResource(String uri) {
        return getMetResource(uri, null);
    }

    /**
     * String-content variant of
     * {@link #putMetResource(String, byte[], int, MetResourceTag)}; the
     * content is encoded as UTF-8 first.
     *
     * @param type overwrite or merge
     * @throws DatabaseException if the store cannot be read or written
     */
    public String putMetResource(String uri, String content, int status,
            MetResourceTag type) throws DatabaseException {
        byte[] bytes = null;
        if (content != null) {
            try {
                bytes = content.getBytes("UTF-8");
            } catch (UnsupportedEncodingException e) {
                // Every Java platform is required to support UTF-8.
                throw new IllegalStateException("UTF-8 is not supported", e);
            }
        }
        // The delegate already counts the write and syncs periodically.
        return putMetResource(uri, bytes, status, type);
    }

    /** MD5 digest of the content, or of the URI when there is no content. */
    private static String digestOf(String uri, byte[] content) {
        if (content == null) {
            return MD5Hash.digest(uri).toString();
        }
        return MD5Hash.digest(content).toString();
    }

    /**
     * Returns {@code status} with exactly one of MODIFIED / UNMODIFIED set.
     * The opposite bit is cleared with {@code & ~flag}; XOR would set it
     * whenever it happened to be clear.
     */
    private static int withModificationFlags(int status, boolean unchanged) {
        if (unchanged) {
            return (status & ~MetResourceBE.MODIFIED) | MetResourceBE.UNMODIFIED;
        }
        return (status & ~MetResourceBE.UNMODIFIED) | MetResourceBE.MODIFIED;
    }

    /** Builds a record with the given key, digest and status. */
    private static MetResourceBE newRecord(String uri, String digest, int status) {
        MetResourceBE record = new MetResourceBE();
        record.setUri(uri);
        record.setDigest(digest);
        record.setStatus(status);
        return record;
    }

    /** Deletes a file, or a directory together with everything below it. */
    private static void deleteRecursively(File target) {
        File[] children = target.listFiles();
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        if (!target.delete()) {
            LOG.warn("could not delete " + target);
        }
    }

}



其中MetResourceBE的实现:

package com.rayeen.spider.vertical.data;


import java.io.Serializable;
import java.util.logging.Logger;


import com.sleepycat.persist.*;

import com.sleepycat.persist.model.*;

import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;
import com.sleepycat.persist.model.SecondaryKey;
import com.sleepycat.persist.model.Relationship;


/**
 * Persistent record for one crawled address: the URI (primary key), a
 * content digest (secondary key), an insert time and a bit-flag status.
 */
@Entity
public class MetResourceBE implements Serializable {

    // NOTE(review): a sequence is named on a String key even though callers
    // always set the URI explicitly — confirm the sequence is wanted.
    @PrimaryKey(sequence = "ID")
    private String uri;

    @SecondaryKey(relate = Relationship.MANY_TO_ONE)
    private String digest;

    private java.util.Date insertTime;
    private int status;

    /** URL has been met. */
    public static final int MET = 0x1;
    /** URL has been saved. */
    public static final int SAVED = 0x2;
    /** Content has been updated. */
    public static final int MODIFIED = 0x4;

    /** URL has not been met. */
    public static final int UNMET = 0x8;
    /** URL has not been saved. */
    public static final int UNSAVED = 0x10;
    /** Content has not been updated. */
    public static final int UNMODIFIED = 0x20;

    /** No flag set: not met, not saved, not updated. */
    public static final int BLANK = 0x0;
    /** Every positive and negative flag set. */
    public static final int FULL =
            MET | SAVED | MODIFIED | UNMET | UNSAVED | UNMODIFIED;

    /**
     * Different URL, same content. Has the same value as {@link #UNSAVED}.
     */
    public static final int SAME_CONTENT = 0x10;
    /**
     * Same URL, different content. Has the same value as
     * {@link #UNMODIFIED}.
     */
    public static final int SAME_URL = 0x20;

    public java.util.Date getInsertTime() {
        return this.insertTime;
    }

    public void setInsertTime(java.util.Date insertTime) {
        this.insertTime = insertTime;
    }

    public String getDigest() {
        return this.digest;
    }

    public void setDigest(String digest) {
        this.digest = digest;
    }

    public String getUri() {
        return this.uri;
    }

    public void setUri(String uri) {
        this.uri = uri;
    }

    public int getStatus() {
        return this.status;
    }

    public void setStatus(int status) {
        this.status = status;
    }

}



[b]判断是否爬过某个页面的代码片段:[/b]


	// Only consult the crawled-address store when one is configured.
if (null != fetchedList) {
// Status of the page as currently stored, with the modified/unmodified
// bits derived from the content just fetched.
// NOTE(review): the lookup key is realUrl while the write below uses
// url.getToUrl() — confirm both always denote the same address.
int curStatus = fetchedList.getMetResource(realUrl, content
.getBytes("UTF-8"));

// If this page should not be processed, log the reason and return.
if (ParseUtils.EntranceCantProcess(processStandard, curStatus)) {
log(MetResourceUtil.explainMetResourceReason(
url.getToUrl(), processStandard, curStatus),
LogType.ENTRANCE_CONDITIONAL);
return curStatus;
}
// Otherwise record the page with both the UNMET and MET bits merged
// into its stored status.
fetchedList.putMetResource(url.getToUrl(), content
.getBytes("UTF-8"), MetResourceBE.UNMET
| MetResourceBE.MET, MetResourceTag.MERGE);

}


[b]保存数据之后,修改URL状态的代码片段:[/b]

/**
 * Saves the current page through {@code VMUtils.save} and then, when a
 * crawled-address store is configured, merges the SAVED and UNSAVED bits
 * into the stored status of the current URL.
 *
 * <p>{@code curHierarchyResultMap} is handed to {@code VMUtils.save}
 * together with the current URI, page and {@code curCrsc}. A
 * {@code DatabaseException} from the store is printed and not rethrown.
 *
 * @param saveStat forwarded unchanged to {@code VMUtils.save}
 * @throws SemanticException declared by the original signature
 */
public void save(String saveStat) throws SemanticException {
    VMUtils.save(curUri, curPage, curHierarchyResultMap, curCrsc, saveStat);

    // Nothing to mark when no store is configured.
    if (null == fetchedList) {
        return;
    }

    final int savedFlags = MetResourceBE.SAVED | MetResourceBE.UNSAVED;
    try {
        fetchedList.putMetResource(
                curUrl.getToUrl(), curContent, savedFlags, MetResourceTag.MERGE);
    } catch (DatabaseException e) {
        e.printStackTrace();
    }
}

[b]一些位操作的辅助函数:[/b]




// Maps each keyword accepted in a process-standard expression to its
// MetResourceBE status flag.
static TObjectIntHashMap bitmap = new TObjectIntHashMap();

static {
    // Parallel tables: keywords[i] is registered with flags[i].
    final String[] keywords = {
        "遇到过", "更新过", "保存过",
        "未遇到过", "未更新过", "未保存过",
    };
    final int[] flags = {
        MetResourceBE.MET, MetResourceBE.MODIFIED, MetResourceBE.SAVED,
        MetResourceBE.UNMET, MetResourceBE.UNMODIFIED, MetResourceBE.UNSAVED,
    };
    for (int i = 0; i < keywords.length; i++) {
        bitmap.put(keywords[i], flags[i]);
    }
}
/**
 * Parses a process-standard expression into a bit mask of
 * {@code MetResourceBE} flags.
 *
 * <p>If the text contains {@code ConfConstant.PROCESS_STANDARD}, parsing
 * starts one character after that keyword (skipping the separator that
 * follows it). The rest is split on the literal {@code ||} and each part on
 * the literal {@code &&}; every trimmed token is looked up in
 * {@code bitmap} and the flags of all tokens are OR-ed together. Note that
 * {@code &&} and {@code ||} therefore contribute to the mask in the same way.
 *
 * <p>A token missing from {@code bitmap} (including an empty one) is
 * reported through {@code ParalleIRVirtualMachine.error} and adds no bits.
 *
 * @param standard expression text, may be {@code null}
 * @return the combined flag mask, or -1 when {@code standard} is
 *         {@code null}
 * @throws SemanticException declared by the original signature;
 *         presumably raised by the error call for an unknown token — verify
 */
public static int parseProcessStardard(String standard)
        throws SemanticException {
    if (null == standard) {
        return -1;
    }

    String expression = standard;
    int idx = expression.indexOf(ConfConstant.PROCESS_STANDARD);
    if (idx != -1) {
        int start = idx + ConfConstant.PROCESS_STANDARD.length() + 1;
        // Clamp so that text ending right at the keyword yields an empty
        // expression instead of a StringIndexOutOfBoundsException.
        expression = expression.substring(Math.min(start, expression.length()));
    }

    // Compiled once per call rather than once per OR-term.
    Pattern orSplitter = Pattern.compile("||", Pattern.LITERAL);
    Pattern andSplitter = Pattern.compile("&&", Pattern.LITERAL);

    int status = MetResourceBE.BLANK;
    for (String orTerm : orSplitter.split(expression, 0)) {
        for (String rawToken : andSplitter.split(orTerm.trim(), 0)) {
            String token = rawToken.trim();
            if (!bitmap.containsKey(token)) {
                // Report the whole token; substring(1) used to drop its
                // first character and threw on an empty token.
                ParalleIRVirtualMachine.error(
                        "error enterance strandard grammer:" + token,
                        ErrorType.GRAMMER);
                continue;
            }
            status |= bitmap.get(token);
        }
    }
    return status;
}

/**
 * Reports whether the union of {@code standard} and {@code status} has at
 * least one bit inside {@code MetResourceBE.FULL}.
 *
 * @param standard flag mask of the process standard
 * @param status   flag mask of the page
 * @return {@code true} if any FULL bit is set in either argument
 */
static public boolean canProceess(int standard, int status) {
    int combined = standard | status;
    int relevant = combined & MetResourceBE.FULL;
    return relevant > 0;
}

/**
 * Parses {@code standardStr} with {@code parseProcessStardard} and reports
 * whether the union of the parsed mask and {@code status} has at least one
 * bit inside {@code MetResourceBE.FULL}.
 *
 * @param standardStr process-standard expression
 * @param status      flag mask of the page
 * @throws SemanticException propagated from the parser
 */
static public boolean PageCanProceess(String standardStr, int status)
        throws SemanticException {
    final int parsed = parseProcessStardard(standardStr);
    final int relevant = (parsed | status) & MetResourceBE.FULL;
    return relevant > 0;
}

/**
 * String-standard variant: parses {@code standardStr} and delegates to
 * {@code EntranceCantEnter(int, int)}.
 *
 * <p>Intent per the original note: when no already-met URI may be entered
 * (without probing for updates), a met page yields {@code true}.
 *
 * @throws SemanticException propagated from the parser
 */
static public boolean EntranceCantEnter(String standardStr, int status)
        throws SemanticException {
    return EntranceCantEnter(parseProcessStardard(standardStr), status);
}

/**
 * Decides whether an entrance must not be entered.
 *
 * @param standard parsed process-standard mask; -1 means "no standard"
 * @param status   flag mask of the page
 * @return {@code true} only when the standard is exactly
 *         {@code MetResourceBE.UNMET} and the page has the
 *         {@code MetResourceBE.MET} bit set; {@code false} otherwise,
 *         including when {@code standard} is -1
 */
static public boolean EntranceCantEnter(int standard, int status) {
    boolean hasStandard = standard != -1;
    boolean onlyUnmetWanted = standard == MetResourceBE.UNMET;
    boolean alreadyMet = (status & MetResourceBE.MET) > 0;
    return hasStandard && onlyUnmetWanted && alreadyMet;
}

/**
 * Decides whether a page must not be processed under the given standard.
 *
 * <p>Returns {@code true} when any of these holds (and {@code standard} is
 * not -1):
 * <ul>
 * <li>both {@code standard} and {@code status} have the UNMET bit set;</li>
 * <li>the MODIFIED bit of {@code standard} differs from the MODIFIED bit of
 *     {@code status};</li>
 * <li>{@code standard} is exactly MODIFIED|SAVED|MET and {@code status} is
 *     exactly MODIFIED|SAVED.</li>
 * </ul>
 *
 * <p>NOTE(review): the original notes say only flags mentioned in the
 * standard should matter, yet the MODIFIED comparison also fires when the
 * standard does not mention MODIFIED; and the note on the last rule names
 * different flags than the code compares — confirm the intended rules.
 *
 * @param standard parsed process-standard mask; -1 means "no standard"
 * @param status   flag mask of the page
 * @return {@code true} if the page should be skipped
 */
static public boolean EntranceCantProcess(int standard, int status) {
    if (standard == -1) {
        return false;
    }

    // UNMET requested by the standard and UNMET present on the page.
    boolean unmetOnBoth = (standard & MetResourceBE.UNMET) != 0
            && (status & MetResourceBE.UNMET) != 0;

    // MODIFIED bit disagrees between standard and page.
    boolean modifiedDiffers = (standard & MetResourceBE.MODIFIED)
            != (status & MetResourceBE.MODIFIED);

    // Exact-match special case on both masks.
    int modifiedAndSaved = MetResourceBE.MODIFIED | MetResourceBE.SAVED;
    boolean exactCase = standard == (modifiedAndSaved | MetResourceBE.MET)
            && status == modifiedAndSaved;

    return unmetOnBoth || modifiedDiffers || exactCase;
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值