一个月前写了一个主题爬虫,重点主要在对网页和内部链接分类上,其它部分的细节问题没有仔细研究。这个过程中我遇到一个很重要的问题:判断一个URL是否已经爬取过。对待这个问题,我开始的做法是最直接,也是最简单的:将已经爬行到的URL存入一个HashSet<String>之中,每次抓取网页之前都判断一下此URL是否在这个hash表中,如果已经存在,则放弃,否则,将其加入hash表并爬取此URL对应页面。乍一看起来似乎无可非议,但是随着URL数量的增长,HashSet的体积持续增加,查找速度也逐渐变慢。借鉴了一些成熟的解决方案后,决定用内存做缓存、用数据库做存储,实现一个用LRU(least recently used)算法包装的hash表做成的过滤器,来实现对URL的过滤。具体内容如下:
代码基于这样一个事实:1,一个网页之中包含的URL地址成簇出现(属于同一个IP地址)。2,集合A为某网站IP地址下经常被引用的URL集合,B为此网站中不经常被引用的URL集合。则待爬行的URL属于A集合的概率高于属于B的概率。
主要思想:根据以上两个事实:
1,将已爬行的url按IP地址分成不同集合,并记录其中每个ip地址被引用的次数(入度)
2,用内存中的hashmap作为缓存,用数据库做存储(为了加快速度,整个过程中判断的都是URL和IP地址的hashcode值) 。
3,内存中保存一部分IP地址和其对应的URL哈希表的映射。
4,用一个队列保存ip地址被命中的次数,当缓存容量大于指定值的时候,取出队列中最不常用(命中次数最小)的ip地址,
将其对应的 URL表存入数据库中。
5,对某个IP地址下缓存URL的数量做限制,如果超过某个值MAXIP,则将整个URL缓存表写入数据库,并从其中取出前MAXIP/2
个命中率最高的URL放入内存缓存。
过滤某URL步骤如下:
1,判断URL的ip地址是否被缓存,如果是,则用缓存的URL 哈希表过滤此URL。
2 ,如果URL的ip地址未被缓存,则在数据库中将ip对应的URL列表取出放入内存。然后进行1.
此过滤器的java代码如下:
package Processer;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Random;
import java.util.Map.Entry;
import database.DatabaseCon;
/**
 * Visited-URL filter backed by an in-memory cache (grouped by IP) and a
 * database table {@code crawler.visited(keyurl, hits, keyip)}.
 *
 * Only the {@code hashCode()} of the IP and URL strings are stored and
 * compared, trading a small collision risk for speed and memory. When the
 * cache exceeds {@code MAX} entries, the IP whose table was hit least often
 * is flushed to the database (an LFU-flavoured LRU eviction).
 *
 * Not thread-safe; callers must synchronize externally if shared.
 */
public class LRUFilter {
    private int CurrentCacheSize;   // number of URL entries currently cached in memory
    private int MAX;                // cap on total cached URL entries
    private int MAXIP;              // cap on cached/stored URL entries per IP
    // ip hashcode -> (url hashcode -> record): the in-memory URL cache, grouped by IP
    private HashMap<Integer,HashMap<Integer,DBElement>> rootMap;
    // ip hashcode -> count of URL records persisted in the database for that IP
    private HashMap<Integer,Int> DBMap;
    private Connection con;
    // eviction queue of cached IPs; the IP with the fewest hits is flushed first
    private LinkedList<Element> lQueue;
    // ip hashcode -> its queue element, for O(1) access without scanning lQueue
    private HashMap<Integer,Element> slQueue;

    /** Creates a filter with default capacities (20000 total, 10000 per IP). */
    public LRUFilter(Connection c){
        this(20000,10000,c);
    }

    /**
     * @param capable maximum number of URL hashcodes kept in memory overall
     * @param oneip   maximum number of URL hashcodes cached per IP
     * @param c       JDBC connection to the crawler database (auto-commit off;
     *                this class calls {@code commit()} itself)
     */
    public LRUFilter(int capable,int oneip,Connection c){
        MAX=capable;
        CurrentCacheSize=0;
        MAXIP=oneip;
        rootMap=new HashMap<Integer,HashMap<Integer,DBElement>>();
        DBMap=new HashMap<Integer,Int>();
        lQueue=new LinkedList<Element>();
        slQueue=new HashMap<Integer,Element>();
        con=c;
        initialCache();
    }

    /*
     * The IP's url table is cached: record a hit on the IP, then look the url
     * up, inserting it when absent. Returns true when the url was already
     * known (in the cache or, on the overflow path, in the database).
     */
    private boolean hit(HashMap<Integer,DBElement> urlMap,int ip,int url){
        boolean contain=false;
        // invariant: every IP present in rootMap also has a slQueue entry
        // (both are installed together in fromDB and removed together in toDB)
        slQueue.get(ip).hits++;
        DBElement dbe=urlMap.get(url);
        if(dbe!=null){
            contain=true;
            dbe.hits++;
            if(dbe.flag==0)
                dbe.flag=1;     // clean DB record is now dirty: hits must be written back
        }else{
            if(DBMap.get(ip).value<MAXIP){
                contain=false;
                urlMap.put(url, new DBElement(1,ip,2));  // flag 2: brand-new record
                CurrentCacheSize++;
                while(CurrentCacheSize>MAX)
                    toDB(0);                             // evict least-hit IPs until under MAX
                if(urlMap.size()>MAXIP){
                    // this IP's table overflowed: flush it, then reload only the hottest half
                    toDB(ip);
                    fromDB(ip);
                }
            }else{
                // the IP already has MAXIP records in the DB: bypass the cache
                contain=toDBDirect(url,new DBElement(1,ip,0));
                Int i=DBMap.get(ip);
                if(!contain&&i!=null)
                    i.value+=1;  // a new row was inserted; keep the per-IP count accurate
            }
        }
        return contain;
    }

    /** Cache miss on the IP: load its url table from the DB, then retry as a hit. */
    private boolean notHit(int ip,int url){
        return hit(fromDB(ip),ip,url);
    }

    /*
     * Prime DBMap with every keyip and its record count from the visited table,
     * so per-IP limits can be enforced without further counting queries.
     */
    private void initialCache(){
        try{
            String sql="use crawler;select keyip,count(keyip) from visited group by keyip";
            Statement stm=con.createStatement();
            ResultSet rs=stm.executeQuery(sql);
            while(rs.next())
                DBMap.put(rs.getInt(1), new Int(rs.getInt(2)));
            rs.close();
            stm.close();
        }catch(Exception e){
            // best-effort: an unreachable DB leaves DBMap empty and counts start at 0
            e.printStackTrace();
        }
    }

    /**
     * Filter entry point: has this url (under this ip) been seen before?
     * Distinct strings with colliding hashcodes are treated as equal — an
     * accepted trade-off of the hashcode-only design.
     *
     * @param ip  the host/IP the url belongs to
     * @param url the url to test and record
     * @return true if already visited, false if new (and now recorded)
     */
    public boolean contain(String ip,String url){
        int keyip=ip.hashCode();
        int keyurl=url.hashCode();
        HashMap<Integer,DBElement> urlMap=rootMap.get(keyip);
        if(urlMap!=null)
            return hit(urlMap,keyip,keyurl);
        else
            return notHit(keyip,keyurl);
    }

    /*
     * ip == 0: evict the least-hit IP from the queue and persist its url table,
     *          updating the per-IP record count in DBMap.
     * ip != 0: persist that specific IP's url table and drop it from the queue.
     * Returns false only when an eviction was requested and the queue is empty.
     * NOTE(review): 0 doubles as the "evict least-used" sentinel, so an IP
     * string whose hashCode() happens to be 0 can never be flushed explicitly —
     * confirm this collision is acceptable for the crawler's inputs.
     */
    private boolean toDB(int ip){
        HashMap<Integer,DBElement> urlMap;
        Int count;
        int num;
        if(ip==0){
            Collections.sort(lQueue,new MyComparator()); // least-hit IP moves to the front
            Element e=lQueue.poll();
            if(e!=null){
                ip=e.ip;
                slQueue.remove(ip);
                urlMap=rootMap.remove(ip);
                if(urlMap!=null){
                    num=writeToDB(urlMap);
                    count=DBMap.get(ip);
                    if(count!=null)
                        count.value+=num;
                    CurrentCacheSize-=urlMap.size();
                }
            }else
                return false; // nothing left in the cache
        }else{
            urlMap=rootMap.remove(ip);
            if(urlMap!=null){
                num=writeToDB(urlMap);
                count=DBMap.get(ip);
                if(count!=null)
                    count.value+=num;
                CurrentCacheSize-=urlMap.size();
            }
            lQueue.remove(slQueue.remove(ip));
        }
        return true;
    }

    /*
     * Load the url table for ip into the cache (creating an empty one for an
     * unseen ip) and register it in the eviction queue. When the ip has more
     * DB records than MAXIP, only the most-hit MAXIP/2 of them are loaded.
     */
    private HashMap<Integer,DBElement> fromDB(int ip){
        HashMap<Integer,DBElement> urlMap;
        Int i=DBMap.get(ip);
        if(i!=null)
            urlMap=readFromDB(ip,i.value>MAXIP);
        else{
            urlMap=new HashMap<Integer,DBElement>();
            DBMap.put(ip, new Int(0));
        }
        // make room before installing the new table
        while(urlMap.size()+CurrentCacheSize>MAX)
            toDB(0);
        // aging: reset the hit counters of the IPs already queued, so recent
        // activity (not lifetime totals) decides the next eviction
        for(int j=0;j<lQueue.size();j++)
            lQueue.get(j).hits=0;
        Element e=new Element(ip,0);
        lQueue.add(e);
        slQueue.put(ip, e);
        rootMap.put(ip, urlMap);
        CurrentCacheSize+=urlMap.size();
        while(CurrentCacheSize>MAX)
            toDB(0);
        return urlMap;
    }

    /*
     * Persist one IP's url table: batch-insert the new records (flag 2) and
     * batch-update the dirty ones (flag 1); clean records (flag 0) are skipped.
     * Returns the number of records newly inserted.
     */
    private int writeToDB(HashMap<Integer,DBElement> urlDBMap){
        boolean insertAble=false,updateAble=false;
        int num=0;
        try{
            PreparedStatement insertStm=con.prepareStatement("use crawler;insert into visited values(?,?,?);");
            for(Entry<Integer,DBElement> entry:urlDBMap.entrySet()){
                int keyurl=entry.getKey();
                DBElement e=entry.getValue();
                if(e.flag==2){
                    insertStm.setInt(1, keyurl);
                    insertStm.setInt(2, e.hits);
                    insertStm.setInt(3, e.keyip);
                    insertStm.addBatch();
                    num++;
                    insertAble=true;
                }
            }
            if(insertAble)
                insertStm.executeBatch();
            insertStm.close();
            PreparedStatement updateStm=con.prepareStatement("use crawler;update visited set hits=? where keyurl=?;");
            for(Entry<Integer,DBElement> entry:urlDBMap.entrySet()){
                int keyurl=entry.getKey();
                DBElement e=entry.getValue();
                if(e.flag==1){
                    updateStm.setInt(1, e.hits);
                    updateStm.setInt(2, keyurl);
                    updateStm.addBatch();
                    updateAble=true;
                }
            }
            if(updateAble)
                updateStm.executeBatch();
            updateStm.close();
            con.commit();
        }catch(Exception e){
            e.printStackTrace();
        }
        return num;
    }

    /**
     * Read the url records of one ip from the database. When {@code exceed} is
     * true only the MAXIP/2 most frequently hit records are returned.
     *
     * @param ip     hashcode of the ip whose records are wanted
     * @param exceed true when the ip has more than MAXIP records stored
     * @return url hashcode -> record map (empty on DB failure)
     */
    public HashMap<Integer,DBElement> readFromDB(int ip,boolean exceed){
        HashMap<Integer,DBElement> urlMap=new HashMap<Integer,DBElement>();
        String sql;
        int count=MAXIP/2;
        if(exceed)
            // BUG FIX: "top N" without ORDER BY returns N arbitrary rows; order
            // by hits so the most frequently referenced urls are the ones kept.
            sql="select top "+count+" keyurl,hits from visited where keyip=? order by hits desc;";
        else
            sql="select keyurl,hits from visited where keyip=?;";
        try{
            PreparedStatement stm=con.prepareStatement(sql);
            stm.setInt(1, ip);
            ResultSet rs=stm.executeQuery();
            while(rs.next())
                urlMap.put(rs.getInt(1),new DBElement(rs.getInt(2)));
            rs.close();
            stm.close();
        }catch(Exception e){
            e.printStackTrace();
        }
        return urlMap;
    }

    /*
     * Insert/update a single record directly in the database, bypassing the
     * cache. Returns true when the url already existed (its hits are bumped),
     * false when a fresh row was inserted. Values are ints, so the string
     * concatenation here cannot inject SQL.
     */
    private boolean toDBDirect(int keyurl,DBElement dbe){
        boolean contain=false;
        try{
            Statement stm=con.createStatement();
            String sql;
            ResultSet rs=stm.executeQuery("use crawler;select hits from visited where keyurl="+keyurl+";");
            if(rs.next()){
                contain=true;
                int hits=rs.getInt(1)+dbe.hits;
                sql="use crawler;update visited set hits="+hits+" where keyurl="+keyurl+";";
            }else{
                contain=false;
                sql="use crawler;insert into visited values("+keyurl+","+dbe.hits+","+dbe.keyip+");";
            }
            stm.executeUpdate(sql);
            rs.close();
            stm.close();
        }catch(Exception e){
            e.printStackTrace();
        }
        return contain;
    }

    /** Flush everything still cached to the database (call once at shutdown). */
    public void store(){
        while(toDB(0));
    }

    /** Eviction-queue node: an IP hashcode and how often its table was hit. */
    private static class Element{
        public Element(int i,int h){ip=i;hits=h;}
        public int ip;
        public int hits;
    }
    /** Mutable int box so counts held in DBMap can be updated in place. */
    private static class Int{
        public Int(int v){value=v;}
        public int value=0;
    }
    /** One cached url record. */
    private static class DBElement{
        public DBElement(int h,int k,int f){hits=h;keyip=k;flag=f;}
        public DBElement(int h){hits=h;}
        public int hits=0;
        public int keyip=0;
        /*
         * flag: 0 = unchanged DB record
         *       1 = DB record whose hits changed (needs update)
         *       2 = new record not yet in the DB (needs insert)
         */
        public int flag=0;
    }
    /** Orders eviction-queue elements by ascending hit count. */
    private static class MyComparator implements Comparator<Element>{
        public int compare(Element e1,Element e2){
            // BUG FIX: the original tested e1.hits < e1.hits (always false), so
            // the sort never reordered anything and eviction degenerated to FIFO
            // instead of least-hit-first.
            if(e1.hits<e2.hits)
                return -1;
            else if(e1.hits>e2.hits)
                return 1;
            else
                return 0;
        }
    }
}