在Heritrix中抓取所需的定制类

对比一下以前的代码,基本格式都是相同的

http://blog.csdn.net/caoxu1987728/archive/2008/05/06/2402204.aspx

package  my.extractor;
import  java.io.BufferedReader;
import  java.io.IOException;
import  java.io.StringReader;
import  java.util.logging.Logger;
import  java.util.logging.Level;

import  org.apache.commons.httpclient.URIException;
import  org.archive.crawler.datamodel.CrawlURI;
import  org.archive.crawler.extractor.Extractor;
import  org.archive.crawler.extractor.ExtractorHTML;
import  org.archive.crawler.extractor.Link;

import  org.archive.crawler.settings.SimpleType;
import  org.archive.crawler.settings.Type;

import  org.archive.io.ReplayCharSequence;
import  org.archive.util.HttpRecorder;
import  org.archive.util.TextUtils;

public   class  Mobile163Extractor  extends  Extractor
{
    
protected   boolean  ignoreUnexceptionHTML = true ;
    
private   static   final  String url163 = " http://mobile.163.com/0011/product/0011000B/special/1/left.html " ;
    
private   static  Logger logger = Logger.getLogger(Mobile163Extractor. class .getName());
    
public  Mobile163Extractor(String name)
    {
        
this (name, " Mobile163 extractor.Extracts links from HTML documents " );
    }
    
public  Mobile163Extractor(String name,String description)
    {
        
super (name,description);
    }
    
protected   void  extract(CrawlURI curi)
    {
        String url
= curi.toString();
        
if (url.equals(url163))
        {
            ReplayCharSequence cs
= null ;
            
try
            {
                HttpRecorder hr
= curi.getHttpRecorder();
                
if (hr == null )
                {
                    
throw   new  IOException( " why is recorder null here " );
                }
                cs
= hr.getReplayCharSequence();
            }
            
catch (IOException e)
            {
                curi.addLocalizedError(
this .getName(),e, " failed get of replay char sequence " + curi.toString()
                        
+ "" + e.getMessage());
                logger.log(Level.SEVERE,
" failed get of replay char sequenece in "
                        
+ Thread.currentThread().getName(),e);
            }
            
if (cs == null )
            {
                
return ;
            }
            
            String content
= cs.toString();
            
try
            {
                BufferedReader reader
= new  BufferedReader( new  StringReader(content));
                String line
= reader.readLine();
                
while (line != null )
                {
                    
if (line.endsWith( " .html" " ))
                    {
                        String fullstr
= null ;
                        fullstr
= " http://mobile.163.com " + line.substring(line.indexOf( " url: " ) + 4 ,line.length() - 1 );
                        addLinkFromString(curi,fullstr,
"" ,Link.NAVLINK_HOP);
                        System.out.println(fullstr);
                    }
                    line
= reader.readLine();
                }
            }
            
catch (Exception e)
            {
                e.printStackTrace();
            }
        }
    }
   
private   void  addLinkFromString(CrawlURI curi,String url,CharSequence context, char  hopType)
   {
       
try
       {
           curi.createAndAddLinkRelativeToBase(url, context.toString(), hopType);
       }
       
catch (URIException e)
       {
           
if (getController() != null )
           {
               getController().logUriError(e,curi.getUURI(),url);
           }
           
else
           {
               logger.info(
" failed creatAndAddLinkRelativeToBase " + curi + " , " + url + " , " + context + " , " + hopType + " : " + e);
           }
       }
   }
}

再扩展一下善后工作吧

package  my.postprocessor;
import  java.util.logging.Logger;

import  org.archive.crawler.datamodel.CandidateURI;
import  org.archive.crawler.postprocessor.FrontierScheduler;

public   class  FrontierSchedulerFor163Mobile  extends  FrontierScheduler
{
    
private   static   final  Logger LOGGER = Logger.getLogger(FrontierSchedulerFor163Mobile. class .getName());
    
public  FrontierSchedulerFor163Mobile(String name)
    {
        
super (name);
    }
    
protected   void  schedule(CandidateURI caUri)
    {
        String url
= caUri.toString();
        
try
        {
            
if (url.indexOf( " mobile.163.com/0011/product/0011000B/product " ) !=- 1
                
|| url.indexOf( " mobile.163.com/0011/product/0011000B/mark " ) !=- 1
                    
|| url.endsWith( " .gif " )
                    
|| url.endsWith( " .jpg " )
                    
|| url.endsWith( " .jpeg " )
                    
|| url.indexOf( " robots.txt " ) !=- 1
                    
|| url.indexOf( " dns " ) !=- 1 )
            {
                
if (url.indexOf( " # " ) !=- 1 )
                    getController().getFrontier().schedule(caUri)
            }
            
else
            {
                
return ;
            }
        }
        
catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}

这个就不多说了,与前一篇一个模型。

 

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值