在Heritrix中抓取所需的定制类

最新推荐文章于 2021-02-12 19:55:44 发布

caoxu1987728

最新推荐文章于 2021-02-12 19:55:44 发布

阅读量1.3k

点赞数

分类专栏： All Spiders 文章标签： mobile null url 扩展工作

本文链接：https://blog.csdn.net/caoxu1987728/article/details/2428453

版权

All Spiders 专栏收录该内容

42 篇文章 0 订阅

订阅专栏

对比一下以前的代码，基本格式都是相同的

http://blog.csdn.net/caoxu1987728/archive/2008/05/06/2402204.aspx

package my.extractor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Logger;
import java.util.logging.Level;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.ExtractorHTML;
import org.archive.crawler.extractor.Link;

import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;

import org.archive.io.ReplayCharSequence;
import org.archive.util.HttpRecorder;
import org.archive.util.TextUtils;

public class Mobile163Extractor extends Extractor
{
     protected boolean ignoreUnexceptionHTML = true ;
     private static final String url163 = " http://mobile.163.com/0011/product/0011000B/special/1/left.html " ;
     private static Logger logger = Logger.getLogger(Mobile163Extractor. class .getName());
     public Mobile163Extractor(String name)
    {
         this (name, " Mobile163 extractor.Extracts links from HTML documents " );
    }
     public Mobile163Extractor(String name,String description)
    {
         super (name,description);
    }
     protected void extract(CrawlURI curi)
    {
        String url = curi.toString();
         if (url.equals(url163))
        {
            ReplayCharSequence cs = null ;
             try
            {
                HttpRecorder hr = curi.getHttpRecorder();
                 if (hr == null )
                {
                     throw new IOException( " why is recorder null here " );
                }
                cs = hr.getReplayCharSequence();
            }
             catch (IOException e)
            {
                curi.addLocalizedError( this .getName(),e, " failed get of replay char sequence " + curi.toString()
                         + "" + e.getMessage());
                logger.log(Level.SEVERE, " failed get of replay char sequenece in "
                         + Thread.currentThread().getName(),e);
            }
             if (cs == null )
            {
                 return ;
            }

            String content = cs.toString();
             try
            {
                BufferedReader reader = new BufferedReader( new StringReader(content));
                String line = reader.readLine();
                 while (line != null )
                {
                     if (line.endsWith( " .html" " ))
                    {
                        String fullstr = null ;
                        fullstr = " http://mobile.163.com " + line.substring(line.indexOf( " url: " ) + 4 ,line.length() - 1 );
                        addLinkFromString(curi,fullstr, "" ,Link.NAVLINK_HOP);
                        System.out.println(fullstr);
                    }
                    line = reader.readLine();
                }
            }
             catch (Exception e)
            {
                e.printStackTrace();
            }
        }
    }
    private void addLinkFromString(CrawlURI curi,String url,CharSequence context, char hopType)
   {
        try
       {
           curi.createAndAddLinkRelativeToBase(url, context.toString(), hopType);
       }
        catch (URIException e)
       {
            if (getController() != null )
           {
               getController().logUriError(e,curi.getUURI(),url);
           }
            else
           {
               logger.info( " failed creatAndAddLinkRelativeToBase " + curi + " , " + url + " , " + context + " , " + hopType + " : " + e);
           }
       }
   }
}

再扩展一下善后工作吧

package my.postprocessor;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.postprocessor.FrontierScheduler;

public class FrontierSchedulerFor163Mobile extends FrontierScheduler
{
     private static final Logger LOGGER = Logger.getLogger(FrontierSchedulerFor163Mobile. class .getName());
     public FrontierSchedulerFor163Mobile(String name)
    {
         super (name);
    }
     protected void schedule(CandidateURI caUri)
    {
        String url = caUri.toString();
         try
        {
             if (url.indexOf( " mobile.163.com/0011/product/0011000B/product " ) !=- 1
                 || url.indexOf( " mobile.163.com/0011/product/0011000B/mark " ) !=- 1
                     || url.endsWith( " .gif " )
                     || url.endsWith( " .jpg " )
                     || url.endsWith( " .jpeg " )
                     || url.indexOf( " robots.txt " ) !=- 1
                     || url.indexOf( " dns " ) !=- 1 )
            {
                 if (url.indexOf( " # " ) !=- 1 )
                    getController().getFrontier().schedule(caUri)
            }
             else
            {
                 return ;
            }
        }
         catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}

这个就不多说了，与前一篇用的是同一个模型。