Java开源Spider

构建于lucene之上的可用的Java开源Spider少之又少,spindle长期没有更新且功能不够完善,故而自己参考其源
代码重新编写了一个可扩展的WebCrawler,本着开源共享,共同进步的想法发布于此,期冀得到大家的批评指正,
有任何意见及建议均可Email联系我 (kaninebruno@hotmail.com)
   以下代码基于lucene-2.3.1,htmlparser-1.6,je-analysis-1.5.3,以及自己修改过的cpdetector-1.0.5;
下载地址分别为
htmlparser:http://sourceforge.net/project/showfiles.php?group_id=24399
je-analysis:http://www.jesoft.cn/je-analysis-1.5.3.jar
lucene就不用说了,cpdetector-1.0.5见附件.
spindle的官方站点:http://www.bitmechanic.com/projects/spindle/

Java代码 复制代码
  1. package com.huizhi.kanine.util;   
  2.   
  3. import java.io.BufferedReader;   
  4. import java.io.File;   
  5. import java.io.FileNotFoundException;   
  6. import java.io.IOException;   
  7. import java.io.InputStream;   
  8. import java.io.InputStreamReader;   
  9. import java.io.UnsupportedEncodingException;   
  10. import java.net.HttpURLConnection;   
  11. import java.net.MalformedURLException;   
  12. import java.net.SocketException;   
  13. import java.net.SocketTimeoutException;   
  14. import java.net.URL;   
  15. import java.net.UnknownHostException;   
  16. import java.nio.charset.Charset;   
  17. import java.util.ArrayList;   
  18. import java.util.Date;   
  19. import java.util.HashSet;   
  20.   
  21. import jeasy.analysis.MMAnalyzer;   
  22.   
  23. import org.apache.lucene.analysis.Analyzer;   
  24. import org.apache.lucene.document.DateTools;   
  25. import org.apache.lucene.document.Document;   
  26. import org.apache.lucene.document.Field;   
  27. import org.apache.lucene.index.CorruptIndexException;   
  28. import org.apache.lucene.index.IndexReader;   
  29. import org.apache.lucene.index.IndexWriter;   
  30. import org.apache.lucene.index.Term;   
  31. import org.apache.lucene.search.Hits;   
  32. import org.apache.lucene.search.IndexSearcher;   
  33. import org.apache.lucene.search.TermQuery;   
  34. import org.apache.lucene.store.Directory;   
  35. import org.apache.lucene.store.LockObtainFailedException;   
  36. import org.apache.lucene.store.RAMDirectory;   
  37. import org.htmlparser.Parser;   
  38. import org.htmlparser.PrototypicalNodeFactory;   
  39. import org.htmlparser.filters.AndFilter;   
  40. import org.htmlparser.filters.HasAttributeFilter;   
  41. import org.htmlparser.filters.NodeClassFilter;   
  42. import org.htmlparser.tags.BaseHrefTag;   
  43. import org.htmlparser.tags.FrameTag;   
  44. import org.htmlparser.tags.LinkTag;   
  45. import org.htmlparser.tags.MetaTag;   
  46. import org.htmlparser.util.EncodingChangeException;   
  47. import org.htmlparser.util.NodeIterator;   
  48. import org.htmlparser.util.NodeList;   
  49. import org.htmlparser.util.ParserException;   
  50. import org.htmlparser.visitors.HtmlPage;   
  51.   
  52. import cpdetector.io.ASCIIDetector;   
  53. import cpdetector.io.CodepageDetectorProxy;   
  54. import cpdetector.io.JChardetFacade;   
  55. import cpdetector.io.ParsingDetector;   
  56. import cpdetector.io.UnicodeDetector;   
  57.   
  58.   
  59. /**  
  60.  * @author 张波   
  61.  * E-mail:kaninebruno@hotmail.com   
  62.  * Created On : 2008-03-30  
  63.  */  
  64. public class SiteCapturer implements Runnable{   
  65.        
  66.     /* 基准(初始)URL */  
  67.     protected URL mSource;   
  68.   
  69.     /* 索引文件的存放位置 */  
  70.     protected String mTarget;   
  71.   
  72.     /**  
  73.      * 待解析的URL地址集合,所有新检测到的链接均存放于此;  
  74.      * 解析时按照先入先出(First-In First-Out)法则线性取出  
  75.      */  
  76.     protected ArrayList mPages;   
  77.   
  78.     /* 已解析的URL地址集合,避免链接的重复抓取 */  
  79.     protected HashSet mFinished;   
  80.   
  81.     protected Parser mParser;   
  82.        
  83.     /* StringBuffer的缓冲区大小 */  
  84.     protected  final int TRANSFER_SIZE = 4096;   
  85.        
  86.     /* 当前平台的行分隔符 */  
  87.     protected static String lineSep = System.getProperty("line.separator");   
  88.        
  89.     /* 程序运行线程数,默认2个线程 */  
  90.     protected int mthreads;   
  91.        
  92.     protected ArrayList threadList;   
  93.        
  94.     /* 存储于磁盘的IndexWriter */  
  95.     protected IndexWriter FSDWriter;   
  96.        
  97.     /* 存储于内存的IndexWriter */  
  98.     protected IndexWriter RAMWriter;   
  99.   
  100.     protected IndexSearcher indexSearcher;   
  101.   
  102.     protected RAMDirectory ramDirectory;   
  103.        
  104.     /* 筛选页面内容的分词器 */  
  105.     protected Analyzer luceneAnalyzer;   
  106.   
  107.     /* 解析页面时的字符编码 */  
  108.     protected String charset;   
  109.        
  110.     /* 统计已抓取的页面数量 */  
  111.     protected int count = 0;   
  112.        
  113.     /* 基准端口 */  
  114.     protected int mPort;   
  115.        
  116.     /* 基准主机 */  
  117.     protected String mHost;   
  118.        
  119.     /* 检测索引中是否存在当前URL信息,避免重复抓取 */  
  120.     protected boolean mCheck;   
  121.   
  122.     /* 索引操作的写入线程锁 */  
  123.     public static final Object indexLock = new Object();   
  124.        
  125.     public SiteCapturer() {   
  126.         mSource = null;   
  127.         mTarget = null;   
  128.         mthreads = 2;   
  129.         mCheck = false;   
  130.         mPages = new ArrayList();   
  131.         mFinished = new HashSet();   
  132.         mParser = new Parser();   
  133.         PrototypicalNodeFactory factory = new PrototypicalNodeFactory();   
  134.         factory.registerTag(new LocalLinkTag());   
  135.         factory.registerTag(new LocalFrameTag());   
  136.         factory.registerTag(new LocalBaseHrefTag());   
  137.         mParser.setNodeFactory(factory);   
  138.     }   
  139.   
  140.     public String getSource() {   
  141.         return mSource.toString();   
  142.     }   
  143.   
  144.     public void setSource(String source) {   
  145.         if (source.endsWith("/"))   
  146.             source = source.substring(0, source.length() - 1);   
  147.         try {   
  148.             mSource = new URL(source);   
  149.         } catch (MalformedURLException e) {   
  150.             System.err.println("Invalid URL : " + getSource());   
  151.         }   
  152.     }   
  153.   
  154.     public String getTarget() {   
  155.         return (mTarget);   
  156.     }   
  157.   
  158.     public void setTarget(String target) {   
  159.         mTarget = target;   
  160.     }   
  161.        
  162.     public int getThreads() {   
  163.         return (mthreads);   
  164.     }   
  165.   
  166.     public void setThreads(int threads) {   
  167.         mthreads = threads;   
  168.     }   
  169.        
  170.     public boolean isMCheck() {   
  171.         return mCheck;   
  172.     }   
  173.   
  174.     public void setMCheck(boolean check) {   
  175.         mCheck = check;   
  176.     }   
  177.   
  178.     /**  
  179.      * 程序入口,在此初始化mPages、IndexWriter  
  180.      * 通过协调各线程间的活动完成website的抓取工作  
  181.      * 任务完成后将所有的索引片段合并为一个以优化检索  
  182.      */  
  183.     public void capture(){   
  184.   
  185.         mPages.clear();   
  186.         mPages.add(getSource());   
  187.            
  188.         int responseCode = 0;   
  189.         String contentType = "";   
  190.            
  191.         try {   
  192.             HttpURLConnection uc = (HttpURLConnection) mSource.openConnection();   
  193.             responseCode = uc.getResponseCode();   
  194.             contentType = uc.getContentType();   
  195.         } catch (MalformedURLException mue) {   
  196.             System.err.println("Invalid URL : " + getSource());   
  197.         } catch (IOException ie) {   
  198.             if (ie instanceof UnknownHostException) {   
  199.                 System.err.println("UnknowHost : " + getSource());   
  200.             } else if (ie instanceof SocketException) {   
  201.                 System.err.println("Socket Error : " + ie.getMessage() + " "  
  202.                         + getSource());   
  203.             } else  
  204.                 ie.printStackTrace();   
  205.         }   
  206.            
  207.         if (responseCode == HttpURLConnection.HTTP_OK   
  208.                 && contentType.startsWith("text/html")) {   
  209.                
  210.             mPort = mSource.getPort();   
  211.             mHost = mSource.getHost();   
  212.             charset = autoDetectCharset(mSource);   
  213.   
  214.             /* 存放索引文件的位置 */  
  215.             File indexDir = new File(mTarget);   
  216.             /* 标记是否重新建立索引,true为重新建立索引 */  
  217.             boolean flag = true;   
  218.             if (!indexDir.exists()) {   
  219.                 /* 如果文件夹不存在则创建 */  
  220.                 indexDir.mkdir();   
  221.             } else if (IndexReader.indexExists(mTarget)) {   
  222.                 /* 如果已存在索引,则追加索引 */  
  223.                 flag = false;   
  224.                 File lockfile = new File(mTarget + File.separator + "write.lock");   
  225.                 if (lockfile.exists())   
  226.                     lockfile.delete();   
  227.             }   
  228.             luceneAnalyzer = new MMAnalyzer();   
  229.             ramDirectory = new RAMDirectory();   
  230.   
  231.             try {   
  232.                 FSDWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);   
  233.                 RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true);   
  234.                    
  235.                 while (mCheck) {   
  236.                     IndexReader indexReader = IndexReader.open(mTarget);   
  237.                     indexSearcher = new IndexSearcher(indexReader);   
  238.                 }   
  239.                    
  240.                 long start = System.currentTimeMillis();   
  241.                 threadList = new ArrayList();   
  242.   
  243.                 for (int i = 0; i < mthreads; i++) {   
  244.                 Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1));   
  245.                     t.start();   
  246.                     threadList.add(t);   
  247.                 }   
  248.                 while (threadList.size() > 0) {   
  249.                     Thread child = (Thread) threadList.remove(0);   
  250.                     try {   
  251.                         child.join();   
  252.                     } catch (InterruptedException e) {   
  253.                         e.printStackTrace();   
  254.                     }   
  255.                 }   
  256.                 long elapsed = System.currentTimeMillis() - start;   
  257.   
  258.                 RAMWriter.close();   
  259.                 FSDWriter.addIndexes(new Directory[] { ramDirectory });   
  260.                 FSDWriter.optimize();   
  261.                 FSDWriter.close();   
  262.   
  263.                 System.out.println("Finished in " + (elapsed / 1000)   
  264.                         + " seconds");   
  265.                 System.out.println("The Count of the Links Captured is "  
  266.                         + count);   
  267.             } catch (CorruptIndexException cie) {   
  268.                 cie.printStackTrace();   
  269.             } catch (LockObtainFailedException lofe) {   
  270.                 lofe.printStackTrace();   
  271.             } catch (IOException ie) {   
  272.                 ie.printStackTrace();   
  273.             }   
  274.         }       
  275.     }   
  276.        
  277.     public void run() {   
  278.         String url;   
  279.         while ((url = dequeueURL()) != null) {   
  280.             if (isToBeCaptured(url))   
  281.                 process(url);   
  282.         }   
  283.         mthreads--;   
  284.     }   
  285.   
  286.     /**  
  287.      * 判断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain  
  288.      */  
  289.     public boolean isToBeCaptured (String url){   
  290.         boolean flag = false;   
  291.            
  292.         HttpURLConnection uc = null;   
  293.         int responseCode = 0;   
  294.         String contentType = "";   
  295.         String host = "";   
  296.         int port = 0;   
  297.            
  298.         try {   
  299.             URL source = new URL(url);   
  300.             String protocol = source.getProtocol();   
  301.             if (protocol != null && protocol.equals("http")) {   
  302.                 host = source.getHost();   
  303.                 port = source.getPort();   
  304.                 uc = (HttpURLConnection) source.openConnection();   
  305.                 uc.setConnectTimeout(8000);   
  306.                 responseCode = uc.getResponseCode();   
  307.                 contentType = uc.getContentType();   
  308.             }   
  309.         } catch (MalformedURLException mue) {   
  310.             System.err.println("Invalid URL : " + url);   
  311.         } catch (IOException ie) {   
  312.             if (ie instanceof UnknownHostException) {   
  313.                 System.err.println("UnknowHost : " + url);   
  314.             } else if (ie instanceof SocketException) {   
  315.                 System.err.println("Socket Error : " + ie.getMessage() + " "  
  316.                         + url);   
  317.             } else if (ie instanceof SocketTimeoutException) {   
  318.                 System.err.println("Socket Connection Time Out : " + url);   
  319.             } else if (ie instanceof FileNotFoundException) {   
  320.                 System.err.println("broken link "  
  321.                         + ((FileNotFoundException) ie.getCause()).getMessage()   
  322.                         + " ignored");   
  323.             } else  
  324.                 ie.printStackTrace();   
  325.         }   
  326.            
  327.         if (port == mPort   
  328.                 && responseCode == HttpURLConnection.HTTP_OK   
  329.                 && host.equals(mHost)   
  330.                 && (contentType.startsWith("text/html") || contentType   
  331.                         .startsWith("text/plain")))   
  332.             flag = true;   
  333.         return flag;   
  334.     }   
  335.   
  336.     /* 从URL队列mPages里取出单个的URL */  
  337.     public synchronized String dequeueURL() {   
  338.         while (true) {   
  339.             if (mPages.size() > 0) {   
  340.                 String url = (String) mPages.remove(0);   
  341.                 mFinished.add(url);   
  342.                    
  343.                 if (isToBeCaptured(url)) {   
  344.                     int bookmark;   
  345.                     NodeList list;   
  346.                     NodeList robots;   
  347.                     MetaTag robot;   
  348.                     String content;   
  349.                     try {   
  350.                         bookmark = mPages.size();   
  351.                         /* 获取页面所有节点 */  
  352.                         mParser.setURL(url);   
  353.                         try {   
  354.                             list = new NodeList();   
  355.                             for (NodeIterator e = mParser.elements(); e   
  356.                                     .hasMoreNodes();)   
  357.                                 list.add(e.nextNode());   
  358.                         } catch (EncodingChangeException ece) {   
  359.                             /* 解码出错的异常处理 */  
  360.                             mParser.reset();   
  361.                             list = new NodeList();   
  362.                             for (NodeIterator e = mParser.elements(); e   
  363.                                     .hasMoreNodes();)   
  364.                                 list.add(e.nextNode());   
  365.                         }   
  366.                         /**  
  367.                          * 依据 http://www.robotstxt.org/wc/meta-user.html 处理  
  368.                          * Robots  tag  
  369.                          */  
  370.                         robots = list   
  371.                                 .extractAllNodesThatMatch(   
  372.                                         new AndFilter(new NodeClassFilter(   
  373.                                                 MetaTag.class),   
  374.                                                 new HasAttributeFilter("name",   
  375.                                                         "robots")), true);   
  376.                         if (0 != robots.size()) {   
  377.                             robot = (MetaTag) robots.elementAt(0);   
  378.                             content = robot.getAttribute("content")   
  379.                                     .toLowerCase();   
  380.                             if ((-1 != content.indexOf("none"))   
  381.                                     || (-1 != content.indexOf("nofollow")))   
  382.                                 for (int i = bookmark; i < mPages.size(); i++)   
  383.                                     mPages.remove(i);   
  384.                         }   
  385.                     } catch (ParserException pe) {   
  386.                         pe.printStackTrace();   
  387.                     }   
  388.                 }   
  389.                 return url;   
  390.             } else {   
  391.                 mthreads--;   
  392.                 if (mthreads > 0) {   
  393.                     try {   
  394.                         wait();   
  395.                         mthreads++;   
  396.                     } catch (InterruptedException ie) {   
  397.                         ie.printStackTrace();   
  398.                     }   
  399.                 } else {   
  400.                     notifyAll();   
  401.                     return null;   
  402.                 }   
  403.             }   
  404.         }   
  405.     }   
  406.   
  407.     /**  
  408.      * 处理单独的URL地址,解析页面并加入到lucene索引中;通过自动探测页面编码保证抓取工作的顺利执行  
  409.      */  
  410.     protected void process(String url) {   
  411.            
  412.         String result[];   
  413.         String content = null;   
  414.         String title = null;   
  415.   
  416.         /* 此项操作较耗性能,故默认不予检测 */  
  417.         if (mCheck) {   
  418.             try {   
  419.                 TermQuery query = new TermQuery(new Term("url", url));   
  420.                 Hits hits = indexSearcher.search(query);   
  421.                 if (hits.length() > 0) {   
  422.                     System.out.println("The URL : " + url   
  423.                             + " has already been captured");   
  424.                 } else {   
  425.                     result = parseHtml(url, charset);   
  426.                     content = result[0];   
  427.                     title = result[1];   
  428.                 }   
  429.             } catch (IOException ie) {   
  430.                 ie.printStackTrace();   
  431.             }   
  432.         } else {   
  433.             result = parseHtml(url, charset);   
  434.             content = result[0];   
  435.             title = result[1];   
  436.         }   
  437.            
  438.         if (content != null && content.trim().length() > 0) {   
  439.   
  440.             Document document = new Document();   
  441.             document.add(new Field("content", content, Field.Store.YES,   
  442.                     Field.Index.TOKENIZED,   
  443.                     Field.TermVector.WITH_POSITIONS_OFFSETS));   
  444.             document.add(new Field("url", url, Field.Store.YES,   
  445.                     Field.Index.UN_TOKENIZED));   
  446.             document.add(new Field("title", title, Field.Store.YES,   
  447.                     Field.Index.TOKENIZED,   
  448.                     Field.TermVector.WITH_POSITIONS_OFFSETS));   
  449.             document.add(new Field("date", DateTools.timeToString(new Date()   
  450.                     .getTime(), DateTools.Resolution.DAY), Field.Store.YES,   
  451.                     Field.Index.UN_TOKENIZED));   
  452.                
  453.             synchronized (indexLock) {   
  454.                 try {   
  455.                     RAMWriter.addDocument(document);   
  456.                     /**  
  457.                      * 当存放索引的内存使用大于指定值时将其写入硬盘;采用此方法的目的是  
  458.                      * 通过内存缓冲避免频繁的IO操作,提高索引创建性能;  
  459.                      */  
  460.                     if (RAMWriter.ramSizeInBytes() > 512 * 1024) {   
  461.                         RAMWriter.close();   
  462.                         FSDWriter.addIndexes(new Directory[] { ramDirectory });   
  463.                         RAMWriter = new IndexWriter(ramDirectory,   
  464.                                 luceneAnalyzer, true);   
  465.                     }   
  466.                     count++;   
  467.                     System.out.println(Thread.currentThread().getName()   
  468.                             + ": Finished Indexing URL: " + url);   
  469.                 } catch (CorruptIndexException cie) {   
  470.                     cie.printStackTrace();   
  471.                 } catch (IOException ie) {   
  472.                     ie.printStackTrace();   
  473.                 }   
  474.             }   
  475.         }   
  476.     }   
  477.   
  478.     /**  
  479.      * Link tag that rewrites the HREF.  
  480.      * The HREF is changed to a local target if it matches the source.  
  481.      */  
  482.     class LocalLinkTag extends LinkTag {   
  483.         public void doSemanticAction() {   
  484.   
  485.             String link = getLink();   
  486.             if (link.endsWith("/"))   
  487.                 link = link.substring(0, link.length() - 1);   
  488.             int pos = link.indexOf("#");   
  489.             if (pos != -1)   
  490.                 link = link.substring(0, pos);   
  491.   
  492.             /* 将链接加入到处理队列中 */  
  493.             if (!(mFinished.contains(link) || mPages.contains(link)))   
  494.                 mPages.add(link);   
  495.   
  496.             setLink(link);   
  497.         }   
  498.     }   
  499.   
  500.     /**  
  501.      * Frame tag that rewrites the SRC URLs. The SRC URLs are mapped to local  
  502.      * targets if they match the source.  
  503.      */  
  504.     class LocalFrameTag extends FrameTag {   
  505.         public void doSemanticAction() {   
  506.   
  507.             String link = getFrameLocation();   
  508.             if (link.endsWith("/"))   
  509.                 link = link.substring(0, link.length() - 1);   
  510.             int pos = link.indexOf("#");   
  511.             if (pos != -1)   
  512.                 link = link.substring(0, pos);   
  513.   
  514.             /* 将链接加入到处理队列中 */  
  515.             if (!(mFinished.contains(link) || mPages.contains(link)))   
  516.                 mPages.add(link);   
  517.   
  518.             setFrameLocation(link);   
  519.         }   
  520.     }   
  521.   
  522.     /**  
  523.      * Base tag that doesn't show. The toHtml() method is overridden to return  
  524.      * an empty string, effectively shutting off the base reference.  
  525.      */  
  526.     class LocalBaseHrefTag extends BaseHrefTag {   
  527.            
  528.         public String toHtml() {   
  529.             return ("");   
  530.         }   
  531.     }   
  532.        
  533.     /* 自动探测页面编码,避免中文乱码的出现 */  
  534.     protected String autoDetectCharset(URL url) {   
  535.            
  536.         CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();   
  537.         /**  
  538.          * ParsingDetector可用于检查HTML、XML等文件或字符流的编码  
  539.          * 构造方法中的参数用于指示是否显示探测过程的详细信息  
  540.          * 为false则不显示  
  541.          */    
  542.         detector.add(new ParsingDetector(false));   
  543.         detector.add(JChardetFacade.getInstance());   
  544.         detector.add(ASCIIDetector.getInstance());   
  545.         detector.add(UnicodeDetector.getInstance());   
  546.            
  547.         Charset charset = null;   
  548.         try {   
  549.             charset = detector.detectCodepage(url);   
  550.         } catch (MalformedURLException mue) {   
  551.             mue.printStackTrace();   
  552.         } catch (IOException ie) {   
  553.             ie.printStackTrace();   
  554.         }   
  555.         if (charset == null)   
  556.             charset = Charset.defaultCharset();   
  557.         return charset.name();   
  558.     }   
  559.   
  560.     /* 按照指定编码解析标准的html页面,为建立索引做准备*/  
  561.     protected String[] parseHtml(String url, String charset) {   
  562.   
  563.         String result[] = null;   
  564.         String content = null;   
  565.            
  566.         try {   
  567.             URL source = new URL(url);   
  568.             InputStream in = source.openStream();   
  569.             BufferedReader reader = new BufferedReader(new InputStreamReader(   
  570.                     in, charset));   
  571.             String line = new String();   
  572.             StringBuffer temp = new StringBuffer(TRANSFER_SIZE);   
  573.             while ((line = reader.readLine()) != null) {   
  574.                 temp.append(line);   
  575.                 temp.append(lineSep);   
  576.             }   
  577.             reader.close();   
  578.             in.close();   
  579.             content = temp.toString();   
  580.         } catch (MalformedURLException mue) {   
  581.             System.err.println("Invalid URL : " + url);   
  582.         } catch (UnsupportedEncodingException uee) {   
  583.             uee.printStackTrace();   
  584.         } catch (IOException ie) {   
  585.             if (ie instanceof UnknownHostException) {   
  586.                 System.err.println("UnknowHost : " + url);   
  587.             } else if (ie instanceof SocketException) {   
  588.                 System.err.println("Socket Error : " + ie.getMessage() + " "  
  589.                         + url);   
  590.             } else if (ie instanceof SocketTimeoutException) {   
  591.                 System.err.println("Socket Connection Time Out : " + url);   
  592.             } else  
  593.                 ie.printStackTrace();   
  594.         }   
  595.   
  596.         if (content != null) {   
  597.             Parser myParser = Parser.createParser(content, charset);   
  598.             HtmlPage visitor = new HtmlPage(myParser);   
  599.             try {   
  600.                 myParser.visitAllNodesWith(visitor);   
  601.                 String body = null;   
  602.                 String title = "Untitled";   
  603.                 if (visitor.getBody() != null) {   
  604.                     NodeList nodelist = visitor.getBody();   
  605.                     body = nodelist.asString().trim();   
  606.                 }   
  607.                 if (visitor.getTitle() != null)   
  608.                     title = visitor.getTitle();   
  609.                 result = new String[] { body, title };   
  610.             } catch (ParserException pe) {   
  611.                 pe.printStackTrace();   
  612.             }   
  613.         }   
  614.         return result;   
  615.     }   
  616.        
  617.     public static void main(String[] args) {   
  618.         SiteCapturer worker = new SiteCapturer();   
  619.            
  620.         if (args.length < 6)   
  621.             throw new IllegalArgumentException(   
  622.                     "Usage:java -u [start url] -d [index dir] -t [threads] [-c]");   
  623.   
  624.         for (int i = 0; i < args.length; i++) {   
  625.             if (args[i].equals("-u"))   
  626.                 worker.setSource(args[++i]);   
  627.             else if (args[i].equals("-d"))   
  628.                 worker.setTarget(args[++i]);   
  629.             else if (args[i].equals("-t"))   
  630.                 worker.setThreads(Integer.parseInt(args[++i]));   
  631.             else if (args[i].equals("-c"))   
  632.                 worker.setMCheck(true);   
  633.         }   
  634.            
  635.         if (worker.getThreads() < 1)   
  636.             throw new IllegalArgumentException("Invalid number of threads: "  
  637.                     + worker.getThreads());   
  638.            
  639.         worker.capture();   
  640.         System.exit(0);   
  641.     }   
  642. }  
package com.huizhi.kanine.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;

import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;


/**
 * @author 张波 
 * E-mail:kaninebruno@hotmail.com 
 * Created On : 2008-03-30
 */
/**
 * A multi-threaded site crawler that downloads every reachable page of a
 * single host and indexes page title/body into a Lucene full-text index.
 *
 * Worker threads share one URL queue ({@code mPages}); each thread repeatedly
 * dequeues a URL, extracts its links (feeding the queue), and indexes the
 * page content. Indexing is buffered in a {@link RAMDirectory} and flushed
 * to the on-disk index when the buffer exceeds a threshold.
 *
 * @author 张波 (kaninebruno@hotmail.com), created 2008-03-30
 */
public class SiteCapturer implements Runnable {

	/* Base (seed) URL the crawl starts from. */
	protected URL mSource;

	/* Filesystem directory where the Lucene index is written. */
	protected String mTarget;

	/**
	 * Queue of URLs waiting to be processed; newly discovered links are
	 * appended. URLs are consumed in FIFO order.
	 */
	protected ArrayList mPages;

	/* URLs already processed, to avoid crawling the same link twice. */
	protected HashSet mFinished;

	protected Parser mParser;

	/* Initial capacity for the page-content StringBuffer. */
	protected final int TRANSFER_SIZE = 4096;

	/* Platform line separator, appended after each line read from a page. */
	protected static String lineSep = System.getProperty("line.separator");

	/* Number of crawler threads; defaults to 2. */
	protected int mthreads;

	protected ArrayList threadList;

	/* IndexWriter backed by the on-disk directory. */
	protected IndexWriter FSDWriter;

	/* IndexWriter backed by the in-memory buffer directory. */
	protected IndexWriter RAMWriter;

	protected IndexSearcher indexSearcher;

	protected RAMDirectory ramDirectory;

	/* Analyzer (Chinese word segmentation) used for indexed fields. */
	protected Analyzer luceneAnalyzer;

	/* Character encoding detected for the seed page, reused for parsing. */
	protected String charset;

	/* Count of pages successfully indexed. */
	protected int count = 0;

	/* Port of the seed URL; links on other ports are skipped. */
	protected int mPort;

	/* Host of the seed URL; links on other hosts are skipped. */
	protected String mHost;

	/* When true, query the existing index per-URL to skip duplicates. */
	protected boolean mCheck;

	/* Lock serializing all writes to the shared index writers. */
	public static final Object indexLock = new Object();

	public SiteCapturer() {
		mSource = null;
		mTarget = null;
		mthreads = 2;
		mCheck = false;
		mPages = new ArrayList();
		mFinished = new HashSet();
		mParser = new Parser();
		// Register tag subclasses that capture discovered links into mPages.
		PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
		factory.registerTag(new LocalLinkTag());
		factory.registerTag(new LocalFrameTag());
		factory.registerTag(new LocalBaseHrefTag());
		mParser.setNodeFactory(factory);
	}

	public String getSource() {
		return mSource.toString();
	}

	public void setSource(String source) {
		if (source.endsWith("/"))
			source = source.substring(0, source.length() - 1);
		try {
			mSource = new URL(source);
		} catch (MalformedURLException e) {
			System.err.println("Invalid URL : " + source);
		}
	}

	public String getTarget() {
		return (mTarget);
	}

	public void setTarget(String target) {
		mTarget = target;
	}

	public int getThreads() {
		return (mthreads);
	}

	public void setThreads(int threads) {
		mthreads = threads;
	}

	public boolean isMCheck() {
		return mCheck;
	}

	public void setMCheck(boolean check) {
		mCheck = check;
	}

	/**
	 * Entry point of a crawl. Initializes the queue and the IndexWriters,
	 * spawns the worker threads, waits for them to finish, then merges the
	 * in-memory segments into the on-disk index and optimizes it.
	 */
	public void capture() {

		mPages.clear();
		mPages.add(getSource());

		int responseCode = 0;
		String contentType = "";

		try {
			HttpURLConnection uc = (HttpURLConnection) mSource.openConnection();
			responseCode = uc.getResponseCode();
			// getContentType() may return null; keep "" so startsWith is safe.
			String type = uc.getContentType();
			if (type != null)
				contentType = type;
		} catch (MalformedURLException mue) {
			System.err.println("Invalid URL : " + getSource());
		} catch (IOException ie) {
			if (ie instanceof UnknownHostException) {
				System.err.println("UnknowHost : " + getSource());
			} else if (ie instanceof SocketException) {
				System.err.println("Socket Error : " + ie.getMessage() + " "
						+ getSource());
			} else
				ie.printStackTrace();
		}

		if (responseCode == HttpURLConnection.HTTP_OK
				&& contentType.startsWith("text/html")) {

			mPort = mSource.getPort();
			mHost = mSource.getHost();
			charset = autoDetectCharset(mSource);

			/* Directory holding the index files. */
			File indexDir = new File(mTarget);
			/* true = build a fresh index, false = append to an existing one. */
			boolean flag = true;
			if (!indexDir.exists()) {
				// Create the full directory path if it does not exist yet.
				indexDir.mkdirs();
			} else if (IndexReader.indexExists(mTarget)) {
				/* An index already exists: append to it. */
				flag = false;
				// Remove a stale write lock left by a previous crashed run.
				File lockfile = new File(mTarget + File.separator + "write.lock");
				if (lockfile.exists())
					lockfile.delete();
			}
			luceneAnalyzer = new MMAnalyzer();
			ramDirectory = new RAMDirectory();

			try {
				FSDWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);
				RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true);

				// BUGFIX: this was "while (mCheck)", an infinite loop that
				// opened IndexReaders forever. Open the searcher exactly once.
				if (mCheck) {
					IndexReader indexReader = IndexReader.open(mTarget);
					indexSearcher = new IndexSearcher(indexReader);
				}

				long start = System.currentTimeMillis();
				threadList = new ArrayList();

				for (int i = 0; i < mthreads; i++) {
					Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1));
					t.start();
					threadList.add(t);
				}
				// Wait for every worker thread to terminate.
				while (threadList.size() > 0) {
					Thread child = (Thread) threadList.remove(0);
					try {
						child.join();
					} catch (InterruptedException e) {
						e.printStackTrace();
					}
				}
				long elapsed = System.currentTimeMillis() - start;

				// Flush the in-memory buffer into the on-disk index, then
				// merge all segments into one for faster searching.
				RAMWriter.close();
				FSDWriter.addIndexes(new Directory[] { ramDirectory });
				FSDWriter.optimize();
				FSDWriter.close();

				System.out.println("Finished in " + (elapsed / 1000)
						+ " seconds");
				System.out.println("The Count of the Links Captured is "
						+ count);
			} catch (CorruptIndexException cie) {
				cie.printStackTrace();
			} catch (LockObtainFailedException lofe) {
				lofe.printStackTrace();
			} catch (IOException ie) {
				ie.printStackTrace();
			}
		}
	}

	/**
	 * Worker loop: dequeue URLs until the queue is drained and all threads
	 * agree to stop ({@link #dequeueURL()} returns null).
	 */
	public void run() {
		String url;
		while ((url = dequeueURL()) != null) {
			// NOTE(review): this re-checks a URL that dequeueURL() may have
			// already probed, costing an extra HTTP round-trip per page.
			if (isToBeCaptured(url))
				process(url);
		}
		// NOTE(review): unsynchronized decrement; dequeueURL() also adjusts
		// mthreads under the monitor. Harmless here since mthreads is no
		// longer read after shutdown, but fragile — confirm before reuse.
		mthreads--;
	}

	/**
	 * Decides whether a link should be crawled: same host and port as the
	 * seed URL, HTTP 200, and content type text/html or text/plain.
	 */
	public boolean isToBeCaptured(String url) {
		boolean flag = false;

		HttpURLConnection uc = null;
		int responseCode = 0;
		String contentType = "";
		String host = "";
		int port = 0;

		try {
			URL source = new URL(url);
			String protocol = source.getProtocol();
			if (protocol != null && protocol.equals("http")) {
				host = source.getHost();
				port = source.getPort();
				uc = (HttpURLConnection) source.openConnection();
				uc.setConnectTimeout(8000);
				responseCode = uc.getResponseCode();
				// getContentType() may return null; keep "" so startsWith is safe.
				String type = uc.getContentType();
				if (type != null)
					contentType = type;
			}
		} catch (MalformedURLException mue) {
			System.err.println("Invalid URL : " + url);
		} catch (IOException ie) {
			if (ie instanceof UnknownHostException) {
				System.err.println("UnknowHost : " + url);
			} else if (ie instanceof SocketException) {
				System.err.println("Socket Error : " + ie.getMessage() + " "
						+ url);
			} else if (ie instanceof SocketTimeoutException) {
				System.err.println("Socket Connection Time Out : " + url);
			} else if (ie instanceof FileNotFoundException) {
				// BUGFIX: was casting ie.getCause() (null for a directly
				// thrown FileNotFoundException), which NPE'd here.
				System.err.println("broken link " + ie.getMessage()
						+ " ignored");
			} else
				ie.printStackTrace();
		}

		if (port == mPort
				&& responseCode == HttpURLConnection.HTTP_OK
				&& host.equals(mHost)
				&& (contentType.startsWith("text/html") || contentType
						.startsWith("text/plain")))
			flag = true;
		return flag;
	}

	/**
	 * Removes and returns the next URL from the queue. While the monitor is
	 * held, the page is parsed so its links enter the queue (the registered
	 * Local*Tag classes append them as a side effect of parsing). Returns
	 * null when the queue is empty and every other thread is idle, which is
	 * the shutdown signal for all workers.
	 */
	public synchronized String dequeueURL() {
		while (true) {
			if (mPages.size() > 0) {
				String url = (String) mPages.remove(0);
				mFinished.add(url);

				if (isToBeCaptured(url)) {
					int bookmark;
					NodeList list;
					NodeList robots;
					MetaTag robot;
					String content;
					try {
						// Remember queue length so links added by THIS page
						// can be rolled back if robots forbids following.
						bookmark = mPages.size();
						/* Collect all nodes of the page. */
						mParser.setURL(url);
						try {
							list = new NodeList();
							for (NodeIterator e = mParser.elements(); e
									.hasMoreNodes();)
								list.add(e.nextNode());
						} catch (EncodingChangeException ece) {
							/* Encoding changed mid-parse: reset and re-parse. */
							mParser.reset();
							list = new NodeList();
							for (NodeIterator e = mParser.elements(); e
									.hasMoreNodes();)
								list.add(e.nextNode());
						}
						/**
						 * Honour the robots META tag, per
						 * http://www.robotstxt.org/wc/meta-user.html
						 */
						robots = list
								.extractAllNodesThatMatch(
										new AndFilter(new NodeClassFilter(
												MetaTag.class),
												new HasAttributeFilter("name",
														"robots")), true);
						if (0 != robots.size()) {
							robot = (MetaTag) robots.elementAt(0);
							content = robot.getAttribute("content")
									.toLowerCase();
							if ((-1 != content.indexOf("none"))
									|| (-1 != content.indexOf("nofollow"))) {
								// BUGFIX: removing with an ascending index
								// skipped every other element; remove from
								// the tail so indices stay valid.
								for (int i = mPages.size() - 1; i >= bookmark; i--)
									mPages.remove(i);
							}
						}
					} catch (ParserException pe) {
						pe.printStackTrace();
					}
				}
				return url;
			} else {
				// Queue empty: this thread goes idle. If it was the last
				// active thread, wake everyone so all workers shut down.
				mthreads--;
				if (mthreads > 0) {
					try {
						wait();
						mthreads++;
					} catch (InterruptedException ie) {
						ie.printStackTrace();
					}
				} else {
					notifyAll();
					return null;
				}
			}
		}
	}

	/**
	 * Fetches, parses and indexes a single URL. The detected page charset is
	 * used for decoding; index writes are serialized via {@link #indexLock}.
	 */
	protected void process(String url) {

		String result[] = null;
		String content = null;
		String title = null;

		/* Duplicate lookup in the index is expensive, so it is off by default. */
		if (mCheck) {
			try {
				TermQuery query = new TermQuery(new Term("url", url));
				Hits hits = indexSearcher.search(query);
				if (hits.length() > 0) {
					System.out.println("The URL : " + url
							+ " has already been captured");
				} else {
					result = parseHtml(url, charset);
				}
			} catch (IOException ie) {
				ie.printStackTrace();
			}
		} else {
			result = parseHtml(url, charset);
		}

		// BUGFIX: parseHtml() returns null on fetch/parse failure; the old
		// code dereferenced result[0] unconditionally and threw an NPE.
		if (result != null) {
			content = result[0];
			title = result[1];
		}

		if (content != null && content.trim().length() > 0) {

			Document document = new Document();
			document.add(new Field("content", content, Field.Store.YES,
					Field.Index.TOKENIZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS));
			document.add(new Field("url", url, Field.Store.YES,
					Field.Index.UN_TOKENIZED));
			document.add(new Field("title", title, Field.Store.YES,
					Field.Index.TOKENIZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS));
			document.add(new Field("date", DateTools.timeToString(new Date()
					.getTime(), DateTools.Resolution.DAY), Field.Store.YES,
					Field.Index.UN_TOKENIZED));

			synchronized (indexLock) {
				try {
					RAMWriter.addDocument(document);
					/**
					 * When the in-memory index grows past the threshold,
					 * flush it to disk. Buffering in RAM avoids frequent
					 * small IO operations and speeds up index creation.
					 */
					if (RAMWriter.ramSizeInBytes() > 512 * 1024) {
						RAMWriter.close();
						FSDWriter.addIndexes(new Directory[] { ramDirectory });
						RAMWriter = new IndexWriter(ramDirectory,
								luceneAnalyzer, true);
					}
					count++;
					System.out.println(Thread.currentThread().getName()
							+ ": Finished Indexing URL: " + url);
				} catch (CorruptIndexException cie) {
					cie.printStackTrace();
				} catch (IOException ie) {
					ie.printStackTrace();
				}
			}
		}
	}

	/**
	 * Link tag that rewrites the HREF.
	 * As a side effect of parsing, each normalized link (trailing slash and
	 * fragment stripped) is queued for crawling unless already seen.
	 */
	class LocalLinkTag extends LinkTag {
		public void doSemanticAction() {

			String link = getLink();
			if (link.endsWith("/"))
				link = link.substring(0, link.length() - 1);
			int pos = link.indexOf("#");
			if (pos != -1)
				link = link.substring(0, pos);

			/* Queue the link for processing if it is new. */
			if (!(mFinished.contains(link) || mPages.contains(link)))
				mPages.add(link);

			setLink(link);
		}
	}

	/**
	 * Frame tag that rewrites the SRC URL; the frame target is queued for
	 * crawling the same way links are.
	 */
	class LocalFrameTag extends FrameTag {
		public void doSemanticAction() {

			String link = getFrameLocation();
			if (link.endsWith("/"))
				link = link.substring(0, link.length() - 1);
			int pos = link.indexOf("#");
			if (pos != -1)
				link = link.substring(0, pos);

			/* Queue the link for processing if it is new. */
			if (!(mFinished.contains(link) || mPages.contains(link)))
				mPages.add(link);

			setFrameLocation(link);
		}
	}

	/**
	 * Base tag that doesn't show. toHtml() returns an empty string,
	 * effectively shutting off the base reference.
	 */
	class LocalBaseHrefTag extends BaseHrefTag {

		public String toHtml() {
			return ("");
		}
	}

	/**
	 * Auto-detects the character encoding of a page so non-ASCII (e.g.
	 * Chinese) content is decoded correctly. Falls back to the platform
	 * default charset when detection fails.
	 */
	protected String autoDetectCharset(URL url) {

		CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
		/**
		 * ParsingDetector inspects HTML/XML declarations; the boolean
		 * constructor argument toggles verbose detection output.
		 */
		detector.add(new ParsingDetector(false));
		detector.add(JChardetFacade.getInstance());
		detector.add(ASCIIDetector.getInstance());
		detector.add(UnicodeDetector.getInstance());

		Charset cs = null;
		try {
			cs = detector.detectCodepage(url);
		} catch (MalformedURLException mue) {
			mue.printStackTrace();
		} catch (IOException ie) {
			ie.printStackTrace();
		}
		if (cs == null)
			cs = Charset.defaultCharset();
		return cs.name();
	}

	/**
	 * Downloads a page with the given charset and extracts its body text and
	 * title for indexing.
	 *
	 * @return { body, title } on success (body may be null for an empty
	 *         page); null when the page could not be fetched or parsed.
	 */
	protected String[] parseHtml(String url, String charset) {

		String result[] = null;
		String content = null;

		try {
			URL source = new URL(url);
			InputStream in = source.openStream();
			BufferedReader reader = null;
			try {
				reader = new BufferedReader(new InputStreamReader(in, charset));
				String line;
				StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
				while ((line = reader.readLine()) != null) {
					temp.append(line);
					temp.append(lineSep);
				}
				content = temp.toString();
			} finally {
				// Always release the connection, even when reading fails.
				if (reader != null)
					reader.close();
				in.close();
			}
		} catch (MalformedURLException mue) {
			System.err.println("Invalid URL : " + url);
		} catch (UnsupportedEncodingException uee) {
			uee.printStackTrace();
		} catch (IOException ie) {
			if (ie instanceof UnknownHostException) {
				System.err.println("UnknowHost : " + url);
			} else if (ie instanceof SocketException) {
				System.err.println("Socket Error : " + ie.getMessage() + " "
						+ url);
			} else if (ie instanceof SocketTimeoutException) {
				System.err.println("Socket Connection Time Out : " + url);
			} else
				ie.printStackTrace();
		}

		if (content != null) {
			Parser myParser = Parser.createParser(content, charset);
			HtmlPage visitor = new HtmlPage(myParser);
			try {
				myParser.visitAllNodesWith(visitor);
				String body = null;
				String title = "Untitled";
				if (visitor.getBody() != null) {
					NodeList nodelist = visitor.getBody();
					body = nodelist.asString().trim();
				}
				if (visitor.getTitle() != null)
					title = visitor.getTitle();
				result = new String[] { body, title };
			} catch (ParserException pe) {
				pe.printStackTrace();
			}
		}
		return result;
	}

	/**
	 * Command-line entry point.
	 * Usage: java ... -u [start url] -d [index dir] -t [threads] [-c]
	 */
	public static void main(String[] args) {
		SiteCapturer worker = new SiteCapturer();

		if (args.length < 6)
			throw new IllegalArgumentException(
					"Usage:java -u [start url] -d [index dir] -t [threads] [-c]");

		for (int i = 0; i < args.length; i++) {
			if (args[i].equals("-u"))
				worker.setSource(args[++i]);
			else if (args[i].equals("-d"))
				worker.setTarget(args[++i]);
			else if (args[i].equals("-t"))
				worker.setThreads(Integer.parseInt(args[++i]));
			else if (args[i].equals("-c"))
				worker.setMCheck(true);
		}

		if (worker.getThreads() < 1)
			throw new IllegalArgumentException("Invalid number of threads: "
					+ worker.getThreads());

		worker.capture();
		System.exit(0);
	}
}


程序运行可选择控制台或新建一JSP页面,加入以下代码即可

Java代码 复制代码
  1. <%@ page contentType="text/html; charset=UTF-8"%>   
  2. <%@ page import="com.huizhi.kanine.util.*"%>   
  3. <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">   
  4.   
  5. <html>   
  6. <head>   
  7. <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">   
  8. <title>Lucene</title>   
  9. </head>   
  10. <body>   
  11. <%   
  12.     SiteCapturer worker = new SiteCapturer();   
  13.     worker.setSource ("http://www.blabla.cn/");   
  15.     worker.setTarget("c:/luceneIndexes");   
  15.     worker.setThreads(20);   
  16.     worker.capture();   
  17. %>   
  18. </body>   
  19. </html>  
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值