用 HttpUnit 写的 spider 程序:可以检测网站中的错误页面!

这个程序出自《Java Tools for Extreme Programming》一书。

import com.meterware.httpunit.*;
import java.util.HashSet;
import java.util.Set;

public class CheckSite {

 private WebConversation conversation;

 private Set checkedLinks;

 private String host = "www.sohu.com";

 public static void main(String[] args) throws Exception {
  CheckSite cs = new CheckSite();
  cs.setUp();
  cs.testEntireSite();
 }

 public void setUp() {
  conversation = new WebConversation();
  checkedLinks = new HashSet();
 }

 public void testEntireSite() throws Exception {
  WebResponse response = conversation.getResponse("http://" + host);
  checkAllLinks(response);
  System.out.println("Site check finished. Link's checked: "
    + checkedLinks.size() + " : " + checkedLinks);
 }

 private void checkAllLinks(WebResponse response) throws Exception {
  if (!isHtml(response)) {
   return;
  }
  WebLink[] links = response.getLinks();
  System.out.println(response.getTitle() + " -- links found = "
    + links.length);
  for (int i = 0; i < links.length; i++) {
   boolean newLink = checkedLinks.add(links[i].getURLString());
   if (newLink) {
    System.out.println("Total links checked so far: "
      + checkedLinks.size());
    checkLink(links[i]);
   }
  }
 }

 private boolean isHtml(WebResponse response) {
  return response.getContentType().equals("text/html");
 }

 private void checkLink(WebLink link) throws Exception {
  WebRequest request = link.getRequest();
  java.net.URL url = request.getURL();
  System.out.println("checking link: " + url);
  String linkHost = url.getHost();
  if (linkHost.equals(this.host)) {
   WebResponse response = conversation.getResponse(request);
   this.checkAllLinks(response);
  }

 }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值