[Multithreading] Web Crawler Source Code

Note: this code is reposted from another source; its multithreading approach is a very good reference.

Required JAR packages: the com.heaton.bot library (Heaton bot package), as used by the imports in SpiderWorker below.

----------------------------------------------------------------------

1.Spider

package com.demo.spider;

import com.heaton.bot.HTTPSocket;

/**
 * Main class: coordinates the other classes to complete the crawl.
 * @author Administrator
 *
 */

public class Spider implements Runnable {
	
	private final int  POOL_SIZE = 3;
	protected SpiderWorker[] pool;
	protected static SpiderWatcher watch = null;
	protected String keyword;
	protected String startsite;
	
	
	private Spider(String keyword, String startsite){
		this.keyword = keyword;
		this.startsite = startsite;
		pool = new SpiderWorker[POOL_SIZE];
		
		// SpiderWorker (section 3 below) expects the owning Spider and an HTTP connection,
		// so each worker shares this Spider and gets its own HTTPSocket
		for(int i=0; i<pool.length; i++){
			pool[i] = new SpiderWorker(this, new HTTPSocket());
		}
	}
	
	public void run(){
		System.out.println("Spider is running...");
		// start every worker thread first
		for(int i=0; i < pool.length; i++){
			this.pool[i].start();
		}
		try{
			// block until at least one worker has started, then until all are done
			watch.waitBegin();
			watch.waitDone();
			
			for (int j = 0; j < this.pool.length; j++) {
				this.pool[j].interrupt();
				this.pool[j].join();
				this.pool[j] = null;
			}
		}catch(Exception e){
			System.out.println("Worker thread error --> " + e);
		}
		
		System.out.println("All worker threads finished.");
		
	}
	
	
	
	public static void main(String[] args){
		Spider sp = new Spider("英超", "http://sport.sina.com.cn");
		System.out.println("Begin ....");
		// watcher (daemon) thread
		watch = new SpiderWatcher();
		watch.setDaemon(true);
		watch.start();
		
		// main search thread
		try{
			
			Thread search = new Thread(sp);
			search.start(); 
			
		}catch(Exception e){
			e.printStackTrace();
		}
		
		
		System.out.println("game over!!!");
	}
}
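
Note that SpiderWorker (section 3 below) calls back into its owning Spider through getWorkload(), getSpiderDone(), processPage(), getRemoveQuery(), foundInternalLink(), foundExternalLink(), foundOtherLink() and completePage(), none of which appear in the posted Spider class. A minimal sketch of what these could look like is shown here; the UrlManager field, the seeding of the start site, and all of the method bodies are assumptions, not part of the original post (com.heaton.bot.HTTP would also need to be imported):

	// Hypothetical additions to Spider (field names and method bodies are assumptions):

	protected UrlManager urlManager = new UrlManager();
	// the constructor would also seed the queue: urlManager.linklist.offer(startsite);

	public synchronized String getWorkload() {
		// hand the next waiting URL to a worker; null tells the worker to stop
		return (String) urlManager.linklist.poll();
	}

	public SpiderWatcher getSpiderDone() {
		return watch;  // the watcher doubles as the workers' begin/end monitor
	}

	public boolean getRemoveQuery() {
		return true;   // strip query strings from discovered URLs
	}

	public void foundInternalLink(String url)  { urlManager.linklist.offer(url); }
	public void foundExternalLink(String url)  { /* ignore off-site links */ }
	public void foundOtherLink(String url)     { /* mailto:, javascript:, ... */ }

	public void processPage(HTTP page)         { /* scan page.getBody() for the keyword */ }
	public void completePage(HTTP page, boolean error) { /* record success or failure */ }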

2.SpiderWatcher

package com.demo.spider;
/**
 * Daemon thread that monitors how many worker threads are currently active.
 * @author Administrator
 *
 */
public class SpiderWatcher extends Thread{
	private int activeThreads = 0;
	private boolean started = false;
	
	public void run(){
		while(true){
			System.out.println("当前线程数=========》" + this.activeThreads);
			
			try{
				sleep(5000);
			}catch(InterruptedException e){
				e.printStackTrace();
			}
		}
		
	}
	
	/** Blocks until every worker has finished (active count back to zero). */
	public synchronized void waitDone() {
		try {
			while (this.activeThreads > 0)
				wait();
		} catch (InterruptedException e) {
		}
	}
	
	/** Blocks until at least one worker has called workerBegin(). */
	public synchronized void waitBegin() {
		try {
			while (!this.started)
				wait();
		} catch (InterruptedException e) {
		}
	}
	
	/** Called by a worker just before it processes a URL. */
	public synchronized void workerBegin() {
		this.activeThreads += 1;
		this.started = true;
		notify();
	}
	
	/** Called by a worker when it has finished a URL. */
	public synchronized void workerEnd() {
		this.activeThreads -= 1;
		notify();
	}

	/** Clears the active-thread count so the watcher can be reused. */
	public synchronized void reset() {
		this.activeThreads = 0;
	}
	
	

}
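
The watcher and the workers coordinate through a simple wait()/notify() handshake: each worker counts itself in before processing a URL and out afterwards, while Spider.run() blocks in waitBegin()/waitDone() until the active count drops back to zero. The worker-side pattern looks like this (the same calls appear in SpiderWorker.run() below; the try/finally is an extra safety measure, not in the posted code):

	watch.workerBegin();      // count this worker in; releases waitBegin()
	try {
		// ... fetch and parse one URL ...
	} finally {
		watch.workerEnd();    // count this worker out; the last one out releases waitDone()
	}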

3.SpiderWorker

package com.demo.spider;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import com.heaton.bot.Attribute;

import com.heaton.bot.HTMLParser;
import com.heaton.bot.HTMLTag;
import com.heaton.bot.HTTP;
import com.heaton.bot.Log;
import com.heaton.bot.URLUtility;

public class SpiderWorker extends Thread
{
  protected String target;
  protected Spider owner;
  protected boolean busy;
  protected HTTP http;

  // the owning Spider supplies the URLs to crawl and receives the found-link
  // callbacks; each worker reuses a single HTTP connection object
  public SpiderWorker(Spider owner, HTTP http)
  {
    this.http = http;
    this.owner = owner;
  }

  public boolean isBusy()
  {
    return this.busy;
  }

  public void run()
  {
    while (true)
    {
      this.target = this.owner.getWorkload();
      if (this.target == null)
        return;
      this.owner.getSpiderDone().workerBegin();
      processWorkload();
      this.owner.getSpiderDone().workerEnd();
    }
  }

  public void processWorkload()
  {
    try
    {
      this.busy = true;
      Log.log(3, "Spidering " + this.target);
      this.http.send(this.target, null);
      Attribute typeAttribute = this.http.getServerHeaders().get("Content-Type");

      if (typeAttribute == null) {
        return;
      }
      this.owner.processPage(this.http);
      if (!typeAttribute.getValue().startsWith("text/"))
        return;
      HTMLParser parse = new HTMLParser();

      parse.source = new StringBuffer(this.http.getBody());

      while (!parse.eof()) {
        char ch = parse.get();
        // a return of 0 means the parser has consumed a complete tag; fetch it with getTag()
        if (ch == 0) {
          HTMLTag tag = parse.getTag();
          Attribute link = tag.get("HREF");
          if (link == null) {
            link = tag.get("SRC");
          }
          if (link == null) {
            continue;
          }
          URL target = null;
          try {
            target = new URL(new URL(this.target), link.getValue());
          } catch (MalformedURLException e) {
            Log.log(2, "Spider found other link: " + link);
            this.owner.foundOtherLink(link.getValue());
            continue; // skip links that do not form a valid absolute URL
          }

          if (this.owner.getRemoveQuery())
            target = URLUtility.stripQuery(target);
          target = URLUtility.stripAnhcor(target); // method name as spelled in the bot library

          if (target.getHost().equalsIgnoreCase(new URL(this.target).getHost()))
          {
            Log.log(3, "Spider found internal link: " + target.toString());

            this.owner.foundInternalLink(target.toString());
          } else {
            Log.log(3, "Spider found external link: " + target.toString());

            this.owner.foundExternalLink(target.toString());
          }
        }
      }
      this.owner.completePage(this.http, false);
    } catch (IOException e) {
      Log.log(4, "Error loading file(" + this.target + "): " + e);

      this.owner.completePage(this.http, true);
    } catch (Exception e) {
      Log.logException("Exception while processing file(" + this.target + "): ", e);

      this.owner.completePage(this.http, true);
    } finally {
      this.busy = false;
    }
  }

  public HTTP getHTTP()
  {
    return this.http;
  }
}

4.SearchResultForm

package com.demo.spider;

public class SearchResultForm {
	private String url = "";
	private String title = "";
	private String keywords = "";
	private int count_key_words = 0;
	
	public String getUrl() {
		return url;
	}
	public void setUrl(String url) {
		this.url = url;
	}
	public String getTitle() {
		return title;
	}
	public void setTitle(String title) {
		this.title = title;
	}
	public String getKeywords() {
		return keywords;
	}
	public void setKeywords(String keywords) {
		this.keywords = keywords;
	}
	public int getCount_key_words() {
		return count_key_words;
	}
	public void setCount_key_words(int count_key_words) {
		this.count_key_words = count_key_words;
	}
	
	
	

}
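
SearchResultForm is a plain holder for a single hit. A hypothetical use inside the page-processing step (the local variables url, pageTitle, keyword and occurrences are assumptions, not part of the original post) could be:

	SearchResultForm hit = new SearchResultForm();
	hit.setUrl(url);                        // page on which the keyword was found
	hit.setTitle(pageTitle);
	hit.setKeywords(keyword);
	hit.setCount_key_words(occurrences);    // number of times the keyword appeared
	urlManager.resultlist.add(hit);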

5.UrlManager

package com.demo.spider;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

/**
 * Manages the URL queues used by the crawl:
 * a waiting queue, a running queue, a completed queue and an error queue.
 * @author Administrator
 *
 */
public class UrlManager {
	public List resultlist = null;    // links on which the keyword was found
	public List searchedsite = null;  // sites that have already been searched
	public Queue linklist = null;     // waiting queue of links still to be parsed

	// sites the crawler is not allowed to visit
	HashMap<String, ArrayList<String>> disallowListCache = null;

	public UrlManager(){
		resultlist = new ArrayList();
		searchedsite = new ArrayList();
		linklist = new LinkedList();
		disallowListCache = new HashMap<String, ArrayList<String>>();
	}

}
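
UrlManager only declares the collections; it has no queue operations of its own. A minimal sketch of helper methods it could expose to back Spider.getWorkload() (method names and synchronization are assumptions, not part of the original post):

	public synchronized void addWaiting(String url) {
		// queue a URL only if it has not been crawled and is not already waiting
		if (!searchedsite.contains(url) && !linklist.contains(url)) {
			linklist.offer(url);
		}
	}

	public synchronized String nextWaiting() {
		String url = (String) linklist.poll();  // null when the waiting queue is empty
		if (url != null) {
			searchedsite.add(url);              // mark it as visited
		}
		return url;
	}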

