声明:本文为转载,其中多线程的实现思路很值得参考。
涉及的Jar包:
----------------------------------------------------------------------
1.Spider
package com.demo.spider;
/**
 * Main class: creates a fixed pool of {@link SpiderWorker} threads and
 * coordinates them through the shared {@link SpiderWatcher}.
 *
 * <p>{@link #main(String[])} creates the spider, starts the watcher as a
 * daemon thread and then runs the spider on its own thread.
 *
 * @author Administrator
 */
public class Spider implements Runnable {
    /** Number of worker threads created for each spider. */
    private static final int POOL_SIZE = 3;

    /** Worker threads; each slot is set to {@code null} once its worker has been joined. */
    protected SpiderWorker[] pool;

    /** Shared watcher; assigned in {@link #main(String[])} before the spider thread starts. */
    protected static SpiderWatcher watch = null;

    /**
     * Builds the worker pool.
     *
     * @param keyword   keyword handed to every worker
     * @param startsite start URL handed to every worker
     */
    private Spider(String keyword, String startsite) {
        pool = new SpiderWorker[POOL_SIZE];
        for (int i = 0; i < pool.length; i++) {
            // NOTE(review): the SpiderWorker shown alongside this class only declares a
            // (Spider, HTTP) constructor; confirm which worker implementation is intended.
            pool[i] = new SpiderWorker(keyword, startsite);
        }
    }

    /**
     * Starts every worker, blocks until the watcher reports that work has
     * begun and then finished, and finally interrupts and joins all workers.
     *
     * <p>All workers are started before waiting. Starting one worker and then
     * tearing the whole pool down inside the same loop iteration would leave
     * the remaining slots {@code null} for the next iteration.
     */
    public void run() {
        System.out.println("this is running.....");
        try {
            for (int i = 0; i < pool.length; i++) {
                pool[i].start();
            }

            watch.waitBegin();
            watch.waitDone();

            for (int j = 0; j < pool.length; j++) {
                pool[j].interrupt();
                pool[j].join();
                pool[j] = null;
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
            System.out.println("线程启动异常-->" + e);
        } catch (Exception e) {
            System.out.println("线程启动异常-->" + e);
        }
        System.out.println("线程结束。");
    }

    /**
     * Program entry point.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider sp = new Spider("英超", "http://sport.sina.com.cn");
        System.out.println("Begin ....");

        // Watcher thread: daemon, so it does not keep the JVM alive on its own.
        watch = new SpiderWatcher();
        watch.setDaemon(true);
        watch.start();

        // Crawl thread.
        try {
            Thread search = new Thread(sp);
            search.start();
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println("game over!!!");
    }
}
2.SpiderWatcher
package com.demo.spider;
/**
 * Watcher thread that keeps count of the active worker threads.
 *
 * <p>Workers report in through {@link #workerBegin()} and
 * {@link #workerEnd()}; a coordinating thread blocks in
 * {@link #waitBegin()} and {@link #waitDone()}. While running, the thread
 * itself prints the current count every five seconds.
 *
 * <p>All state is guarded by this object's monitor.
 *
 * @author Administrator
 */
public class SpiderWatcher extends Thread {
    /** Number of workers that have begun but not yet ended; guarded by {@code this}. */
    private int activeThreads = 0;

    /** Becomes {@code true} once any worker has begun; guarded by {@code this}. */
    private boolean started = false;

    /**
     * Prints the active-worker count every five seconds until interrupted.
     */
    @Override
    public void run() {
        while (true) {
            System.out.println("当前线程数=========》" + currentActiveThreads());
            try {
                sleep(5000);
            } catch (InterruptedException e) {
                // An interrupt is a request to stop: restore the flag and leave the loop.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Blocks until no worker is active. If the calling thread is interrupted
     * the method returns early with the thread's interrupt status set.
     */
    public synchronized void waitDone() {
        try {
            while (this.activeThreads > 0) {
                wait();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Blocks until at least one worker has begun. If the calling thread is
     * interrupted the method returns early with the thread's interrupt status set.
     */
    public synchronized void waitBegin() {
        try {
            while (!this.started) {
                wait();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Records that a worker has started processing and wakes all waiters. */
    public synchronized void workerBegin() {
        this.activeThreads += 1;
        this.started = true;
        // notifyAll: waiters may be blocked on two different conditions on this monitor.
        notifyAll();
    }

    /** Records that a worker has finished processing and wakes all waiters. */
    public synchronized void workerEnd() {
        this.activeThreads -= 1;
        notifyAll();
    }

    /** Resets the active-worker count to zero and wakes threads blocked in {@link #waitDone()}. */
    public synchronized void reset() {
        this.activeThreads = 0;
        notifyAll();
    }

    /** Reads the counter under the monitor so the watcher thread sees current values. */
    private synchronized int currentActiveThreads() {
        return this.activeThreads;
    }
}
3.SpiderWorker
package com.demo.spider;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.AttributedCharacterIterator.Attribute;
import com.heaton.bot.HTMLParser;
import com.heaton.bot.HTMLTag;
import com.heaton.bot.HTTP;
import com.heaton.bot.Log;
import com.heaton.bot.URLUtility;
/**
 * Worker thread that repeatedly takes a URL from its owning {@link Spider},
 * downloads it and reports every link found on the page back to the owner.
 *
 * <p>NOTE(review): the owner methods used here (getWorkload, getSpiderDone,
 * processPage, found*Link, completePage, getRemoveQuery) are not declared on
 * the Spider class shown alongside this one; confirm which Spider type is meant.
 *
 * <p>NOTE(review): the file imports java.text.AttributedCharacterIterator.Attribute,
 * which declares no getValue(); the attribute type from com.heaton.bot is
 * presumably what this code needs. Confirm and correct the import.
 */
public class SpiderWorker extends Thread
{
    /** URL currently being processed; {@code null} ends the worker. */
    protected String target;

    /** Spider that supplies workloads and receives results. */
    protected Spider owner;

    /** {@code true} while {@link #processWorkload()} is executing. */
    protected boolean busy;

    /** HTTP object used for every request made by this worker. */
    protected HTTP http;

    /**
     * @param owner spider that supplies workloads and receives results
     * @param http  HTTP object this worker uses for its requests
     */
    public SpiderWorker(Spider owner, HTTP http)
    {
        this.http = http;
        this.owner = owner;
    }

    /**
     * @return {@code true} while a page is being processed
     */
    public boolean isBusy()
    {
        return this.busy;
    }

    /**
     * Pulls workloads from the owner until it returns {@code null},
     * bracketing each one with workerBegin()/workerEnd() on the object
     * returned by the owner's getSpiderDone().
     */
    @Override
    public void run()
    {
        while (true)
        {
            this.target = this.owner.getWorkload();
            if (this.target == null)
                return;
            this.owner.getSpiderDone().workerBegin();
            try
            {
                processWorkload();
            }
            finally
            {
                // Always balance workerBegin(), even if processing fails with an
                // unchecked error, so anything waiting on the count is not left hanging.
                this.owner.getSpiderDone().workerEnd();
            }
        }
    }

    /**
     * Downloads {@link #target}, hands the page to the owner and, for
     * {@code text/} content, scans the body for HREF/SRC attributes and reports
     * each link as internal, external or "other" (not resolvable to a URL).
     * The owner's completePage is called with {@code false} on success and
     * {@code true} when an exception was caught.
     */
    public void processWorkload()
    {
        try
        {
            this.busy = true;
            Log.log(3, "Spidering " + this.target);
            this.http.send(this.target, null);
            Attribute typeAttribute = this.http.getServerHeaders().get("Content-Type");
            if (typeAttribute == null) {
                return;
            }
            this.owner.processPage(this.http);
            // Only text content is scanned for links.
            if (!typeAttribute.getValue().startsWith("text/"))
                return;

            HTMLParser parse = new HTMLParser();
            parse.source = new StringBuffer(this.http.getBody());
            while (!parse.eof()) {
                char ch = parse.get();
                // presumably get() returns 0 when it has just parsed a tag (verify against HTMLParser)
                if (ch != 0) {
                    continue;
                }
                HTMLTag tag = parse.getTag();
                Attribute link = tag.get("HREF");
                if (link == null) {
                    link = tag.get("SRC");
                }
                if (link == null) {
                    continue;
                }

                URL resolved;
                try {
                    resolved = new URL(new URL(this.target), link.getValue());
                } catch (MalformedURLException e) {
                    Log.log(2, "Spider found other link: " + link);
                    this.owner.foundOtherLink(link.getValue());
                    // Skip classification only for links that could not be resolved.
                    continue;
                }

                if (this.owner.getRemoveQuery())
                    resolved = URLUtility.stripQuery(resolved);
                resolved = URLUtility.stripAnhcor(resolved);

                // Same host as the page being processed => internal link.
                if (resolved.getHost().equalsIgnoreCase(new URL(this.target).getHost()))
                {
                    Log.log(3, "Spider found internal link: " + resolved.toString());
                    this.owner.foundInternalLink(resolved.toString());
                } else {
                    Log.log(3, "Spider found external link: " + resolved.toString());
                    this.owner.foundExternalLink(resolved.toString());
                }
            }
            this.owner.completePage(this.http, false);
        } catch (IOException e) {
            Log.log(4, "Error loading file(" + this.target + "): " + e);
            this.owner.completePage(this.http, true);
        } catch (Exception e) {
            Log.logException("Exception while processing file(" + this.target + "): ", e);
            this.owner.completePage(this.http, true);
        } finally {
            this.busy = false;
        }
    }

    /**
     * @return the HTTP object used by this worker
     */
    public HTTP getHTTP()
    {
        return this.http;
    }
}
4.SearchResultForm
package com.demo.spider;
/**
 * Simple mutable holder for one search hit: the page URL, its title, the
 * keywords and a keyword count. String properties default to the empty
 * string and the count defaults to zero.
 */
public class SearchResultForm {

    private String url = "";
    private String title = "";
    private String keywords = "";
    private int keywordCount = 0;

    /** @return the stored URL */
    public String getUrl() {
        return this.url;
    }

    /** @return the stored title */
    public String getTitle() {
        return this.title;
    }

    /** @return the stored keywords */
    public String getKeywords() {
        return this.keywords;
    }

    /** @return the stored keyword count */
    public int getCount_key_words() {
        return this.keywordCount;
    }

    /** @param url value to store as the URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @param title value to store as the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param keywords value to store as the keywords */
    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }

    /** @param count_key_words value to store as the keyword count */
    public void setCount_key_words(int count_key_words) {
        this.keywordCount = count_key_words;
    }
}
5.UrlManager
package com.demo.spider;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
/**
 * Holds the URL collections used during a crawl: the links where the keyword
 * was found, the sites already searched, the queue of links still waiting to
 * be parsed, and a cache of locations the crawler must not visit.
 *
 * <p>Every collection is created empty when the manager is constructed.
 * Element types are not fixed by this class (the list and queue fields are
 * raw types); presumably they hold URL strings or result objects. Verify
 * against the callers.
 *
 * @author Administrator
 */
public class UrlManager {

    /** Links on which the keyword was found. */
    public List resultlist = new ArrayList();

    /** Sites that have already been searched. */
    public List searchedsite = new ArrayList();

    /** Waiting queue of links that still need to be parsed. */
    public Queue linklist = new LinkedList();

    /** Sites the crawler is not allowed to access. */
    HashMap<String, ArrayList<String>> disallowListCache =
            new HashMap<String, ArrayList<String>>();

    /** Creates a manager whose collections are all empty. */
    public UrlManager() {
        // All collections are set up by the field initialisers above.
    }
}