网络爬虫(源代码参考)

package com.heaton.bot;
import com.heaton.bot.*;
import java.net.*;

/**
* The SpiderWorker class performs the actual work of
* spidering pages. It is implemented as a thread
* that is created by the spider class.
*
* Copyright 2001-2003 by Jeff Heaton (http://www.jeffheaton.com)
*
* @author Jeff Heaton
* @version 1.2
*/
public class SpiderWorker extends Thread {

/**
* The URL that this spider worker
* should be downloading.
*/
protected String target;

/**
* The owner of this spider worker class,
* should always be a Spider object.
* This is the class that this spider
* worker will send its data to.
*/
protected Spider owner;

/**
* Indicates if the spider is busy or not.
* true = busy
* false = idle
*/
protected boolean busy;

/**
* A descendant of the HTTP object that
* this class should be using for HTTP
* communication. This is usually the
* HTTPSocket class.
*/
protected HTTP http;

/**
* Constructs a spider worker object.
*
* @param owner The owner of this object, usually
* a Spider object.
* @param http
*/
public SpiderWorker(Spider owner,HTTP http)
{
this.http = http;
this.owner = owner;
}

/**
* Returns true of false to indicate if
* the spider is busy or idle.
*
* @return true = busy
* false = idle
*/
public boolean isBusy()

/SPAN>
return this.busy;
}

/**
* The run method causes this thread to go idle
* and wait for a workload. Once a workload is
* received, the processWorkload method is called
* to handle the workload.
*/
public void run()
{
for ( ;; ) {
target = this.owner.getWorkload();
if ( target==null )
return;
owner.getSpiderDone().workerBegin();
processWorkload();
owner.getSpiderDone().workerEnd();
}
}

/**
* The run method actually performs the
* the workload assigned to this object.
*/
public void processWorkload()
{
try {
busy = true;
Log.log(Log.LOG_LEVEL_NORMAL,"Spidering " + target );
http.send(target,null);
Attribute typeAttribute = http.getServerHeaders().get("Content-Type");

// if no content-type at all, its PROBABLY not HTML
if ( typeAttribute==null )
return;

// now check to see if is HTML, ONLY PARSE text type files(namely HTML)
owner.processPage(http);
if ( !typeAttribute.getValue().startsWith("text/") )
return;

HTMLParser parse = new HTMLParser();

parse.source = new StringBuffer(http.getBody());
// find all the links
while ( !parse.eof() ) {
char ch = parse.get();
if ( ch==0 ) {
HTMLTag tag = parse.getTag();
Attribute link = tag.get("HREF");
if ( link==null )
link = tag.get("SRC");

if ( link==null )
continue;

URL target=null;


E: 7.5pt"> try {
target = new URL(new URL(this.target),link.getValue());
} catch ( MalformedURLException e ) {
Log.log(Log.LOG_LEVEL_TRACE,
"Spider found other link: " + link );
owner.foundOtherLink(link.getValue());
continue;
}

if ( owner.getRemoveQuery() )
target = URLUtility.stripQuery(target);
target = URLUtility.stripAnhcor(target);


if ( target.getHost().equalsIgnoreCase(
new URL(this.target).getHost()) ) {
Log.log(Log.LOG_LEVEL_NORMAL,
"Spider found internal link: " + target.toString() );
owner.foundInternalLink(target.toString());
} else {
Log.log(Log.LOG_LEVEL_NORMAL,
"Spider found external link: " + target.toString() );
owner.foundExternalLink(target.toString());
}
}
}
owner.completePage(http,false);
} catch ( java.io.IOException e ) {
Log.log(Log.LOG_LEVEL_ERROR,
"Error loading file("+ target +"): " + e );
owner.completePage(http,true);
} catch ( Exception e ) {
Log.logException(
"Exception while processing file("+ target +"): ", e );
owner.completePage(http,true);
} finally {
busy = false;
}
}

/**
* Returns the HTTP descendant that this
* object should use for all HTTP communication.
*
* @return An HTTP descendant object.<

/FONT>
*/
public HTTP getHTTP()
{
return http;
}
}

文章出处:http://www.diybl.com/course/3_program/java/javajs/200797/69988_4.html

文章出处:http://www.diybl.com/course/3_program/java/javajs/200797/69988_3.html
文章出处:http://www.diybl.com/course/3_program/java/javajs/200797/69988_2.html

文章出处:http://www.diybl.com/course/3_program/java/javajs/200797/69988.html
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值