JSpider源码研究:核心SpiderHttpURLTask

呵呵,核心代码,挺详细的,考虑比较周到,隐士观察下来,还缺少设定timeout,还缺proxy部分,因为JSpider已经支持proxy,不过是用java内置的方法,结果就是同一时间只有一个proxy起作用。 
package  net.javacoding.jspider.core.task.work;


import  net.javacoding.jspider.api.model.HTTPHeader;
import  net.javacoding.jspider.api.model.Site;
import  net.javacoding.jspider.core.SpiderContext;
import  net.javacoding.jspider.core.logging.LogFactory;
import  net.javacoding.jspider.core.event.CoreEvent;
import  net.javacoding.jspider.core.event.impl. * ;
import  net.javacoding.jspider.core.task.WorkerTask;
import  net.javacoding.jspider.core.util.http.HTTPHeaderUtil;
import  net.javacoding.jspider.core.util.URLUtil;

import  java.io. * ;
import  java.net. * ;


/**
 *
 * $Id: SpiderHttpURLTask.java,v 1.19 2003/04/10 16:19:14 vanrogu Exp $
 *
 * 
 * @author Günther Van Roey
 
*/

public   class  SpiderHttpURLTask  extends  BaseWorkerTaskImpl  {

    
protected URL url;
    
protected Site site;


    
public SpiderHttpURLTask(SpiderContext context, URL url, Site site) {
        
super(context, WorkerTask.WORKERTASK_SPIDERTASK);
        
this.url = url;
        
this.site = site;
    }


    
public void prepare() {
        context.throttle(site);
    }


    
public void execute() {

        CoreEvent event 
= null;
        URLConnection connection 
= null;

        InputStream inputStream 
= null;

        
int httpStatus = 0;
        HTTPHeader[] headers 
= null;

        
try {

            connection 
= url.openConnection();

            
if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).setInstanceFollowRedirects(
false);
            }


            connection.setRequestProperty(
"User-agent", site.getUserAgent());
            context.preHandle(connection, site);

            
long start = System.currentTimeMillis();
            connection.connect();

            
if (connection instanceof HttpURLConnection) {
                httpStatus 
= ((HttpURLConnection) connection).getResponseCode();
                
switch (httpStatus) {
                    
case HttpURLConnection.HTTP_MOVED_PERM:
                    
case HttpURLConnection.HTTP_MOVED_TEMP:
                        String redirectURL 
= connection.getHeaderField("location");
                        notifyEvent(url, 
new URLFoundEvent(context, url, URLUtil.normalize(new URL(redirectURL))));
                        
break;
                    
default:
                        
break;
                }

            }

            inputStream 
= new BufferedInputStream(connection.getInputStream());

            ByteArrayOutputStream os 
= new ByteArrayOutputStream();
            InputStream is 
= new BufferedInputStream(inputStream);
            
//int size = connection.getContentLength();
            int size = 0;
            
try {
                    
int i = is.read();
                    
while (i != -1{
                        size
++;
                        os.write(i);
                        i 
= is.read();
                    }

            }
 catch (IOException e) {
                LogFactory.getLog(SpiderHttpURLTask.
class).error("i/o exception during fetch",e);
            }


            String contentType 
= connection.getContentType();
            
int timeMs = (int) (System.currentTimeMillis() - start);

            headers 
= HTTPHeaderUtil.getHeaders(connection);

            
if (httpStatus >= 200 && httpStatus < 303{
                event 
= new URLSpideredOkEvent(context, url, httpStatus, connection, contentType, timeMs, size, os.toByteArray(), headers);
            }
 else {
                event 
= new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, null);
            }


            context.postHandle(connection, site);

        }
 catch (FileNotFoundException e) {
            headers 
= HTTPHeaderUtil.getHeaders(connection);
            event 
= new URLSpideredErrorEvent(context, url, 404, connection, headers, e);
        }
 catch (Exception e) {
            LogFactory.getLog(
this.getClass()).error("exception during spidering", e);
            event 
= new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, e);
        }
 finally {
            notifyEvent(url, event);
            
if (inputStream != null{
                
try {
                    inputStream.close();
                }
 catch (IOException e) {
                    LogFactory.getLog(SpiderHttpURLTask.
class).error("i/o exception closing inputstream",e);
                }

            }

        }

    }


}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值