呵呵，核心代码，挺详细的，考虑比较周到。只是观察下来，还缺少设定 timeout，也还缺 proxy 部分：JSpider 虽已支持 proxy，不过用的是 Java 内置的方法，结果就是同一时间只有一个 proxy 起作用。
package
net.javacoding.jspider.core.task.work;
import net.javacoding.jspider.api.model.HTTPHeader;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.logging.LogFactory;
import net.javacoding.jspider.core.event.CoreEvent;
import net.javacoding.jspider.core.event.impl. * ;
import net.javacoding.jspider.core.task.WorkerTask;
import net.javacoding.jspider.core.util.http.HTTPHeaderUtil;
import net.javacoding.jspider.core.util.URLUtil;
import java.io. * ;
import java.net. * ;
/**
 * $Id: SpiderHttpURLTask.java,v 1.19 2003/04/10 16:19:14 vanrogu Exp $
 *
 * @author Günther Van Roey
 */
public class SpiderHttpURLTask extends BaseWorkerTaskImpl ... {
protected URL url;
protected Site site;
public SpiderHttpURLTask(SpiderContext context, URL url, Site site) ...{
super(context, WorkerTask.WORKERTASK_SPIDERTASK);
this.url = url;
this.site = site;
}
public void prepare() ...{
context.throttle(site);
}
public void execute() ...{
CoreEvent event = null;
URLConnection connection = null;
InputStream inputStream = null;
int httpStatus = 0;
HTTPHeader[] headers = null;
try ...{
connection = url.openConnection();
if (connection instanceof HttpURLConnection) ...{
((HttpURLConnection) connection).setInstanceFollowRedirects(false);
}
connection.setRequestProperty("User-agent", site.getUserAgent());
context.preHandle(connection, site);
long start = System.currentTimeMillis();
connection.connect();
if (connection instanceof HttpURLConnection) ...{
httpStatus = ((HttpURLConnection) connection).getResponseCode();
switch (httpStatus) ...{
case HttpURLConnection.HTTP_MOVED_PERM:
case HttpURLConnection.HTTP_MOVED_TEMP:
String redirectURL = connection.getHeaderField("location");
notifyEvent(url, new URLFoundEvent(context, url, URLUtil.normalize(new URL(redirectURL))));
break;
default:
break;
}
}
inputStream = new BufferedInputStream(connection.getInputStream());
ByteArrayOutputStream os = new ByteArrayOutputStream();
InputStream is = new BufferedInputStream(inputStream);
//int size = connection.getContentLength();
int size = 0;
try ...{
int i = is.read();
while (i != -1) ...{
size++;
os.write(i);
i = is.read();
}
} catch (IOException e) ...{
LogFactory.getLog(SpiderHttpURLTask.class).error("i/o exception during fetch",e);
}
String contentType = connection.getContentType();
int timeMs = (int) (System.currentTimeMillis() - start);
headers = HTTPHeaderUtil.getHeaders(connection);
if (httpStatus >= 200 && httpStatus < 303) ...{
event = new URLSpideredOkEvent(context, url, httpStatus, connection, contentType, timeMs, size, os.toByteArray(), headers);
} else ...{
event = new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, null);
}
context.postHandle(connection, site);
} catch (FileNotFoundException e) ...{
headers = HTTPHeaderUtil.getHeaders(connection);
event = new URLSpideredErrorEvent(context, url, 404, connection, headers, e);
} catch (Exception e) ...{
LogFactory.getLog(this.getClass()).error("exception during spidering", e);
event = new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, e);
} finally ...{
notifyEvent(url, event);
if (inputStream != null) ...{
try ...{
inputStream.close();
} catch (IOException e) ...{
LogFactory.getLog(SpiderHttpURLTask.class).error("i/o exception closing inputstream",e);
}
}
}
}
}
import net.javacoding.jspider.api.model.HTTPHeader;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.logging.LogFactory;
import net.javacoding.jspider.core.event.CoreEvent;
import net.javacoding.jspider.core.event.impl. * ;
import net.javacoding.jspider.core.task.WorkerTask;
import net.javacoding.jspider.core.util.http.HTTPHeaderUtil;
import net.javacoding.jspider.core.util.URLUtil;
import java.io. * ;
import java.net. * ;
/**
 * $Id: SpiderHttpURLTask.java,v 1.19 2003/04/10 16:19:14 vanrogu Exp $
 *
 * @author Günther Van Roey
 */
public class SpiderHttpURLTask extends BaseWorkerTaskImpl ... {
protected URL url;
protected Site site;
public SpiderHttpURLTask(SpiderContext context, URL url, Site site) ...{
super(context, WorkerTask.WORKERTASK_SPIDERTASK);
this.url = url;
this.site = site;
}
public void prepare() ...{
context.throttle(site);
}
public void execute() ...{
CoreEvent event = null;
URLConnection connection = null;
InputStream inputStream = null;
int httpStatus = 0;
HTTPHeader[] headers = null;
try ...{
connection = url.openConnection();
if (connection instanceof HttpURLConnection) ...{
((HttpURLConnection) connection).setInstanceFollowRedirects(false);
}
connection.setRequestProperty("User-agent", site.getUserAgent());
context.preHandle(connection, site);
long start = System.currentTimeMillis();
connection.connect();
if (connection instanceof HttpURLConnection) ...{
httpStatus = ((HttpURLConnection) connection).getResponseCode();
switch (httpStatus) ...{
case HttpURLConnection.HTTP_MOVED_PERM:
case HttpURLConnection.HTTP_MOVED_TEMP:
String redirectURL = connection.getHeaderField("location");
notifyEvent(url, new URLFoundEvent(context, url, URLUtil.normalize(new URL(redirectURL))));
break;
default:
break;
}
}
inputStream = new BufferedInputStream(connection.getInputStream());
ByteArrayOutputStream os = new ByteArrayOutputStream();
InputStream is = new BufferedInputStream(inputStream);
//int size = connection.getContentLength();
int size = 0;
try ...{
int i = is.read();
while (i != -1) ...{
size++;
os.write(i);
i = is.read();
}
} catch (IOException e) ...{
LogFactory.getLog(SpiderHttpURLTask.class).error("i/o exception during fetch",e);
}
String contentType = connection.getContentType();
int timeMs = (int) (System.currentTimeMillis() - start);
headers = HTTPHeaderUtil.getHeaders(connection);
if (httpStatus >= 200 && httpStatus < 303) ...{
event = new URLSpideredOkEvent(context, url, httpStatus, connection, contentType, timeMs, size, os.toByteArray(), headers);
} else ...{
event = new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, null);
}
context.postHandle(connection, site);
} catch (FileNotFoundException e) ...{
headers = HTTPHeaderUtil.getHeaders(connection);
event = new URLSpideredErrorEvent(context, url, 404, connection, headers, e);
} catch (Exception e) ...{
LogFactory.getLog(this.getClass()).error("exception during spidering", e);
event = new URLSpideredErrorEvent(context, url, httpStatus, connection, headers, e);
} finally ...{
notifyEvent(url, event);
if (inputStream != null) ...{
try ...{
inputStream.close();
} catch (IOException e) ...{
LogFactory.getLog(SpiderHttpURLTask.class).error("i/o exception closing inputstream",e);
}
}
}
}
}