1.5 抓取网页
现在我们把前面的内容整合在一起,尝试完整地抓取一次网页。大致思路已经十分清楚:首先建立连接,然后下载网页内容,最后将网页内容存储到本地硬盘中。
import java.io.BufferedInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.Callable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import book.SimpleClient;
public class DownLoadFile{
/**
 * Derives a local file name from the page url and its MIME content type,
 * replacing characters that are illegal in file names with '_'.
 *
 * @param url         the page address, expected to start with "http://"
 * @param contentType the Content-Type header value, e.g. "text/html; charset=utf-8"
 * @return a file name such as "host_path.html" or "host_path.pdf"
 */
public String getFileNameByUrl(String url,String contentType)
{
    // Strip the "http://" protocol prefix (7 characters).
    url = url.substring(7);
    // text/html type: sanitize and append ".html".
    if (contentType.indexOf("html") != -1 || contentType.indexOf("htm") != -1)
    {
        return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
    }
    // Other types (e.g. application/pdf): use the MIME subtype as the extension.
    // BUG FIX: the original assumed a trailing ";charset=..." was always present;
    // when there is no ';', lastIndexOf(';') returns -1 and substring(begin, -1)
    // throws StringIndexOutOfBoundsException. Guard against a missing ';'.
    int slash = contentType.lastIndexOf('/');
    int semi = contentType.lastIndexOf(';');
    String extension = (semi > slash)
            ? contentType.substring(slash + 1, semi)
            : contentType.substring(slash + 1);
    return url.replaceAll("[\\?/:*|<>\"]", "_") + "." + extension;
}
/**
 * Saves a byte array of page content to a local file.
 *
 * @param data     the page content to write
 * @param filePath relative path of the destination file
 */
private void saveToLocal(byte[] data, String filePath) {
    DataOutputStream out = null;
    try {
        out = new DataOutputStream(new FileOutputStream(new File(filePath)));
        // Write the whole array in one call instead of byte-by-byte.
        out.write(data);
        out.flush();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // BUG FIX: the original closed the stream only on the success path,
        // leaking the file handle whenever an IOException occurred mid-write.
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Streams page content from the given input stream into a local file,
 * reading through a small fixed-size buffer.
 *
 * @param input    the response body stream (closed by this method)
 * @param filePath relative path of the destination file
 */
private void saveToLocal(InputStream input, String filePath) {
    DataOutputStream out = null;
    try {
        out = new DataOutputStream(new FileOutputStream(new File(filePath)));
        byte[] buffer = new byte[200];
        int count = input.read(buffer);
        while (count > -1) {
            // BUG FIX: write only the bytes actually read. The original wrote
            // the entire buffer on every iteration, so any partial read (which
            // InputStream.read is always allowed to return) appended stale
            // bytes from the previous iteration and corrupted the saved file.
            out.write(buffer, 0, count);
            count = input.read(buffer);
        }
        out.flush();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close both streams even when an exception occurs (the original
        // leaked them on any IOException).
        try {
            if (input != null) input.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            if (out != null) out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
运行上面的代码时,你也许会看到这样一条警告:七月 14, 2014 8:42:59 下午 org.apache.commons.httpclient.HttpMethodBase getResponseBody
警告: Going to buffer response body of large or unknown size. Using getResponseBodyAsStream instead is recommended.
警告信息本身已经给出了解决办法——改用 getResponseBodyAsStream,按它说的来即可。byte[] buffer=new byte[200]; 这一行设置了一个缓冲区,暂时将其大小定为 200 字节;有了缓冲区,分块读取并下载内容就会方便很多。
// GET method shared between setClient() and downloadFile().
// NOTE(review): stored in a field rather than passed around, so one instance
// of this class cannot safely download more than one url concurrently.
GetMethod getMethod;
/**
 * Configures the connection timeout on the given client and creates the
 * GetMethod for the url, setting its request timeout and retry handler.
 */
protected void setClient(HttpClient httpClient,String url){
// Set the HTTP connection timeout to 5 s.
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(
5000);
/* 2. Create the GetMethod object and set its parameters. */
getMethod = new GetMethod(url);
// Set the GET request (socket) timeout to 5 s.
getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
// Install the default retry handler for transient request failures.
getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
new DefaultHttpMethodRetryHandler());
}
// Provider of the shared HttpClient instance (see book.SimpleClient).
SimpleClient sc = new SimpleClient();

/**
 * Downloads the page the url points to and saves it under the configured
 * file prefix (see setFileprefix).
 *
 * @param url the page address to fetch
 * @return the local file path of the saved page, or null on failure/redirect
 */
public String downloadFile(String url) {
    String filePath = null;
    /* 1. Obtain the HttpClient instance and configure it. */
    HttpClient httpClient = sc.getSingnal();
    setClient(httpClient, url);
    /* 3. Execute the HTTP GET request. */
    int statusCode;
    try {
        statusCode = httpClient.executeMethod(getMethod);
        // Handle redirects FIRST. BUG FIX: in the original this branch was
        // unreachable, because the "statusCode != SC_OK" check returned null
        // before any 3xx status could reach it.
        if ((statusCode == HttpStatus.SC_MOVED_TEMPORARILY)
                || (statusCode == HttpStatus.SC_MOVED_PERMANENTLY)
                || (statusCode == HttpStatus.SC_SEE_OTHER)
                || (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) {
            // Read the new URL from the Location header.
            Header header = getMethod.getResponseHeader("location");
            if (header != null) {
                String newuri = header.getValue();
                if ((newuri == null) || (newuri.equals("")))
                    newuri = "/";
                GetMethod redirect = new GetMethod(newuri);
                httpClient.executeMethod(redirect);
                System.out.println("Redirect:"
                        + redirect.getStatusLine().toString());
                // Release the extra connection the original code leaked.
                redirect.releaseConnection();
            } else {
                System.out.println("Invalid redirect");
            }
            // As before, a non-200 response yields no saved file.
            return null;
        }
        // Any other non-200 status is a failure.
        if (statusCode != HttpStatus.SC_OK) {
            System.err.println("Method failed: "
                    + getMethod.getStatusLine());
            return null;
        }
        /* 4. Process the HTTP response body via a buffered stream. */
        InputStream is = getMethod.getResponseBodyAsStream();
        BufferedInputStream nin = new BufferedInputStream(is, 2000);
        // Derive the file name to save under from the page url and type.
        filePath = fileprefix
                + getFileNameByUrl(url,
                        getMethod.getResponseHeader("Content-Type")
                                .getValue());
        saveToLocal(nin, filePath);
    } catch (HttpException e) {
        // Fatal protocol-level problem (bad address or malformed response).
        System.out.println("Please check your provided http address!");
        e.printStackTrace();
    } catch (IOException e) {
        // Network-level failure.
        e.printStackTrace();
    } finally {
        // Always release the connection back to the manager.
        getMethod.releaseConnection();
    }
    return filePath;
}
// File-name prefix (target directory) prepended to every saved page.
// BUG FIX: this declaration was missing from the listing; without it the
// class does not compile, since downloadFile() and both accessors use it.
private String fileprefix = "";

/** Returns the directory prefix under which pages are saved. */
public String getFileprefix() {
    return fileprefix;
}

/** Sets the directory prefix under which pages are saved. */
public void setFileprefix(String fileprefix) {
    this.fileprefix = fileprefix;
}
/** Smoke test: downloads one sample page into E:\html\. */
public static void main(String[] args) {
    DownLoadFile downloader = new DownLoadFile();
    downloader.setFileprefix("E:\\html\\");
    downloader.downloadFile("http://bbs.tianya.cn/list-456-1.shtml");
}
}
运行结果如下:
我们成功抓取了这个网页。考虑到以后要抓取大量网页,必须对抓取过程进行有效的管理,例如统计已抓取的网页数、计算抓取效率,等等。
/**
 * Tracks simple crawl statistics (page count, elapsed time, rate) around a
 * DownLoadFile worker.
 */
public class DownloadManager {

    // Worker that performs the actual page downloads.
    private DownLoadFile download = null;

    // Number of pages downloaded so far.
    // NOTE(review): static, so the count is shared by ALL manager instances.
    private static int downLoadNumber;

    /** Increments the shared download counter. */
    public void add() {
        downLoadNumber++;
    }

    // Timestamp (ms) when downloading started; set via setStartTime().
    private long startTime;

    /** Returns the number of pages downloaded so far. */
    public int getDownLoadNumber() {
        return downLoadNumber;
    }

    // Local path of the most recently downloaded file ("" before any download).
    private String name = "";

    public DownloadManager() {
        download = new DownLoadFile();
    }

    /** Downloads the given url and remembers the resulting file path. */
    public void beginDownload(String url) {
        name = download.downloadFile(url);
    }

    /** Stops downloading by discarding the worker. */
    public void stopDownLoad() {
        download = null;
    }

    /** Returns the file path of the most recent download. */
    public String getName() {
        return name;
    }

    /**
     * Elapsed time since startTime, in whole seconds.
     */
    public long downLoadTime() {
        return (System.currentTimeMillis() - startTime) / 1000;
    }

    /**
     * Download rate in pages per second.
     * BUG FIX: the original divided an int by a long, truncating the result
     * to a whole number despite the float return type, and threw
     * ArithmeticException (divide by zero) whenever less than one second
     * had elapsed.
     */
    public float downLoadEfficiency() {
        long seconds = downLoadTime();
        if (seconds <= 0) {
            return 0f;
        }
        return downLoadNumber / (float) seconds;
    }

    public static void main(String[] args) {
        // No demo here; see DownLoadFile.main for an end-to-end example.
    }

    public long getStartTime() {
        return startTime;
    }

    public void setStartTime(long startTime) {
        this.startTime = startTime;
    }
}