第一次动手写爬虫,遇到了很多问题。编程本身就不是那么容易的,希望自己能一步步地解决掉这些问题。下边是目前遇到的问题:
package zhuawang;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.*;
import org.apache.commons.codec.*;
/**
 * Minimal page downloader built on Apache Commons HttpClient 3.x.
 *
 * <p>Sends a POST request to a URL and, when the server answers 200 OK, saves the
 * response body to a file in the current working directory.
 */
public class zhuawang {
    /** Size in bytes of the buffer used when copying the response body to disk. */
    private static final int BUFFER_SIZE = 8192;

    /** File name used when the URL path does not end in a usable name (e.g. ends with '/'). */
    private static final String DEFAULT_FILE_NAME = "index.html";

    /** Shared client; configured once by the static initializer below. */
    private static final HttpClient httpClient = new HttpClient();

    // Configure the proxy server (IP address and port) used for every request.
    // NOTE(review): this proxy also applies to the localhost URL requested in main();
    // presumably an unreachable proxy is what produces "Connection timed out" — confirm.
    static {
        httpClient.getHostConfiguration().setProxy("172.27.35.1", 8080);
    }

    /**
     * POSTs two fixed test parameters to {@code path} and stores the response body in a
     * local file named after the last segment of the URL path.
     *
     * @param path the URL to request
     * @return {@code true} if the server answered 200 OK and the body was written;
     *         {@code false} for any other status code
     * @throws HttpException if an HTTP protocol error occurs
     * @throws IOException if the connection fails or the file cannot be written
     */
    public static boolean downloadPage(String path) throws HttpException, IOException {
        PostMethod postMethod = new PostMethod(path);

        // Test parameters sent in the POST body.
        // NOTE(review): the key is spelled "pasword"; verify against the name the server expects.
        NameValuePair[] postData = new NameValuePair[2];
        postData[0] = new NameValuePair("name", "baidu");
        postData[1] = new NameValuePair("pasword", "123456");
        postMethod.addParameters(postData);

        try {
            int statusCode = httpClient.executeMethod(postMethod);
            // For simplicity only a 200 response is treated as success.
            if (statusCode != HttpStatus.SC_OK) {
                return false;
            }

            // May be null when the response carries no body; an empty file is written then.
            InputStream input = postMethod.getResponseBodyAsStream();
            try {
                OutputStream output = new FileOutputStream(fileNameFromPath(path));
                try {
                    if (input != null) {
                        byte[] buffer = new byte[BUFFER_SIZE];
                        int bytesRead;
                        // read() signals end-of-stream with -1; a count or byte of 0 is not EOF.
                        while ((bytesRead = input.read(buffer)) != -1) {
                            output.write(buffer, 0, bytesRead);
                        }
                    }
                } finally {
                    output.close();
                }
            } finally {
                if (input != null) {
                    input.close();
                }
            }
            return true;
        } finally {
            // Always hand the connection back, whether the request succeeded or failed.
            postMethod.releaseConnection();
        }
    }

    /**
     * Derives a local file name from a URL: drops any query string and fragment, then
     * takes the text after the last '/'. Falls back to {@link #DEFAULT_FILE_NAME} when
     * nothing remains.
     */
    private static String fileNameFromPath(String path) {
        String trimmed = path;
        int fragmentStart = trimmed.indexOf('#');
        if (fragmentStart >= 0) {
            trimmed = trimmed.substring(0, fragmentStart);
        }
        int queryStart = trimmed.indexOf('?');
        if (queryStart >= 0) {
            trimmed = trimmed.substring(0, queryStart);
        }
        String name = trimmed.substring(trimmed.lastIndexOf('/') + 1);
        return name.length() == 0 ? DEFAULT_FILE_NAME : name;
    }

    /** Test driver: downloads a page from a local test server and prints any failure. */
    public static void main(String[] args) {
        try {
            zhuawang.downloadPage("http://localhost:8080/firstTest.htm?method=test");
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
程序运行后出现异常,控制台输出如下:
request
四月 22, 2016 9:38:30 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
四月 22, 2016 9:38:30 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request
.四月 22, 2016 9:38:51 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: I/O exception (java.net.ConnectException) caught when processing request: Connection timed out: connect
四月 22, 2016 9:38:51 下午 org.apache.commons.httpclient.HttpMethodDirector executeWithRetry
信息: Retrying request