import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;
public class RetrivePage {

    // Shared client reused across calls (HttpClient 3.x keeps its own connection manager).
    private static HttpClient httpClient = new HttpClient();

    // Optionally configure a proxy server for the shared client.
    static {
        // Proxy server IP address and port:
        // httpClient.getHostConfiguration().setProxy("192.168.28.137", 8080);
    }

    /**
     * Fetches the page at {@code path} with an HTTP POST and, on a 200 response,
     * saves the response body to a hard-coded local file ({@code G:\pachon\hello.html}).
     *
     * <p>Redirect statuses (301/302/303/307) are detected and the {@code Location}
     * header is read, but following the redirect is left as a placeholder.
     *
     * @param path the URL to download
     * @return {@code true} if the page was downloaded and written to disk,
     *         {@code false} for any non-200 status (including redirects)
     * @throws HttpException on an HTTP protocol violation
     * @throws IOException   on a transport or file-write error
     */
    public static boolean downloadPage(String path) throws HttpException, IOException {
        // Build the POST method for the target URL.
        PostMethod postMethod = new PostMethod(path);
        // Example of attaching POST parameters:
        /*NameValuePair[] postData=new NameValuePair[2];
        postData[0]=new NameValuePair("name","lietu");
        postData[1]=new NameValuePair("password","*****");
        postMethod.addParameters(postData);*/
        try {
            // Execute the request and get the status code.
            int statusCode = httpClient.executeMethod(postMethod);
            // For simplicity, only a 200 response body is saved.
            if (statusCode == HttpStatus.SC_OK) {
                InputStream input = null;
                OutputStream output = null;
                try {
                    input = postMethod.getResponseBodyAsStream();
                    // Derive the output file name.
                    // String filename="G://pachon//"+path.substring(path.lastIndexOf('/')+1);
                    String filename = "G:\\pachon\\" + "hello.html";
                    System.out.println("filename=" + filename);
                    output = new FileOutputStream(filename);
                    // Copy the body to the file. EOF is signalled by read() == -1;
                    // comparing with > 0 would stop at the first zero byte and corrupt
                    // binary (and some text) downloads. A buffer avoids byte-at-a-time I/O.
                    byte[] buffer = new byte[4096];
                    int bytesRead;
                    while ((bytesRead = input.read(buffer)) != -1) {
                        output.write(buffer, 0, bytesRead);
                    }
                    return true;
                } finally {
                    // Always close the streams, even if the copy throws mid-way.
                    if (input != null) {
                        input.close();
                    }
                    if (output != null) {
                        output.close();
                    }
                }
            }
            // Handle redirect statuses: extract the new location.
            if ((statusCode == HttpStatus.SC_MOVED_TEMPORARILY)
                    || (statusCode == HttpStatus.SC_MOVED_PERMANENTLY)
                    || (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)
                    || statusCode == HttpStatus.SC_SEE_OTHER) {
                // Read the redirect target; the original code tested header==null before
                // dereferencing it, which threw an NPE on every redirect.
                Header header = postMethod.getResponseHeader("location");
                if (header != null) {
                    String newUrl = header.getValue();
                    if (newUrl == null || newUrl.equals("")) {
                        newUrl = "/";
                    }
                    // Follow the redirect with another POST.
                    PostMethod redirect = new PostMethod(newUrl);
                    // TODO: execute the redirect request and process its response.
                }
            }
            return false;
        } finally {
            // Release the connection back to HttpClient's connection manager;
            // without this, HttpClient 3.x leaks the connection.
            postMethod.releaseConnection();
        }
    }

    // Test code.
    public static void main(String[] args) {
        // Download the Lietu home page and report the result.
        try {
            RetrivePage.downloadPage("http://www.lietuw.com/");
            System.out.println("执行成功!");
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
// Java抓取网页爬虫 (Java web-page crawler example)
// 最新推荐文章于 2024-07-17 08:51:56 发布 (blog-post metadata, not part of the source)