最近由于公司业务需求,我临时从 Android 开发转到了网络爬虫方面的研究,小公司伤不起呀!在爬取 XX 网站时,某些页面会发生 URL 重定向,而重定向的 URL 中包含特殊字符(例如空格),导致访问异常。在网上找了一些资料,研究了一下,终于解决了,分享给大家!
HttpClient 4.0 之前的版本需要手工处理重定向;4.0 以后默认会自动执行重定向操作。由于重定向的 URL 中包含特殊字符,自动重定向会访问失败,所以需要让它不自动执行重定向,而由我们自己先对 URL 中的特殊字符进行转义,再处理重定向请求。实现代码如下:
示例代码
package com.yulore.test;
import java.io.IOException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.ParseException;
import org.apache.http.ProtocolException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.RedirectStrategy;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
/**
 * Demonstrates manual handling of HTTP redirects with Apache HttpClient 4.x.
 *
 * <p>Automatic redirect following is switched off so that the raw
 * {@code Location} header can be inspected and sanitised (spaces are
 * percent-encoded) before the redirect target is requested.
 */
public class HttpClientURLRedirectTest {

    /**
     * Runs the redirect demo.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        redirect02();
    }

    /**
     * Requests a fixed URL with automatic redirects disabled. A 200 response
     * is printed directly; a redirect response has its {@code Location} value
     * printed, space-escaped, resolved against the request URI and then
     * fetched through {@link #get(String)}.
     */
    private static void redirect02() {
        String url = "http://hotels.ctrip.com/hotel/hong-kong58";
        // Created outside the try block so the finally clause never sees null.
        DefaultHttpClient httpclient = new DefaultHttpClient();
        try {
            // Never follow redirects automatically; they are handled below.
            httpclient.setRedirectStrategy(new RedirectStrategy() {
                @Override
                public boolean isRedirected(HttpRequest request,
                        HttpResponse response, HttpContext context)
                        throws ProtocolException {
                    return false;
                }

                @Override
                public HttpUriRequest getRedirect(HttpRequest request,
                        HttpResponse response, HttpContext context)
                        throws ProtocolException {
                    // Not consulted: isRedirected always answers false.
                    return null;
                }
            });

            HttpGet httpget = new HttpGet(url);
            HttpResponse response = httpclient.execute(httpget);
            int statusCode = response.getStatusLine().getStatusCode();

            if (statusCode == HttpStatus.SC_OK) {
                printEntity(response);
            } else if (isRedirectStatus(statusCode)) {
                System.out.println("当前页面发生重定向了---");
                Header location = response.getFirstHeader("Location");
                if (location != null) {
                    String redirectUrl = location.getValue();
                    System.out.println("重定向的URL:" + redirectUrl);
                    // A raw space is illegal in a URI; escape it first.
                    redirectUrl = redirectUrl.replace(" ", "%20");
                    // Location may be relative; resolving against the request
                    // URI leaves an absolute value unchanged.
                    redirectUrl = httpget.getURI().resolve(redirectUrl).toString();
                    get(redirectUrl);
                }
            } else {
                System.out.println("Unexpected response status: " + statusCode);
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close connections and release resources.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Sends a GET request to {@code url} using a default client and prints
     * the response body when the status is 200.
     *
     * @param url address to request
     */
    private static void get(String url) {
        HttpClient httpclient = new DefaultHttpClient();
        try {
            HttpGet httpget = new HttpGet(url);
            System.out.println("executing request " + httpget.getURI());
            HttpResponse response = httpclient.execute(httpget);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                printEntity(response);
            } else {
                System.out.println("Unexpected response status: " + statusCode);
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close connections and release resources.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /** Returns whether {@code statusCode} is one of 301, 302, 303 or 307. */
    private static boolean isRedirectStatus(int statusCode) {
        return statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                || statusCode == HttpStatus.SC_MOVED_TEMPORARILY
                || statusCode == HttpStatus.SC_SEE_OTHER
                || statusCode == HttpStatus.SC_TEMPORARY_REDIRECT;
    }

    /** Prints the content length and body of {@code response}, if it has an entity. */
    private static void printEntity(HttpResponse response) throws IOException {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            System.out.println("Response content length: "
                    + entity.getContentLength());
            System.out.println("Response content: "
                    + EntityUtils.toString(entity));
        }
    }
}
运行结果
当前页面发生重定向了---
重定向的URL:http://hotels.ctrip.com/hotel/hong kong58
OK,搞定啦!