前两天遇到一个问题:用 HttpClient 抓取网页时返回 403,而用浏览器打开同一地址却没有问题。抓取代码很简单,是从网上找的一版:
// Fetch a single page with a default HttpClient and print its body.
// A non-200 status is reported instead of being silently ignored, so that
// failures such as 403 are visible to whoever runs the snippet.
String url = "http://localhost:8080/HttpClientDemo/test";
HttpGet httpRequest = new HttpGet(url);
HttpClient httpClient = new DefaultHttpClient();
try {
    HttpResponse response = httpClient.execute(httpRequest);
    if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
        // "UTF-8" is only a fallback: it is used when the response does not
        // declare a charset itself (the library default would be ISO-8859-1).
        String result = EntityUtils.toString(response.getEntity(), "UTF-8");
        System.out.println(result);
    } else {
        // Surface the failure (e.g. 403 Forbidden) rather than printing nothing.
        System.err.println("Request to " + url + " failed: " + response.getStatusLine());
    }
} finally {
    // Release the underlying connection even when the body was never read
    // or an exception was thrown.
    httpClient.getConnectionManager().shutdown();
}
要模拟浏览器,就需要设置 HTTP 的相关参数,于是重写了一版:
// Build a pooled, thread-safe HttpClient whose limits and timeouts come from
// the spider configuration.

// Maximum total number of connections.
int maxTotalConnections = Integer.parseInt(
        ReadSpiderConfig.getValue("max_connections"));
// Maximum time to wait when obtaining a connection.
int poolWaitTimeout = Integer.parseInt(
        ReadSpiderConfig.getValue("wait_connection_timeout"));
// Connection timeout.
int connectTimeout = Integer.parseInt(
        ReadSpiderConfig.getValue("connection_timeout"));
// Read timeout.
int socketReadTimeout = Integer.parseInt(
        ReadSpiderConfig.getValue("read_timeout"));

// One parameter set, shared by the connection manager and the client.
HttpParams httpParams = new BasicHttpParams();
ConnManagerParams.setMaxTotalConnections(httpParams, maxTotalConnections);
ConnManagerParams.setTimeout(httpParams, poolWaitTimeout);
// Maximum connections per route, i.e. the cap on concurrent requests to the
// same site.
ConnManagerParams.setMaxConnectionsPerRoute(httpParams, new ConnPerRouteBean(100));
HttpConnectionParams.setConnectionTimeout(httpParams, connectTimeout);
HttpConnectionParams.setSoTimeout(httpParams, socketReadTimeout);

// Register plain HTTP on port 80 and HTTPS on port 443.
SchemeRegistry schemes = new SchemeRegistry();
schemes.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
schemes.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));

ClientConnectionManager connectionManager =
        new ThreadSafeClientConnManager(httpParams, schemes);
// Redirect handling is not configured here; to turn it off, set
// ClientPNames.HANDLE_REDIRECTS to false on httpParams before this point.
HttpClient client = new DefaultHttpClient(connectionManager, httpParams);