package creeper.part1.capturepage;
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
//爬虫技术(1)--抓取网页
@SuppressWarnings("unused")
// NOTE(review): class name should be UpperCamelCase (CapturePage), but renaming
// would change the public class / file-name contract, so it is kept as-is.
public class capturePage {
    /**
     * Entry point: fetches {@code http://www.baidu.com} with Apache HttpClient 4.x
     * and prints the response body to stdout.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the request fails, the status is non-2xx, or the
     *                   client cannot be closed — errors now propagate instead of
     *                   being silently swallowed by an empty catch block
     */
    public static void main(String[] args) throws Exception {
        // try-with-resources closes the client even when the request throws,
        // replacing the original try / empty-catch / finally, whose empty
        // catch (Exception e) {} silently discarded every failure.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            // A GET request — conceptually, "opening a page" in a browser.
            String url = "http://www.baidu.com";
            HttpGet get = new HttpGet(url);
            System.out.println("---------URI----------");
            System.out.println(get.getURI());

            // Response handler: return the body for 2xx statuses, fail otherwise.
            ResponseHandler<String> handler = new ResponseHandler<String>() {
                @Override
                public String handleResponse(HttpResponse response)
                        throws ClientProtocolException, IOException {
                    int status = response.getStatusLine().getStatusCode();
                    if (status >= 200 && status < 300) {
                        HttpEntity entity = response.getEntity();
                        // A 2xx response may legally carry no body (e.g. 204).
                        return entity == null ? null : EntityUtils.toString(entity);
                    } else {
                        throw new ClientProtocolException("status:" + status);
                    }
                }
            };

            // Execute the request; the handler converts the response to a String.
            String responseBody = httpClient.execute(get, handler);
            System.out.println("----------------responseBody-----------------");
            System.out.println(responseBody);
            System.out.println("----------------responseBody-----------------");
        }
    }
}
// Trailing blog text accidentally pasted after the class (commented out so the
// file compiles): 爬虫技术(2)--抓取网页java代码实现
// 最新推荐文章于 2018-09-24 14:37:00 发布