网络爬虫:
HttpClient
网络爬虫就是用程序帮助我们访问网络上的资源。我们平时通过HTTP协议访问互联网上的网页,
编写爬虫程序时同样使用HTTP协议来访问网页。
这里我们使用Java的HTTP协议客户端HttpClient这一技术,来实现抓取网页数据。
4.1 GET请求不含参
// Create the default HttpClient instance.
CloseableHttpClient httpClient = HttpClients.createDefault();
// Create the HttpGet request with the target URL.
HttpGet httpGet = new HttpGet("http://www.itheima.com");
CloseableHttpResponse response = null;
try {
    // Execute the request and obtain the response.
    response = httpClient.execute(httpGet);
    // Parse the response body only on HTTP 200.
    if (response.getStatusLine().getStatusCode() == 200) {
        String content = EntityUtils.toString(response.getEntity(), "utf8");
        System.out.println(content.length());
    }
} catch (IOException e) {
    e.printStackTrace();
} finally {
    // BUG FIX: the original called response.close() unconditionally; if
    // execute() threw, response was still null and the finally block
    // itself raised a NullPointerException, masking the real error.
    if (response != null) {
        try {
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    try {
        httpClient.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
}
(2) GET请求含参。
// Create the default HttpClient instance.
CloseableHttpClient httpClient = HttpClients.createDefault();
// Target address: https://www.boxuegu.com/subject/gather/indexhm.html?a=tQ7qmh&utm_source=heima
// Use URIBuilder to assemble the query string instead of concatenating by hand.
URIBuilder builder = new URIBuilder("https://www.boxuegu.com/subject/gather/indexhm.html");
builder.setParameter("a", "tQ7qmh");
builder.setParameter("utm_source", "heima");
// Build the final URI and wrap it in an HttpGet request.
HttpGet request = new HttpGet(builder.build());
System.out.println("发送的信息是" + request);
(3) POST请求不含参。
// Create the default HttpClient instance.
CloseableHttpClient client = HttpClients.createDefault();
// Build a POST request pointing at the target URL.
HttpPost post = new HttpPost("http://www.itheima.com");
(4) POST带参请求。
// POST request whose parameters travel in the body as a URL-encoded form.
HttpPost post = new HttpPost("https://www.boxuegu.com/subject/gather/indexhm.html");
// Collect the form fields as name/value pairs.
List<NameValuePair> formParams = new ArrayList<>();
formParams.add(new BasicNameValuePair("a", "tQ7qmh"));
formParams.add(new BasicNameValuePair("utm_source", "heima"));
// Wrap the fields in a form entity; the second argument is the character encoding.
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(formParams, "utf8");
// Attach the form entity as the request body.
post.setEntity(formEntity);
CloseableHttpResponse response = null;
(5) 连接池管理
public static void main(String[] args) {
    // Connection-pool manager shared by every request below.
    PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
    // Cap on connections across all hosts combined.
    connectionManager.setMaxTotal(100);
    // Cap on connections to any single host (route).
    connectionManager.setDefaultMaxPerRoute(10);
    // Issue two requests through the shared pool.
    doGet(connectionManager);
    doGet(connectionManager);
}
private static void doGet(PoolingHttpClientConnectionManager mc) {
//不是每次创新的 HttpClient ,而是从连接池中获取HttpClient对象
CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(mc).build();
HttpGet httpGet = new HttpGet("http://www.itheima.com");
CloseableHttpResponse response = null;
(6) 请求超时时间设置,可以提高爬虫抓取信息的效率。
// Timeout configuration for the request.
RequestConfig requestConfig = RequestConfig.custom()
        .setConnectTimeout(1000)           // max ms to establish the TCP connection
        .setConnectionRequestTimeout(500)  // max ms to lease a connection from the pool
        .setSocketTimeout(10 * 1000)       // max ms of inactivity while reading data
        .build();
// Apply the configuration to this request.
httpGet.setConfig(requestConfig);
姬冰可