【Java爬虫】005-HttpClient学习笔记(补充:网页内容获取相关)

最后更新时间:2020年8月31日11:42:37

 

一、创建HttpClient实例的6种方法

package com.zb.crawler.httpclient;

import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;

/**
 * Demonstrates six different ways of obtaining an {@link HttpClient} instance.
 * The instances are only created here; none of them is used to send a request.
 */
public class Main {
    public static void main(String[] args) {
        // 1. Direct construction of DefaultHttpClient (marked by the original
        //    author as outdated and not recommended).
        HttpClient legacyClient = new DefaultHttpClient();

        // 2. Builder obtained through the HttpClients facade.
        HttpClient customClient = HttpClients.custom().build();

        // 3. Builder obtained directly from HttpClientBuilder.
        HttpClientBuilder builder = HttpClientBuilder.create();
        HttpClient builderClient = builder.build();

        // 4-6. Factory shortcuts on HttpClients.
        HttpClient defaultClient = HttpClients.createDefault();
        HttpClient systemClient = HttpClients.createSystem();
        HttpClient minimalClient = HttpClients.createMinimal();
    }
}

 

二、设置头消息

1、方式一:一个一个设置

package com.zb.book.httpclient;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Sends a GET request whose headers are set one at a time on the request
 * object, then prints the response body when the status code is 200.
 */
public class SetHeaderOne {
    public static void main(String[] args) throws IOException {
        // Build the client.
        HttpClient httpClient = HttpClients.custom().build();
        // Create the GET request.
        HttpGet httpGet = new HttpGet("http://www.baidu.com/");
        // Configure the request headers individually.
        // Media ranges in Accept are comma separated ("image/apng,*/*").
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpGet.setHeader("Accept-Encoding","gzip,deflate");
        httpGet.setHeader("Accept-Language","zh-CN,zh;q=0.9");
        httpGet.setHeader("Cache-Control","max-age=0");
        // Host carries only the host name (and optional port), never a full URL.
        httpGet.setHeader("Host","www.baidu.com");
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36");
        // Execute the request.
        HttpResponse response = httpClient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        try {
            // Only print the body for a successful response.
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                // Decode the body as text; UTF-8 is the fallback charset.
                String content = EntityUtils.toString(entity, "UTF-8");
                System.out.println(content);
            }
        } finally {
            // Always drain and close the content stream so the underlying
            // connection is released, including for non-200 responses.
            EntityUtils.consume(entity);
        }
    }
}

 

2、方式二:封装到list集合统一设置

package com.zb.book.httpclient;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Sends a GET request using a client whose default headers are supplied as a
 * list, then prints the response body when the status code is 200.
 */
public class SetHeaderList {
    public static void main(String[] args) throws IOException {
        // Collect the headers that every request from this client should carry.
        List<Header> headerList = new ArrayList<>();
        // Media ranges in Accept are comma separated ("image/apng,*/*").
        headerList.add(new BasicHeader(HttpHeaders.ACCEPT,"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"));
        headerList.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING,"gzip,deflate"));
        headerList.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE,"zh-CN,zh;q=0.9"));
        headerList.add(new BasicHeader(HttpHeaders.CACHE_CONTROL,"max-age=0"));
        // Host carries only the host name (and optional port), never a full URL.
        headerList.add(new BasicHeader(HttpHeaders.HOST,"www.baidu.com"));
        headerList.add(new BasicHeader(HttpHeaders.USER_AGENT,"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"));
        // Build the client with the default headers attached.
        HttpClient httpClient = HttpClients.custom().setDefaultHeaders(headerList).build();
        // Create the GET request.
        HttpGet httpGet = new HttpGet("http://www.baidu.com/");
        // Execute the request.
        HttpResponse response = httpClient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        try {
            // Only print the body for a successful response.
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                // Decode the body as text; UTF-8 is the fallback charset.
                String content = EntityUtils.toString(entity, "UTF-8");
                System.out.println(content);
            }
        } finally {
            // Always drain and close the content stream so the underlying
            // connection is released, including for non-200 responses.
            EntityUtils.consume(entity);
        }
    }
}

 

3、其他方式

非常简单,可参考前两种方式实现;

 

三、POST提交表单

package com.zb.book.httpclient;

import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.message.BasicNameValuePair;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

//Post提交表单
/**
 * Shows how to prepare a POST request that carries URL-encoded form fields.
 * The request is only assembled here; it is not executed.
 */
public class PostSubmitForm {
    public static void main(String[] args) throws UnsupportedEncodingException {
        // Form fields to submit, as name/value pairs.
        List<NameValuePair> formFields = new ArrayList<>();
        formFields.add(new BasicNameValuePair("param1","value1"));
        formFields.add(new BasicNameValuePair("param2","value2"));

        // Encode the fields as a form entity using UTF-8.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(formFields, "UTF-8");

        // Attach the form entity to the POST request.
        HttpPost postRequest = new HttpPost("http://xxx.xxx.xxx");
        postRequest.setEntity(formEntity);
        // Executing postRequest with an HttpClient submits the form.
    }
}

 

四、超时设置

1、概述

使用HttpClient可设置三种超时时间:ConnectionRequestTimeout(从连接池获取连接的超时时间)、ConnectTimeout(建立连接超时时间)、SocketTimeout(获取数据超时时间)。配置这三种超时时间,需要用到RequestConfig类中的静态方法custom(),该方法返回内部类Builder(配置器)的实例,其功能是配置相关请求的字段,还可以设置代理(proxy)、Cookie规范(cookieSpec)、是否允许HTTP相关认证等;

 

2、代码演示

package com.zb.book.httpclient;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

//设置超时时间
/**
 * Shows how to configure the three HttpClient timeouts through
 * {@link RequestConfig} and install them as the client-wide default.
 */
public class SetTimeout {
    /** Single timeout value, in milliseconds, applied to all three settings (10 seconds). */
    private static final int TIMEOUT_MILLIS = 10000;

    public static void main(String[] args) {
        // Build the request configuration; every timeout is 10 seconds.
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(TIMEOUT_MILLIS)            // waiting for response data
                .setConnectTimeout(TIMEOUT_MILLIS)           // establishing the connection
                .setConnectionRequestTimeout(TIMEOUT_MILLIS) // obtaining a connection from the connection manager
                .build();
        // Install the configuration as the default for every request of this client.
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
        // Requests can now be executed as usual with httpClient.
        // The same configuration can instead be applied to a single request
        // by calling setConfig(requestConfig) on the request object.
    }
}

 

五、代理服务器的设置

package com.zb.book.httpclient;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

//设置代理服务器
public class SetProxy {
    public static void main(String[] args) {
        //创建RequestConfig配置,全部设置为10秒
        RequestConfig requestConfig = RequestConfig.custom()
                .setProxy(new HttpHost("171.221.239.11",808,null))
                .build();
        //配置到httpclient
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
        //后面进行正常的请求及相关处理即可
        //另外可用请求方法设置配置,其他写法当做适当改变
        //httpGet.setConfig(requestConfig);
    }
}

 

六、文件下载

1、概述

下载HTML、图片、PDF和压缩包等文件时,一种方法是使用HttpEntity类将响应实体转化为字节数组,再利用输出流的方式写入指定文件。另一种方法是使用HttpEntity类中的writeTo(OutputStream)方法,直接将响应实体写入指定的输出流中,这种方法简单且常用,代码演示如下。

 

2、代码演示

package com.zb.book.httpclient;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.FileOutputStream;
import java.io.IOException;

//下载文件
/**
 * Downloads a single resource over HTTP and writes the response body to a
 * local file using {@link HttpEntity#writeTo(java.io.OutputStream)}.
 */
public class DownloadFile {
    public static void main(String[] args) throws IOException {
        // Request for the resource to download.
        HttpGet httpGet = new HttpGet("https://www.baidu.com/img/PCtm_d9c8750bed0b3c7d089fa7d55720d6cf.png");
        // try-with-resources closes the response and the client even when
        // the request or the file write fails.
        try (CloseableHttpClient httpClient = HttpClients.custom().build();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                // Nothing to write; fail explicitly instead of with a NullPointerException.
                throw new IOException("Response has no body: " + response.getStatusLine());
            }
            // The file is opened only after a response is available, and is
            // closed (flushing the data) as soon as the body has been written.
            try (FileOutputStream fileOut = new FileOutputStream("C:\\Users\\ZiBo\\Desktop\\1.png")) {
                httpEntity.writeTo(fileOut);
            } finally {
                // Make sure the entity content is fully consumed and its stream closed.
                EntityUtils.consume(httpEntity);
            }
        }
    }
}

 

七、HTTPS请求认证

1、概述

与jsoup类似,具体做法见代码演示;

 

2、代码演示

SSLClient类:

package com.httpclient.ssl;

import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.Arrays;
import javax.net.ssl.SSLContext;
import javax.net.ssl.X509TrustManager;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
/**
 * Factory for an {@link HttpClient} whose HTTPS connections accept any
 * certificate and skip host name verification.
 *
 * <p>WARNING: the trust manager below accepts every certificate chain and
 * {@link NoopHostnameVerifier} turns host name verification off, so
 * connections made with this client are open to man-in-the-middle attacks.
 * Use it only against hosts where that risk is acceptable.
 */
public class SSLClient {
	/**
	 * Builds an HttpClient backed by a pooled connection manager that uses a
	 * trust-all SSL context for "https" and plain sockets for "http".
	 *
	 * @param SSLProtocolVersion protocol name handed to
	 *        {@link SSLContext#getInstance(String)}, e.g. SSL, SSLv3, TLS,
	 *        TLSv1, TLSv1.1, TLSv1.2
	 * @return the configured client
	 * @throws IllegalStateException if the SSL context cannot be created or
	 *         initialised for the given protocol
	 */
	public HttpClient initSSLClient(String SSLProtocolVersion){
		PoolingHttpClientConnectionManager pcm;
		try {
			// Trust manager that accepts every certificate.
			X509TrustManager xtm = new SSL509TrustManager();
			// Create the SSLContext and initialise it with that trust manager.
			SSLContext context = SSLContext.getInstance(SSLProtocolVersion);
			context.init(null, new X509TrustManager[]{xtm}, null);
			// Socket factory built from the SSLContext;
			// NoopHostnameVerifier.INSTANCE disables host name verification.
			SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(context, NoopHostnameVerifier.INSTANCE);
			Registry<ConnectionSocketFactory> sfr = RegistryBuilder.<ConnectionSocketFactory>create()
					.register("http", PlainConnectionSocketFactory.INSTANCE)
					.register("https", sslConnectionSocketFactory).build();
			// Connection pool that uses the registry above.
			pcm = new PoolingHttpClientConnectionManager(sfr);
		} catch (NoSuchAlgorithmException | KeyManagementException e) {
			// Fail loudly: continuing with a null connection manager would
			// silently produce a client without the requested SSL setup.
			throw new IllegalStateException(
					"Unable to initialise SSL context for protocol: " + SSLProtocolVersion, e);
		}
		// Client-wide request configuration: cookie spec, HTTP authentication, timeouts.
		RequestConfig defaultConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT)
				.setExpectContinueEnabled(true)
				.setTargetPreferredAuthSchemes(Arrays.asList(AuthSchemes.NTLM, AuthSchemes.DIGEST))
				.setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC))
				.setConnectionRequestTimeout(30*1000)
				.setConnectTimeout(30*1000)
				.setSocketTimeout(30*1000)
				.build();
		// Assemble the client.
		return HttpClients.custom()
				.setConnectionManager(pcm)
				.setDefaultRequestConfig(defaultConfig)
				.build();
	}

	/** {@link X509TrustManager} that performs no checks at all. */
	private static class SSL509TrustManager implements X509TrustManager {
		@Override
		public void checkClientTrusted(X509Certificate[] x509Certificates, String s) {
			// Intentionally empty: any client certificate is accepted.
		}

		@Override
		public void checkServerTrusted(X509Certificate[] x509Certificates, String s)  {
			// Intentionally empty: any server certificate is accepted.
		}

		@Override
		public X509Certificate[] getAcceptedIssuers() {
			return new X509Certificate[0];
		}
	}
}

test类:

package com.httpclient.ssl;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.util.EntityUtils;
/**
 * Fetches an HTTPS page with the client produced by {@code SSLClient} and
 * prints the body when the response status is 200.
 */
public class Test {
	public static void main(String[] args) throws ParseException, IOException {
		String url = "https://cn.kompass.com/a/hospitality-tourism-hotel-and-catering-industries/78/";
		SSLClient sslClient = new SSLClient();
		HttpClient httpClientSSL = sslClient.initSSLClient("TLS");
		HttpGet httpGet = new HttpGet(url);
		// Let an IOException from execute propagate: swallowing it would leave
		// the response null and fail below with a NullPointerException.
		HttpResponse httpResponse = httpClientSSL.execute(httpGet);
		try {
			// Status code 200 means the request succeeded.
			if (httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
				// Read the body as text; UTF-8 is the fallback charset.
				String entity = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
				System.out.println(entity);
			}
		} finally {
			// Consume the entity on every path so its content stream is closed.
			EntityUtils.consume(httpResponse.getEntity());
		}
	}
}

 

八、请求重试

1、概述

使用HttpClient请求URL时,有时会出现请求异常的情况。针对一些非致命的异常,可以通过请求重试解决。HttpClient提供了默认重试策略DefaultHttpRequestRetryHandler。DefaultHttpRequestRetryHandler类实现了HttpRequestRetryHandler接口,重写了retryRequest()方法。由源码可以发现DefaultHttpRequestRetryHandler类定义的默认重试次数为3次;对于幂等方法(如GET和HEAD),请求失败后可以重试。另外,针对4种异常不进行重试,这四种异常分别是InterruptedIOException(线程中断异常)、UnknownHostException(未知的Host异常)、ConnectException(连接异常,如连接拒绝异常)和SSLException(HTTPS请求认证异常)。

 

2、代码演示

package com.zb.book.httpclient;

import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;

//设置请求重试
/**
 * Shows two ways of installing a retry handler on an HttpClient builder.
 * The clients are built but not used.
 */
public class SetRequestRetry {
    public static void main(String[] args) {
        // Variant 1: handler created with its no-argument constructor
        // (described by the original author as retrying 3 times).
        DefaultHttpRequestRetryHandler defaultHandler = new DefaultHttpRequestRetryHandler();
        HttpClients.custom().setRetryHandler(defaultHandler).build();

        // Variant 2: handler created with a retry count of 5 and the
        // second constructor argument set to true.
        DefaultHttpRequestRetryHandler fiveTimesHandler = new DefaultHttpRequestRetryHandler(5,true);
        HttpClients.custom().setRetryHandler(fiveTimesHandler).build();
    }
}

 

3、补充说明

值得注意的是,在进行数据爬取时经常遇到的两种超时时间: ConnectTimeout(建立连接的超时时间)和SocketTimeout(获取数据的超时时间),这两种超时时间对应的异常( ConnectTimeoutException与SocketTimeoutException )都继承自InterruptedIOException类,即属于线程中断异常,不会进行重试。

 

九、多线程执行请求演示代码摘录

(可参考原始学习笔记的连接池)

package com.httpclient.thread;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.Consts;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.ConnectionConfig;
import org.apache.http.config.SocketConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
/**
 * Downloads several pages concurrently with one shared, pooled HttpClient and
 * writes each page to its own file under the {@code file/} directory.
 */
public class Test {
	public static void main(String[] args) throws FileNotFoundException {
		// Connection-level settings: charset and handling of undecodable input.
		ConnectionConfig connectionConfig = ConnectionConfig.custom()
				.setMalformedInputAction(CodingErrorAction.IGNORE)
				.setUnmappableInputAction(CodingErrorAction.IGNORE)
				.setCharset(Consts.UTF_8)
				.build();
		// Socket-level settings.
		SocketConfig socketConfig = SocketConfig.custom()
				.setTcpNoDelay(true)
				.build();
		// Pooled connection manager shared by all worker threads.
		PoolingHttpClientConnectionManager pcm = new PoolingHttpClientConnectionManager();
		// Maximum number of connections in the whole pool.
		pcm.setMaxTotal(100);
		// Default maximum number of connections per route.
		pcm.setDefaultMaxPerRoute(10);
		pcm.setDefaultConnectionConfig(connectionConfig);
		pcm.setDefaultSocketConfig(socketConfig);
		// Client-wide request configuration: cookie spec, HTTP authentication, timeouts.
		RequestConfig defaultConfig = RequestConfig.custom()
				.setCookieSpec(CookieSpecs.STANDARD_STRICT)
				.setExpectContinueEnabled(true)
				.setTargetPreferredAuthSchemes(Arrays
						.asList(AuthSchemes.NTLM, AuthSchemes.DIGEST))
				.setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC))
				.setConnectionRequestTimeout(30*1000)
				.setConnectTimeout(30*1000)
				.setSocketTimeout(30*1000)
				.build();
		CloseableHttpClient httpClient = HttpClients.custom()
				.setConnectionManager(pcm)
				.setDefaultRequestConfig(defaultConfig)
				.build();
		// URLs to download.
		String[] urlArr = {
				"https://hbr.org/podcasts",
				"https://hbr.org/magazine",
				"https://hbr.org/most-popular",
				"https://hbr.org/big-ideas",
				"https://hbr.org/reading-lists"
		};
		// Fixed-size pool of worker threads.
		ExecutorService exec = Executors.newFixedThreadPool(3);
		try {
			for (String url : urlArr) {
				// File name is the part of the URL after "org/".
				String filename = url.split("org/")[1];
				HttpGet httpget = new HttpGet(url);
				// Open the output file; the "file" directory must already exist.
				// The worker takes ownership of the stream and closes it.
				OutputStream out = new FileOutputStream("file/" + filename);
				exec.execute(new DownHtmlFileThread(httpClient, httpget, out));
			}
		} finally {
			// Stop accepting new tasks even if submitting failed part-way;
			// tasks already submitted still run to completion.
			exec.shutdown();
		}
	}

	/**
	 * Task that executes one GET request and writes the response body to the
	 * given stream, closing the stream when done.
	 */
	static class DownHtmlFileThread extends Thread {
		private final CloseableHttpClient httpClient;
		private final HttpContext context;
		private final HttpGet httpget;
		private final OutputStream out;

		/**
		 * @param httpClient shared client used to execute the request
		 * @param httpget    request to execute
		 * @param out        destination for the page content; closed by {@link #run()}
		 */
		public DownHtmlFileThread(CloseableHttpClient httpClient, 
				HttpGet httpget, OutputStream out) {
			this.httpClient = httpClient;
			// Each task uses its own execution context.
			this.context = HttpClientContext.create();
			this.httpget = httpget;
			this.out = out;
		}

		@Override
		public void run() {
			System.out.println(Thread.currentThread().getName() + 
					"线程请求的URL为:" + httpget.getURI());
			// try-with-resources closes both the response and the output
			// stream on every path, including when the request or write fails.
			try (OutputStream sink = out;
					CloseableHttpResponse response = httpClient.execute(httpget, context)) {
				// Decode the body ("gbk" is the fallback charset passed to
				// EntityUtils.toString) and write it with an explicit charset
				// rather than the platform default.
				String html = EntityUtils.toString(response.getEntity(), "gbk");
				sink.write(html.getBytes(Consts.UTF_8));
				// Ensure the entity content is fully consumed.
				EntityUtils.consume(response.getEntity());
			} catch (ClientProtocolException ex) {
				ex.printStackTrace(); // protocol error
			} catch (IOException ex) {
				ex.printStackTrace(); // I/O error
			}
		}
	}
}

 

 

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值