实战:纯手工打造Java爬虫——基于JDK11原生HttpClient(三)

5 篇文章 0 订阅
5 篇文章 0 订阅

目录

家园(Homestead)

请求配置封装

 请求工具封装

响应封装


前两篇已经将基本环境和基础工具维护好了,本章开始要来点干货了

家园(Homestead)

爬虫的本质还是“请求”,我想在座的应该都同意,但是网络常见请求无非就那几种,Socket请求、Http请求、FTP请求……,我们要做web页面的爬虫,那肯定是需要一个Http请求工具了。

最前面我们提到Apache的HttpClient工具,其实网络上这个工具的使用教程应该是最多的,JDK实际上一直也有相关的Http请求工具,但是一直都被诟病,直到……直到JDK11的出现,也预示着JDK的Http请求工具(HttpClient)基本接近完善(方便好用),所以才有了我们这一篇文章(要不然我也用Apache了……)。

要实现JDK的HttpClient封装,我们得明确封装目标:

1.请求配置封装

2.请求工具封装

3.响应封装

其实在诸多Http请求工具中,无非就这三点封装罢了,所以这里也不装了,直接上代码。

请求配置封装

封装请求配置的目的是尽可能降低使用时的复杂度,因此我们要明确哪些是必要参数、哪些是非必要参数、哪些参数需要有默认值。有时候为了满足多种情况,会出现重载、重写的情况;甚至有些功能我们暂时用不到,但是以后可能用得到。所以要封装就不要怕麻烦,封装得越仔细越好。为了统一配置的初始化,我们引入了HttpConfig;为了更进一步简化工具的初始化,HttpConfig采用构造器(Builder)模式来构建Http请求工具。

HttpConfig.java

package com.vtarj.pythagoras.explore;

import javax.net.ssl.SSLContext;
import java.net.*;
import java.net.http.HttpClient;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.StringJoiner;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;

/**
 * Fluent configuration holder used to build an {@code HttpExplore}.
 *
 * <p>Every setter returns {@code this} so calls can be chained, and {@link #build()} turns the
 * accumulated settings into an {@code HttpExplore}. The built {@code HttpExplore} keeps a
 * reference to this object and reads it when a request is executed, so later changes to the
 * configuration are visible to it.
 *
 * <p>This class is not thread-safe.
 *
 * @author Vtarj
 * @since 2022/4/1
 */
public class HttpConfig {

    /** Default value of the {@code Content-Type} header. */
    private static final String DEFAULT_CONTENT_TYPE = "application/json";
    /** Request methods accepted by {@link #setRequestMethod(String)}. */
    private static final String[] SUPPORTED_METHODS = {"GET", "POST", "PUT", "DELETE"};
    /** Size of the thread pool created when no executor has been supplied. */
    private static final int DEFAULT_POOL_SIZE = 5;

    /** HTTP protocol version; defaults to HTTP/2. */
    private HttpClient.Version version;
    /** Redirect policy; defaults to {@code NORMAL}. */
    private HttpClient.Redirect redirect;
    /** Connect timeout; defaults to two minutes. */
    private Duration connectTimeout;
    /** Executor handed to the client; created lazily with 5 threads when not set. */
    private Executor executor;
    /** Authentication information (optional). */
    private Authenticator authenticator;
    /** Proxy selection (optional). */
    private ProxySelector proxySelector;
    /** Cookie handling (optional). */
    private CookieHandler cookieHandler;
    /** SSL context (optional). */
    private SSLContext sslContext;
    /** Default priority for HTTP/2 requests sent by the client, in the range 1..256. */
    private int priority;

    /** Request headers; never {@code null}. */
    private final Map<String, String> headerMap = new HashMap<>();
    /** Request parameters, or {@code null} when none have been set. */
    private Map<String, Object> requestParams;
    /** Request method, always stored in upper case. */
    private String requestMethod;
    /** Effective request address; for GET this includes the query produced by {@link #build()}. */
    private URI requestURI;
    /** Address exactly as configured through {@link #setRequestURI(String)}, without generated query. */
    private URI baseRequestURI;

    /** Charset used to encode the request body. */
    private Charset reqCode = StandardCharsets.UTF_8;
    /** Charset used to decode the response body. */
    private Charset resCode = StandardCharsets.UTF_8;
    /**
     * Configuration lock. While {@code true} an already created shared client is reused by
     * {@code HttpExplore}; when {@code false} the shared client is rebuilt from this configuration.
     */
    private boolean locked = true;

    /**
     * Creates a configuration with the default settings: HTTP/2, normal redirects, a two minute
     * connect timeout, JSON content type, a desktop browser user agent, method GET and priority 1.
     */
    public HttpConfig() {
        version = HttpClient.Version.HTTP_2;
        redirect = HttpClient.Redirect.NORMAL;
        connectTimeout = Duration.ofMinutes(2);
        // Written directly to the map so that no overridable method runs during construction.
        headerMap.put("Content-Type", DEFAULT_CONTENT_TYPE);
        headerMap.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55");
        requestMethod = "GET";
        priority = 1;
    }

    /** @return the configured HTTP version */
    public HttpClient.Version getVersion() {
        return version;
    }

    /**
     * @param version HTTP version to use
     * @return this configuration, for chaining
     */
    public HttpConfig setVersion(HttpClient.Version version) {
        this.version = version;
        return this;
    }

    /** @return the configured redirect policy */
    public HttpClient.Redirect getRedirect() {
        return redirect;
    }

    /**
     * @param redirect redirect policy to use
     * @return this configuration, for chaining
     */
    public HttpConfig setRedirect(HttpClient.Redirect redirect) {
        this.redirect = redirect;
        return this;
    }

    /** @return the configured connect timeout */
    public Duration getConnectTimeout() {
        return connectTimeout;
    }

    /**
     * @param connectTimeout connect timeout to use
     * @return this configuration, for chaining
     */
    public HttpConfig setConnectTimeout(Duration connectTimeout) {
        this.connectTimeout = connectTimeout;
        return this;
    }

    /**
     * Returns the executor, creating the default one on first use.
     *
     * <p>The default is a fixed pool of 5 daemon threads. Daemon threads are used because the
     * pool is never shut down by this class; non-daemon workers would keep the JVM alive after
     * the application has finished.
     *
     * @return the executor, never {@code null}
     */
    public Executor getExecutor() {
        if (executor == null) {
            executor = Executors.newFixedThreadPool(DEFAULT_POOL_SIZE, task -> {
                Thread worker = new Thread(task, "http-explore-worker");
                worker.setDaemon(true);
                return worker;
            });
        }
        return executor;
    }

    /**
     * @param executor executor to hand to the client
     * @return this configuration, for chaining
     */
    public HttpConfig setExecutor(Executor executor) {
        this.executor = executor;
        return this;
    }

    /** @return the authenticator, or {@code null} when none is set */
    public Authenticator getAuthenticator() {
        return authenticator;
    }

    /**
     * @param authenticator authenticator to use
     * @return this configuration, for chaining
     */
    public HttpConfig setAuthenticator(Authenticator authenticator) {
        this.authenticator = authenticator;
        return this;
    }

    /** @return the proxy selector, or {@code null} when none is set */
    public ProxySelector getProxySelector() {
        return proxySelector;
    }

    /**
     * @param proxySelector proxy selector to use
     * @return this configuration, for chaining
     */
    public HttpConfig setProxySelector(ProxySelector proxySelector) {
        this.proxySelector = proxySelector;
        return this;
    }

    /** @return the cookie handler, or {@code null} when none is set */
    public CookieHandler getCookieHandler() {
        return cookieHandler;
    }

    /**
     * @param cookieHandler cookie handler to use
     * @return this configuration, for chaining
     */
    public HttpConfig setCookieHandler(CookieHandler cookieHandler) {
        this.cookieHandler = cookieHandler;
        return this;
    }

    /** @return the SSL context, or {@code null} when none is set */
    public SSLContext getSslContext() {
        return sslContext;
    }

    /**
     * @param sslContext SSL context to use
     * @return this configuration, for chaining
     */
    public HttpConfig setSslContext(SSLContext sslContext) {
        this.sslContext = sslContext;
        return this;
    }

    /** @return the configured request priority */
    public int getPriority() {
        return priority;
    }

    /**
     * Sets the default HTTP/2 request priority.
     *
     * @param priority value in the range 1..256
     * @return this configuration, for chaining
     * @throws IllegalArgumentException if {@code priority} is outside 1..256
     */
    public HttpConfig setPriority(int priority) {
        if (priority < 1 || priority > 256) {
            throw new IllegalArgumentException("您输入的优先级不合法,优先级范围[1~256]");
        }
        this.priority = priority;
        return this;
    }

    /** @return the live header map, never {@code null} */
    public Map<String, String> getHeaderMap() {
        return headerMap;
    }

    /**
     * Sets a single header, replacing any previous value for the same name.
     *
     * @param key   header name
     * @param value header value
     * @return this configuration, for chaining
     */
    public HttpConfig setHeader(String key, String value) {
        headerMap.put(key, value);
        return this;
    }

    /**
     * Merges the given headers into the existing ones. Existing headers that are not present in
     * the argument are kept, so defaults are not lost.
     *
     * @param headerMap headers to merge; {@code null} is treated as empty
     * @return this configuration, for chaining
     */
    public HttpConfig setHeaderMap(Map<String, String> headerMap) {
        if (headerMap != null) {
            this.headerMap.putAll(headerMap);
        }
        return this;
    }

    /** @return the request parameters, or {@code null} when none have been set */
    public Map<String, Object> getRequestParams() {
        return requestParams;
    }

    /**
     * Adds or replaces a single request parameter.
     *
     * @param key   parameter name
     * @param value parameter value
     * @return this configuration, for chaining
     */
    public HttpConfig setRequestParam(String key, String value) {
        if (requestParams == null) {
            requestParams = new HashMap<>();
        }
        requestParams.put(key, value);
        return this;
    }

    /**
     * Replaces all request parameters.
     *
     * <p>The argument is copied (keeping its iteration order) so that later calls to
     * {@link #setRequestParam(String, String)} neither modify the caller's map nor fail when
     * the caller passed an unmodifiable map.
     *
     * @param requestParams new parameters; {@code null} clears them
     * @return this configuration, for chaining
     */
    public HttpConfig setRequestParams(Map<String, Object> requestParams) {
        this.requestParams = requestParams == null
                ? null
                : new java.util.LinkedHashMap<>(requestParams);
        return this;
    }

    /** @return the request method in upper case */
    public String getRequestMethod() {
        return requestMethod;
    }

    /**
     * Sets the request method.
     *
     * @param requestMethod one of "GET", "POST", "PUT", "DELETE" (case-insensitive)
     * @return this configuration, for chaining
     * @throws IllegalArgumentException if the method is {@code null} or not supported
     */
    public HttpConfig setRequestMethod(String requestMethod) {
        String normalized = requestMethod == null ? null : requestMethod.toUpperCase();
        if (!Arrays.asList(SUPPORTED_METHODS).contains(normalized)) {
            throw new IllegalArgumentException("请求方法设置错误,不符合规范要求");
        }
        this.requestMethod = normalized;
        return this;
    }

    /** @return the effective request address, or {@code null} when none has been set */
    public URI getRequestURI() {
        return requestURI;
    }

    /**
     * Sets the request address. A missing {@code http://} or {@code https://} prefix is
     * completed with {@code http://}.
     *
     * @param requestURI address to request
     * @return this configuration, for chaining
     * @throws IllegalArgumentException if the address is not a valid URI
     */
    public HttpConfig setRequestURI(String requestURI) {
        URI parsed = formatURI(requestURI);
        this.baseRequestURI = parsed;
        this.requestURI = parsed;
        return this;
    }

    /**
     * Completes a missing scheme and parses the address.
     *
     * @param uri address to format
     * @return the parsed address
     * @throws IllegalArgumentException if the address is not a valid URI
     */
    private URI formatURI(String uri) {
        if (uri == null) {
            throw new NullPointerException("uri must not be null");
        }
        boolean hasScheme = uri.regionMatches(true, 0, "http://", 0, 7)
                || uri.regionMatches(true, 0, "https://", 0, 8);
        String complete = hasScheme ? uri : "http://" + uri;
        try {
            return new URI(complete);
        } catch (URISyntaxException e) {
            throw new IllegalArgumentException("Invalid request URI: " + complete, e);
        }
    }

    /** @return the charset used to encode the request body */
    public Charset getReqCode() {
        return reqCode;
    }

    /**
     * @param reqCode charset used to encode the request body
     * @return this configuration, for chaining
     */
    public HttpConfig setReqCode(Charset reqCode) {
        this.reqCode = reqCode;
        return this;
    }

    /** @return the charset used to decode the response body */
    public Charset getResCode() {
        return resCode;
    }

    /**
     * @param resCode charset used to decode the response body
     * @return this configuration, for chaining
     */
    public HttpConfig setResCode(Charset resCode) {
        this.resCode = resCode;
        return this;
    }

    /** @return whether the configuration lock is set */
    public boolean isLocked() {
        return locked;
    }

    /**
     * @param locked {@code false} to make the next {@code HttpExplore} rebuild the shared client
     * @return this configuration, for chaining
     */
    public HttpConfig setLocked(boolean locked) {
        this.locked = locked;
        return this;
    }

    /**
     * Builds an {@code HttpExplore} from this configuration.
     *
     * <p>For GET requests the parameters are appended to the configured address as a query
     * string. The effective address is always recomputed from the address given to
     * {@link #setRequestURI(String)}, so calling this method repeatedly does not duplicate the
     * query.
     *
     * @return a new {@code HttpExplore} bound to this configuration
     * @throws IllegalStateException if GET parameters are present but no address has been set
     */
    public HttpExplore build() {
        URI effective = baseRequestURI;
        String paramsStr = paramsToString();
        if (paramsStr != null && "GET".equals(requestMethod)) {
            if (baseRequestURI == null) {
                throw new IllegalStateException("Request URI must be set before build()");
            }
            String base = baseRequestURI.toString();
            String separator = base.contains("?") ? "&" : "?";
            effective = formatURI(base + separator + paramsStr);
        }
        requestURI = effective;
        return new HttpExplore(this);
    }

    /**
     * Joins the request parameters as {@code key=value} pairs separated by {@code &}.
     * A {@code null} value is rendered as an empty string.
     *
     * <p>NOTE(review): names and values are not URL-encoded here; callers must pass values that
     * are already safe for a query string or form body.
     *
     * @return the joined parameters, or {@code null} when there are none
     */
    protected String paramsToString() {
        if (requestParams == null || requestParams.isEmpty()) {
            return null;
        }
        StringJoiner joined = new StringJoiner("&");
        requestParams.forEach((k, v) -> joined.add(k + "=" + (v == null ? "" : v)));
        return joined.toString();
    }

}

 请求工具封装

配置有了,现在就开始封装工具,工具我们就封装几个常用的方案即可,如:GET请求、POST请求、PUT请求、DELETE请求等,请求结果我们以字符串形式获取和以文件形式获取两种即可(以后如果有需要,请自行添加)

HttpExplore.java

package com.vtarj.pythagoras.explore;

import java.io.File;
import java.io.IOException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Path;
import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

/**
 * HTTP explorer: executes the request described by an {@link HttpConfig} with the JDK
 * {@link HttpClient}.
 *
 * <p>A single {@link HttpClient} is shared by all instances. It is created by the first instance
 * and rebuilt whenever an instance is constructed with a configuration whose
 * {@link HttpConfig#isLocked()} is {@code false}; all instances then use the rebuilt client.
 *
 * @author Vtarj
 * @since 2022/4/1
 */
public class HttpExplore {

    /** Shared client; volatile so a client published by one thread is visible to the others. */
    private static volatile HttpClient client;
    private final HttpConfig config;

    /**
     * Binds this explorer to a configuration and makes sure the shared client exists.
     *
     * @param config configuration describing the client and the request
     */
    public HttpExplore(HttpConfig config) {
        this.config = config;
        if (client == null || !config.isLocked()) {
            synchronized (HttpExplore.class) {
                // Re-check under the lock so concurrent first users build the client only once.
                if (client == null || !config.isLocked()) {
                    client = createClient(config);
                }
            }
        }
    }

    /** Builds a new client from the client-level settings of the configuration. */
    private static HttpClient createClient(HttpConfig config) {
        HttpClient.Builder builder = HttpClient.newBuilder()
                .version(config.getVersion())
                .followRedirects(config.getRedirect())
                .connectTimeout(config.getConnectTimeout())
                .priority(config.getPriority());
        // Optional settings are applied only when they have been configured.
        if (config.getExecutor() != null) {
            builder.executor(config.getExecutor());
        }
        if (config.getAuthenticator() != null) {
            builder.authenticator(config.getAuthenticator());
        }
        if (config.getCookieHandler() != null) {
            builder.cookieHandler(config.getCookieHandler());
        }
        if (config.getProxySelector() != null) {
            builder.proxy(config.getProxySelector());
        }
        if (config.getSslContext() != null) {
            builder.sslContext(config.getSslContext());
        }
        return builder.build();
    }

    /**
     * Builds the request from the current state of the configuration. The connect timeout of the
     * configuration is also used as the request timeout.
     */
    private HttpRequest buildRequest() {
        HttpRequest.Builder builder = HttpRequest.newBuilder()
                .uri(config.getRequestURI())
                .timeout(config.getConnectTimeout())
                .version(config.getVersion())
                .method(config.getRequestMethod(), buildPublisher());
        // Added one by one: Builder.headers(String...) rejects an empty array.
        Map<String, String> headers = config.getHeaderMap();
        if (headers != null) {
            headers.forEach(builder::header);
        }
        return builder.build();
    }

    /**
     * Executes the request and returns the body as a string decoded with the configured
     * response charset. The result options contain the keys {@code "startime"} and
     * {@code "endtime"} with the instants taken immediately before and after the call.
     *
     * @return the response result
     * @throws IOException          if sending or receiving fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public HttpResult<String> executeToString() throws IOException, InterruptedException {
        HttpClient current = client;
        HttpRequest request = buildRequest();
        HashMap<String, Object> options = new HashMap<>();
        options.put("startime", Instant.now());
        HttpResponse<String> response =
                current.send(request, HttpResponse.BodyHandlers.ofString(config.getResCode()));
        options.put("endtime", Instant.now());
        return new HttpResult<>(response.statusCode(), response.body(), current, request, response, options);
    }

    /**
     * Executes the request and stores the body in a file. Missing parent directories are
     * created and an existing file is replaced. The result options contain the same
     * {@code "startime"} and {@code "endtime"} keys as {@link #executeToString()}.
     *
     * @param pathStr path of the file to write
     * @return the response result, with the target file as data
     * @throws IOException          if the directory cannot be created or the transfer fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public HttpResult<File> executeToFile(String pathStr) throws IOException, InterruptedException {
        File file = new File(pathStr);
        // A bare file name has no parent; resolve it against the working directory first.
        File parent = file.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Unable to create directory: " + parent);
        }
        HttpClient current = client;
        // Built once so that the request returned in the result is the one actually sent.
        HttpRequest request = buildRequest();
        HashMap<String, Object> options = new HashMap<>();
        options.put("startime", Instant.now());
        // TRUNCATE_EXISTING: the one-argument ofFile only uses CREATE and WRITE, which leaves
        // stale trailing bytes when the target already exists and is longer than the new body.
        HttpResponse<Path> response = current.send(request, HttpResponse.BodyHandlers.ofFile(
                file.toPath(),
                java.nio.file.StandardOpenOption.CREATE,
                java.nio.file.StandardOpenOption.WRITE,
                java.nio.file.StandardOpenOption.TRUNCATE_EXISTING));
        options.put("endtime", Instant.now());
        return new HttpResult<>(response.statusCode(), file, current, request, response, options);
    }

    /**
     * Converts the request parameters into a body publisher.
     *
     * <p>GET requests never carry a body: {@link HttpConfig#build()} already places their
     * parameters in the query string, so sending them again as a body would duplicate them.
     *
     * @return a publisher for the joined parameters, or an empty publisher
     */
    private HttpRequest.BodyPublisher buildPublisher() {
        if ("GET".equals(config.getRequestMethod())) {
            return HttpRequest.BodyPublishers.noBody();
        }
        String paramsStr = config.paramsToString();
        if (paramsStr == null) {
            return HttpRequest.BodyPublishers.noBody();
        }
        return HttpRequest.BodyPublishers.ofString(paramsStr, config.getReqCode());
    }

    /**
     * Entry point of the fluent API: creates a configuration with default settings.
     *
     * @return a new {@link HttpConfig}
     */
    public static HttpConfig builder() {
        return new HttpConfig();
    }
}

发现了吗?HttpExplore和HttpConfig紧密结合,谁也离不开谁,这样我们在使用工具时就会自然而然地先做配置,这就是构造者(Builder)模式的好处。

另外,JDK的HttpClient的优势在于,无论是GET、POST、PUT还是DELETE,请求方法其实只是一个参数,传参的方式也高度一致,是不是很方便?

此外,特别要注意的是字符编码问题,请求编码和响应编码一定要记得设置,否则乱码问题很头疼。 

响应封装

请求工具封装完成,我们一般就可以获取远程站点内容,但是如何解析内容也是一个问题(发现没有,我们至今除了使用JDK外,没有使用过其他任何第三方工具包),因此为了方便,我们这里使用Jsoup来解析响应的HTML内容。Jsoup可以将HTML内容转换成节点,我们直接可以获取节点内容,方便快捷、干净又卫生。

HttpResult.java

package com.vtarj.pythagoras.explore;

import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.HashMap;

/**
 * Normalized response result: the status code and converted data together with the client,
 * request and raw response that produced them, plus an optional map of custom entries.
 *
 * @param <T> type of the converted response data
 * @author Vtarj
 * @since 2022/4/2
 */
public class HttpResult<T> {
    private final int code;
    private final T data;
    private final HttpClient client;
    private final HttpRequest request;
    private final HttpResponse response;
    private final HashMap<String,Object> options;

    /**
     * Creates a result that carries custom options.
     *
     * @param code     status code
     * @param data     converted response data
     * @param client   client that executed the request
     * @param request  request that was executed
     * @param response raw response
     * @param options  custom entries; stored as given
     */
    public HttpResult(int code, T data, HttpClient client, HttpRequest request, HttpResponse response, HashMap<String, Object> options) {
        this.options = options;
        this.response = response;
        this.request = request;
        this.client = client;
        this.data = data;
        this.code = code;
    }

    /**
     * Creates a result without custom options; {@link #getOptions()} then returns {@code null}.
     *
     * @param code     status code
     * @param data     converted response data
     * @param client   client that executed the request
     * @param request  request that was executed
     * @param response raw response
     */
    public HttpResult(int code, T data, HttpClient client, HttpRequest request, HttpResponse response) {
        this(code, data, client, request, response, null);
    }

    /** @return the status code */
    public int getCode() {
        return this.code;
    }

    /** @return the converted response data */
    public T getData() {
        return this.data;
    }

    /** @return the client that executed the request */
    public HttpClient getClient() {
        return this.client;
    }

    /** @return the request that was executed */
    public HttpRequest getRequest() {
        return this.request;
    }

    /** @return the raw response */
    public HttpResponse getResponse() {
        return this.response;
    }

    /** @return the custom options, or {@code null} when none were supplied */
    public HashMap<String, Object> getOptions() {
        return this.options;
    }

    /** Renders code, data, request and response; client and options are not included. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("HttpResult{");
        text.append("code=").append(code);
        text.append(", data=").append(data);
        text.append(", request=").append(request);
        text.append(", response=").append(response);
        return text.append('}').toString();
    }
}

对于响应结果,我们尽可能保留原始内容(client、request、response),以防封装时考虑不周导致后面相关信息遗失;同时建议增加一个可自定义的options字段,用于存放自定义内容(例如请求的起止时间)。

未完待续~~~

上一篇:实战:纯手工打造Java爬虫——基于JDK11原生HttpClient(二)

下一篇:实战:纯手工打造Java爬虫——基于JDK11原生HttpClient(四)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值