httpclient网络爬虫常用方法

创建httpclient 工具类

添加 httpclient 依赖,使用的版本是 4.5.8;使用 httpclient 进行爬虫,并创建一个工具类来获取网页文本。

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.8</version>
</dependency>
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpRequest;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import javax.net.ssl.SSLException;
import javax.net.ssl.SSLHandshakeException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.UnknownHostException;

/**
 * Fetches web pages as text through a single pooled Apache HttpClient.
 *
 * <p>The connection pool, the request timeouts and the retry policy are set up once in the
 * constructor, and the resulting client is reused by every {@link #getHtml(String)} call.
 *
 * <p>Created 2019-06-12 13:30.
 *
 * @author felix
 */
@Slf4j
@Component
public class HttpUtils {
    /** Timeout in milliseconds applied to pool lease, connect and socket read alike. */
    private static final int TIMEOUT_MILLIS = 6000;

    /** Execution count at which the retry policy gives up. */
    private static final int MAX_EXECUTION_COUNT = 10;

    /** Charset handed to {@code EntityUtils.toString} as the fallback for decoding bodies. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36";

    private final PoolingHttpClientConnectionManager pm;

    /** Built once and shared by all requests instead of being rebuilt on every call. */
    private final CloseableHttpClient client;

    /**
     * Creates the connection pool (100 connections per route, 2000 in total) and the client
     * that uses it, with the timeouts and retry policy of this class.
     */
    public HttpUtils() {
        pm = new PoolingHttpClientConnectionManager();
        pm.setDefaultMaxPerRoute(100);
        pm.setMaxTotal(2000);

        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();

        client = HttpClients.custom()
                .setConnectionManager(pm)
                .setDefaultRequestConfig(requestConfig)
                .setRetryHandler(HttpUtils::shouldRetry)
                .build();
    }

    /**
     * Issues a GET request for {@code url} and returns the response body as text.
     *
     * <p>The response is always closed before this method returns, whether it succeeds or
     * throws, so the underlying connection is handed back to the pool.
     *
     * @param url the address to fetch
     * @return the response body when the status is 200 and a body is present; otherwise an
     *         empty string
     * @throws IOException if the request fails after the retry policy gives up, or the body
     *                     cannot be read
     */
    public String getHtml(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // Present a browser-like User-Agent.
        httpGet.setHeader("User-Agent", USER_AGENT);

        try (CloseableHttpResponse response = client.execute(httpGet)) {
            int status = response.getStatusLine().getStatusCode();
            if (status != 200) {
                log.warn("Unexpected HTTP status {} for {}", status, url);
                return "";
            }
            if (response.getEntity() == null) {
                return "";
            }
            return EntityUtils.toString(response.getEntity(), DEFAULT_CHARSET);
        }
    }

    /**
     * Retry policy: decides whether a request that failed with {@code exception} is executed
     * again.
     *
     * @param exception      the failure of the last attempt
     * @param executionCount the execution count reported by HttpClient
     * @param context        the execution context of the failed request
     * @return {@code true} to retry, {@code false} to give up
     */
    private static boolean shouldRetry(IOException exception, int executionCount, HttpContext context) {
        log.info("重试次数:{}", executionCount);
        if (executionCount >= MAX_EXECUTION_COUNT) {
            log.info("重试大于10次");
            return false;
        }
        // SSL failures are not retried. SSLHandshakeException is a subclass of SSLException,
        // so this one check covers both.
        if (exception instanceof SSLException) {
            return false;
        }
        // Retry when the server dropped the connection, on timeouts and on unknown hosts.
        // ConnectTimeoutException is a subclass of InterruptedIOException, so connect
        // timeouts are retried here as well; the separate check the original placed after
        // this one could never be reached and has been removed.
        if (exception instanceof NoHttpResponseException
                || exception instanceof InterruptedIOException
                || exception instanceof UnknownHostException) {
            return true;
        }
        // Any other failure: retry only requests that carry no body.
        HttpRequest request = HttpClientContext.adapt(context).getRequest();
        return !(request instanceof HttpEntityEnclosingRequest);
    }
}

使用方式

import com.felix.pro.demo.model.Province;
import com.felix.pro.demo.repository.ProvinceRepository;
import com.felix.pro.demo.util.HttpUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Crawls the province index page and stores one {@code Province} per table cell.
 *
 * <p>Created 2019-06-12 14:06.
 *
 * @author felix
 */
@Slf4j
@Component
public class CrawlProvince {

    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private ProvinceRepository provinceRepository;

    /**
     * Downloads the index page, reads every cell matched by
     * {@code .provincetable .provincetr td}, and saves a {@code Province} for each one.
     *
     * <p>The province code is the last run of digits in the cell's link {@code href}; the
     * same number is used as the sort value. The province name is the cell text. Cells that
     * yield neither a code nor a name are skipped instead of being saved as blank records.
     *
     * @throws IOException          if the page cannot be fetched
     * @throws InterruptedException never thrown by this implementation; kept in the
     *                              signature so existing callers keep compiling
     */
    public void province() throws IOException, InterruptedException {
        String html = httpUtils.getHtml("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html");
        Document document = Jsoup.parse(html);
        Elements cells = document.select(".provincetable .provincetr td");
        List<Province> list = new ArrayList<>();
        for (Element cell : cells) {
            Province province = new Province();
            boolean hasCode = false;
            for (Element anchor : cell.select("a")) {
                String code = lastDigitRun(anchor.attr("href"));
                if (code == null) {
                    // No digits in this href: nothing to parse.
                    continue;
                }
                log.info("ProvinceCode:{}", code);
                province.setProvinceCode(code);
                province.setSort(Integer.parseInt(code));
                hasCode = true;
            }
            String provinceName = cell.text();
            if (!hasCode && provinceName.isEmpty()) {
                // Padding cell without link or text: do not persist an empty record.
                continue;
            }
            log.info("provinceName:{}", provinceName);
            province.setProvinceName(provinceName);
            list.add(province);
        }

        for (Province province : list) {
            provinceRepository.save(province);
        }
        log.info("完成共: {} 条数据", list.size());
    }

    /**
     * Returns the last non-empty run of digits in {@code href}, or {@code null} if it
     * contains none.
     *
     * <p>Splitting on non-digits yields empty tokens whenever the text starts with a
     * non-digit or has consecutive non-digits; those are ignored here because passing one to
     * {@code Integer.parseInt} throws {@code NumberFormatException}.
     */
    private static String lastDigitRun(String href) {
        String code = null;
        for (String token : href.split("\\D")) {
            if (!token.isEmpty()) {
                code = token;
            }
        }
        return code;
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值