创建httpclient 工具类
添加 HttpClient 依赖(使用的版本是 4.5.8),使用 HttpClient 进行爬虫,并创建一个工具类来获取网页文本。
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.8</version>
</dependency>
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpRequest;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLHandshakeException;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.UnknownHostException;
/**
 * Spring component that downloads the text of a web page with a pooled Apache HttpClient.
 *
 * <p>The client, its request timeouts and its retry policy are built once in the constructor
 * and reused by every call to {@link #getHtml(String)}. The client is never closed by this class.
 *
 * @author felix
 * @since 2019-06-12 13:30
 */
@Slf4j
@Component
public class HttpUtils {

    /** Upper bound on the execution count after which a failed request is no longer retried. */
    private static final int MAX_EXECUTIONS = 10;

    /** Timeout, in milliseconds, used for pool lease, connect and socket read. */
    private static final int TIMEOUT_MILLIS = 6000;

    /** Charset handed to {@code EntityUtils.toString} as the fallback for decoding the body. */
    private static final String FALLBACK_CHARSET = "gbk";

    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36";

    private final PoolingHttpClientConnectionManager pm;

    /** Shared client; building it once avoids re-creating the execution chain on every request. */
    private final CloseableHttpClient client;

    /**
     * Creates the connection pool (100 connections per route, 2000 in total) and the shared
     * client configured with the timeouts and retry policy of this class.
     */
    public HttpUtils() {
        pm = new PoolingHttpClientConnectionManager();
        pm.setDefaultMaxPerRoute(100);
        pm.setMaxTotal(2000);

        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();

        HttpRequestRetryHandler retryHandler = HttpUtils::shouldRetry;

        client = HttpClients.custom()
                .setConnectionManager(pm)
                .setDefaultRequestConfig(requestConfig)
                .setRetryHandler(retryHandler)
                .build();
    }

    /**
     * Issues a GET request for {@code url} and returns the response body as text.
     *
     * @param url address of the page to fetch
     * @return the decoded body when the status code is 200 and an entity is present;
     *         otherwise an empty string
     * @throws IOException if the request fails and the retry policy gives up
     */
    public String getHtml(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // Pretend to be a regular browser.
        httpGet.setHeader("User-Agent", USER_AGENT);

        // try-with-resources guarantees the response is closed on every path,
        // including when reading the entity throws.
        try (CloseableHttpResponse response = client.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), FALLBACK_CHARSET);
            }
            return "";
        }
    }

    /**
     * Retry policy: decides whether a request that failed with {@code exception} is executed again.
     *
     * @param exception      the I/O failure of the last attempt
     * @param executionCount number of attempts made so far
     * @param context        execution context of the failed request
     * @return {@code true} to retry, {@code false} to give up
     */
    private static boolean shouldRetry(IOException exception, int executionCount, HttpContext context) {
        log.info("重试次数:{}", executionCount);
        // Give up once the attempt limit is reached.
        if (executionCount >= MAX_EXECUTIONS) {
            log.info("重试大于10次");
            return false;
        }
        // The server dropped the connection: retry.
        if (exception instanceof NoHttpResponseException) {
            return true;
        }
        // Never retry SSL failures; SSLHandshakeException is a subclass of SSLException.
        if (exception instanceof SSLException) {
            return false;
        }
        // Connect timeouts are not retried. This test must come before the
        // InterruptedIOException test below: ConnectTimeoutException is a subclass of it
        // and would otherwise never be reached.
        if (exception instanceof ConnectTimeoutException) {
            return false;
        }
        // Other timeouts / interrupted I/O: retry.
        if (exception instanceof InterruptedIOException) {
            return true;
        }
        // Host could not be resolved: retry.
        if (exception instanceof UnknownHostException) {
            return true;
        }
        // Any other failure: retry only requests that carry no entity (treated as idempotent).
        HttpClientContext clientContext = HttpClientContext.adapt(context);
        HttpRequest request = clientContext.getRequest();
        return !(request instanceof HttpEntityEnclosingRequest);
    }
}
使用方式
import com.felix.pro.demo.model.Province;
import com.felix.pro.demo.repository.ProvinceRepository;
import com.felix.pro.demo.util.HttpUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawls the province index page and stores every province it finds.
 *
 * @author felix
 * @since 2019-06-12 14:06
 */
@Slf4j
@Component
public class CrawlProvince {

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private ProvinceRepository provinceRepository;

    /**
     * Downloads the province index page, reads one province per table cell matching
     * {@code .provincetable .provincetr td}, and saves each one through the repository.
     *
     * <p>The province code is the digit run found in the cell's link {@code href}; the same
     * value, parsed as an integer, is used as the sort key. Cells without a usable code are
     * skipped instead of being saved as blank records.
     *
     * @throws IOException          if the page cannot be downloaded
     * @throws InterruptedException never thrown by this body; kept in the signature so
     *                              existing callers keep compiling
     */
    public void province() throws IOException, InterruptedException {
        String html = httpUtils.getHtml("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html");
        Document document = Jsoup.parse(html);
        Elements cells = document.select(".provincetable .provincetr td");

        List<Province> provinces = new ArrayList<>();
        for (Element cell : cells) {
            Province province = new Province();
            String code = null;
            for (Element anchor : cell.select("a")) {
                // Splitting on non-digits leaves only digit runs, plus empty strings when the
                // href starts with a non-digit (e.g. "./11.html"); those must not reach parseInt.
                for (String token : anchor.attr("href").split("\\D")) {
                    if (token.isEmpty()) {
                        continue;
                    }
                    log.info("ProvinceCode:{}", token);
                    code = token;
                    province.setProvinceCode(token);
                    province.setSort(Integer.parseInt(token));
                }
            }

            String provinceName = cell.text();
            if (code == null) {
                // No link or no digits in it: nothing meaningful to persist for this cell.
                log.warn("skip cell without province code, text:{}", provinceName);
                continue;
            }
            log.info("provinceName:{}", provinceName);
            province.setProvinceName(provinceName);
            provinces.add(province);
        }

        for (Province province : provinces) {
            provinceRepository.save(province);
        }
        log.info("完成共: {} 条数据", provinces.size());
    }
}