Java——HttpClient爬取网页,jsoup解析网页
文章目录
1、所用技术:Httpclient
HttpClient 是Apache Jakarta Common 下的子项目,可以用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包,并且它支持 HTTP 协议最新的版本和建议。
2、引入依赖
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.8</version>
</dependency>
3、新建一个测试类
import java.io.IOException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.List;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* @author ZWB
*/
/**
 * Fetches the raw HTML of a web page with Apache HttpClient 4.x,
 * accepting any SSL certificate (useful for crawling demo purposes).
 *
 * @author ZWB
 */
public class HttpClientT4 {

    /** SSL context that trusts every server certificate. */
    private static SSLContext ctx;

    static {
        try {
            ctx = SSLContext.getInstance("TLS");
            // Trust-all manager: disables certificate validation entirely.
            // WARNING: this is vulnerable to man-in-the-middle attacks;
            // acceptable only for throwaway crawling experiments.
            X509TrustManager tm = new X509TrustManager() {
                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    // The X509TrustManager contract requires a non-null result;
                    // return an empty array instead of null.
                    return new X509Certificate[0];
                }

                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) {
                    // Accept all client certificates.
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) {
                    // Accept all server certificates.
                }
            };
            ctx.init(null, new TrustManager[] { tm }, null);
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    public static void main(String[] args) throws Exception {
        String url = "https://taolitop.com";
        String html = HttpClientT4.resultHtml(url);
        System.out.println(html);
    }

    /**
     * Downloads the page at {@code url} and returns its HTML decoded as UTF-8.
     *
     * @param url the page URL (http or https)
     * @return the page HTML, or the literal fallback string "爬取错误!" when the request fails
     * @throws Exception kept in the signature for caller compatibility
     */
    public static String resultHtml(String url) throws Exception {
        String result = "爬取错误!";
        // setSSLContext(ctx) lets the client accept self-signed certificates.
        // try-with-resources closes the client and releases its connections;
        // the original version leaked the client.
        try (CloseableHttpClient httpClient =
                HttpClientBuilder.create().setSSLContext(ctx).build()) {
            HttpGet request = new HttpGet(url);
            HttpResponse response = httpClient.execute(request);
            // EntityUtils.toString fully consumes the entity, which also
            // releases the underlying connection.
            result = EntityUtils.toString(response.getEntity(), "utf-8");
        } catch (IOException e) {
            // ClientProtocolException is a subtype of IOException, so one
            // catch clause covers both of the original's catch blocks.
            e.printStackTrace();
        }
        return result;
    }
}
执行上面的代码,会得到一个完整的html的代码:
你会看到一个完整的网页
原因是这个网站没有设置反爬虫,返回状态是200,所以我们没有做什么特殊设置就轻松获取了相关的资源。
如果对应的网站有识别了爬虫程序,怎么办?
爬下面的网站,就会发现问题
https://www.tuicool.com/
爬虫程序被识别了,我们该怎么解决呢?
4、复杂的爬虫应用
4.1、对请求头进行伪装
伪装成浏览器。不过即使伪装了请求头,如果短时间内多次访问,网站仍会对你的IP进行封杀,这时就需要换一个IP地址,即使用代理IP。
4.2、使用代理IP
网上有一些免费的代理ip网站,比如xici
我们选择那些存活时间久并且刚刚被验证的ip,我这里选择了“112.85.168.223:9999”,代码如下
4.3、创建get请求,伪装成浏览器访问
把 HttpGet request = new HttpGet(url); 换成如下代码:
HttpGet request = new HttpGet(url);
request.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36");
//如果你伪装了之后,如果短时间内一直多次访问的话,网站会对你的ip进行封杀,这个时候就需要换个ip地址了,使用代理IP
HttpHost proxy = new HttpHost("112.85.168.223", 9999);
RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
request.setConfig(config);
全部代码
import java.io.IOException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.List;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* @author ZWB
*/
/**
 * Fetches page HTML while disguising the request as a desktop browser and
 * routing it through an HTTP proxy, to get past simple anti-crawler checks.
 *
 * @author ZWB
 */
public class HttpClientT4 {

    /** SSL context that trusts every server certificate. */
    private static SSLContext ctx;

    static {
        try {
            ctx = SSLContext.getInstance("TLS");
            // Trust-all manager: disables certificate validation entirely.
            // WARNING: vulnerable to man-in-the-middle attacks; demo use only.
            X509TrustManager tm = new X509TrustManager() {
                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    // Contract requires non-null; return an empty array.
                    return new X509Certificate[0];
                }

                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) {
                    // Accept all client certificates.
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) {
                    // Accept all server certificates.
                }
            };
            ctx.init(null, new TrustManager[] { tm }, null);
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    public static void main(String[] args) throws Exception {
        String url = "https://taolitop.com";
        String html = HttpClientT4.resultHtml(url);
        System.out.println(html);
    }

    /**
     * Downloads the page at {@code url} through a proxy, sending a browser
     * User-Agent header, and returns its HTML decoded as UTF-8.
     *
     * @param url the page URL (http or https)
     * @return the page HTML, or the literal fallback string "爬取错误!" when the request fails
     * @throws Exception kept in the signature for caller compatibility
     */
    public static String resultHtml(String url) throws Exception {
        String result = "爬取错误!";
        // setSSLContext(ctx) lets the client accept self-signed certificates;
        // try-with-resources closes the client (the original leaked it).
        try (CloseableHttpClient httpClient =
                HttpClientBuilder.create().setSSLContext(ctx).build()) {
            HttpGet request = new HttpGet(url);
            // Pretend to be Chrome so naive anti-crawler checks let us through.
            request.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36");
            // Hard-coded free proxy from the article; such proxies expire quickly,
            // so expect connection timeouts and swap in a live one when needed.
            HttpHost proxy = new HttpHost("112.85.168.223", 9999);
            RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
            request.setConfig(config);
            HttpResponse response = httpClient.execute(request);
            result = EntityUtils.toString(response.getEntity(), "utf-8");
        } catch (IOException e) {
            // ClientProtocolException is a subtype of IOException.
            e.printStackTrace();
        }
        return result;
    }
}
执行代码,能正常返回html结果。
如果代理ip刚好不能用的话,会报错,如下显示连接超时,这个时候需要更换一个新的代理ip
4.4、访问失败的其他原因
系统被识别出来爬虫的很大原因是短时间内对系统进行了大量的访问,因此我们也可以放慢爬取的速度,让程序sleep一段时间再爬下一个也是一种解决反爬虫的简单方法。
5、使用jsoup解析html
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,
可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
详情可以参考博客Jsoup详解
5.1、导入依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
5.2、解析网页
以爬取https://taolitop.com/为例,在这之前我们已经有了该网页的html,例如我们将解析该网页的全部图片,我们只需要利用Jsoup去解析图片,添加对应代码即可:
Document parse = Jsoup.parse(result);
Elements links = parse.getElementsByTag("img");
for (Element link : links) {
String linkHref = link.attr("src");
linkHref = "https://taolitop.com" + linkHref;
imgList.add(linkHref);
}
完整代码:
import java.io.IOException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.List;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* @author shifengqiang 2022/3/15 6:03 PM
*/
/**
 * Downloads a page with Apache HttpClient and uses jsoup to extract the
 * absolute URLs of all images ({@code <img src=...>}) on the page.
 *
 * @author shifengqiang 2022/3/15 6:03 PM
 */
public class HttpClientT4 {

    /** SSL context that trusts every server certificate. */
    private static SSLContext ctx;

    static {
        try {
            ctx = SSLContext.getInstance("TLS");
            // Trust-all manager: disables certificate validation entirely.
            // WARNING: vulnerable to man-in-the-middle attacks; demo use only.
            X509TrustManager tm = new X509TrustManager() {
                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    // Contract requires non-null; return an empty array.
                    return new X509Certificate[0];
                }

                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) {
                    // Accept all client certificates.
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) {
                    // Accept all server certificates.
                }
            };
            ctx.init(null, new TrustManager[] { tm }, null);
        } catch (Exception e) {
            System.out.println(e);
        }
    }

    public static void main(String[] args) throws Exception {
        // BUG FIX: the original referenced an undeclared variable `url`,
        // which did not compile. Declare the crawl target explicitly.
        String url = "https://taolitop.com";
        List<String> strings = HttpClientT4.resultImgUrl(url);
        for (String imgUrl : strings) {
            System.out.println(imgUrl);
        }
    }

    /**
     * Fetches {@code url}, parses the HTML with jsoup, and returns the list of
     * image URLs found in {@code <img src=...>} tags, prefixed with the site root.
     *
     * @param url the page URL (http or https)
     * @return absolute image URLs; empty when the request fails or no images exist
     * @throws Exception kept in the signature for caller compatibility
     */
    public static List<String> resultImgUrl(String url) throws Exception {
        List<String> imgList = new ArrayList<>();
        // setSSLContext(ctx) lets the client accept self-signed certificates;
        // try-with-resources closes the client (the original leaked it).
        try (CloseableHttpClient httpClient =
                HttpClientBuilder.create().setSSLContext(ctx).build()) {
            HttpGet request = new HttpGet(url);
            HttpResponse response = httpClient.execute(request);
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            // Parse the downloaded HTML and collect every <img> tag's src.
            Document parse = Jsoup.parse(result);
            Elements links = parse.getElementsByTag("img");
            for (Element link : links) {
                String linkHref = link.attr("src");
                // NOTE(review): assumes src is site-relative (e.g. "/a.png");
                // an absolute src would be double-prefixed -- confirm for other sites.
                linkHref = "https://taolitop.com" + linkHref;
                imgList.add(linkHref);
            }
        } catch (IOException e) {
            // ClientProtocolException is a subtype of IOException.
            e.printStackTrace();
        }
        return imgList;
    }
}
此时就获取到了该网页的全部图片资源:
我们可以点击该网址直接打开。
同时:
jsoup也可以解析其他的内容,也可以根据class或者id解析都可,具体深入学习可以参考jsoup详解博客。