<!-- Parses page elements -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!--htmlunit解析js加载的动态数据页面 https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.33</version>
</dependency>
<!-- Parses pages whose data is loaded dynamically by JavaScript -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.44.0</version>
</dependency>
<!-- Parses CAPTCHAs by recognizing images -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.5.3</version>
</dependency>
发出请求的方式
一、直接通过jsoup发起请求,缺点是不能爬取js动态加载的数据;
/**
 * Fetches a page with jsoup and returns the elements matched by a selector query.
 *
 * <p>{@code SslUtils.ignoreSsl()} is invoked first; if it throws, the stack trace is
 * printed and the request is attempted anyway.
 *
 * @param HomeUrl address of the page to request
 * @param divClassName query handed unchanged to {@code Document.select}
 *     (NOTE(review): despite the name, jsoup presumably treats this as a CSS
 *     selector, so a bare class name would need a leading dot; confirm with callers)
 * @return the elements selected from the fetched document
 * @throws IOException propagated from the jsoup GET request
 */
public static Elements getDoc(String HomeUrl, String divClassName) throws IOException {
    // Best-effort SSL setup: a failure here is reported but does not stop the fetch.
    try {
        SslUtils.ignoreSsl();
    } catch (Exception sslSetupFailure) {
        sslSetupFailure.printStackTrace();
    }

    // Present the request as a desktop Chrome browser.
    final String browserUserAgent =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36";

    final Document page = Jsoup.connect(HomeUrl).userAgent(browserUserAgent).get();
    return page.select(divClassName);
}
二、通过htmlunit发起请求,可以解决上面的缺点,主要是模拟浏览器的环境
public static String htmlJsUtils(String url) {
URL url1 = null;
System.out.println("Loading page now-----------------------------------------------: " + url);
// HtmlUnit 模拟浏览器
WebClient webClient = new WebClient(BrowserVersion.CHROME);
webClient.getOptions().setJavaScriptEnabled(true); // 启用JS解释器,默认为true
webClient.getOptions().setCssEnabled(false); // 禁用css支持
webClient.getOptions().setThrowExceptionOnScriptError(false); // js运行错误时,是否抛出异常
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.getOptions().setTimeout(10 * 1000); // 设置连接超时时间
try {
url1 = new URL(url);
} catch (MalformedURLException e) {
e.printSta