jsoup爬取网页信息
/**
* 爬取网页数据
* url商品链接地址
*/
public AjaxResult parseSearchShoes(String url) {
try {
logger.info("【传入的url链接:】" + url);
//解析Url获取Document对象
Document document = Jsoup.connect(url).get();
//获取网页源码文本内容
logger.info("【获取网页源码】" + document.toString());
catch (Exception e){
logger.info("【爬取失败:】" + e.getMessage());
}
return AjaxResult.sucess();
}
问题:实际爬取过程中发现返回的页面源码有时是完整的,有时是不完整的。这可能是页面懒加载导致的,所以我们爬取页面时也需要等待延迟加载完成。可以利用 htmlunit 技术。
htmlunit 爬取页面信息
/**
 * Fetches a page with HtmlUnit so that JavaScript (including lazy-loading
 * AJAX) is executed, then converts the rendered DOM into a jsoup Document.
 *
 * @param url the page URL to fetch
 * @return the fully rendered page parsed as a jsoup {@link Document}
 * @throws IOException if the page cannot be fetched
 */
public Document htmlunitMethod(String url) throws IOException {
    // WebClient is AutoCloseable; use try-with-resources so the underlying
    // HTTP connections and JS engine are released (the original leaked it).
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // Run page scripts so lazily loaded content is rendered
        webClient.getOptions().setJavaScriptEnabled(true);
        // CSS is irrelevant for scraping; skip it (original set this twice)
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setActiveXNative(false);
        // Tolerate broken page JS and non-2xx sub-resources instead of throwing
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        // Connect/read timeout in milliseconds
        webClient.getOptions().setTimeout(5000);
        HtmlPage rootPage = webClient.getPage(url);
        // Give background JavaScript (AJAX / lazy loading) up to 5s to finish
        webClient.waitForBackgroundJavaScript(5000);
        // Serialize the rendered DOM and hand it to jsoup
        return Jsoup.parse(rootPage.asXml());
    }
}
pom依赖
<!-- jsoup: HTML fetching and parsing (used by parseSearchShoes).
     NOTE(review): 1.11.2 is quite old; jsoup versions before 1.14.2 are
     affected by CVE-2021-37714 — consider upgrading. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
<!-- HtmlUnit: headless browser that executes JavaScript, used in
     htmlunitMethod to render lazily loaded pages before parsing. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.25</version>
</dependency>