webmagic ajax,【WebMagic】抓取前端渲染的页面

随着AJAX技术的不断普及,以及AngularJS这类单页应用(Single-page application)框架的出现,由js渲染的页面越来越多。对于爬虫来说,这种页面比较讨厌:仅仅提取HTML内容,往往无法拿到有效的信息。那么如何处理这种页面呢?总的来说有两种做法:

方法一:在抓取阶段,在爬虫中内置一个浏览器内核,执行js渲染页面后,再抓取。这方面对应的工具有Selenium、HtmlUnit或者PhantomJs。但是这些工具都存在一定的效率问题,同时也不是那么稳定。好处是编写规则同静态页面一样。

方法二:因为js渲染页面的数据也是从后端拿到,而且基本上都是通过AJAX获取,所以分析AJAX请求,找到对应数据的请求,也是比较可行的做法。而且相对于页面样式,这种接口变化的可能性更小。缺点是找到这个请求并进行模拟,是一个相对困难的过程,也需要相对多的分析经验。

方法一是通过执行js渲染页面获取静态页面来抓取数据,而方法二则是找规律找到目标数据的请求URL来获取数据。

本次着重讲解方法一,这种方式相对来说较为简单。

环境安装

项目搭建

我使用的是SpringBoot框架,项目的目录结构如下:

dcc2dd4f1505

crawler.png

各源文件如下:

pom.xml

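<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.4.0</version>
    </parent>

    <groupId>com.born2do</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <webmagic.version>0.7.4</webmagic.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.15</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>30.0-jre</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.9</version>
        </dependency>
        <dependency>
            <groupId>com.kotcrab.remark</groupId>
            <artifactId>remark</artifactId>
            <version>1.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>2.33.0</version>
        </dependency>
    </dependencies>
</project>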

CrawlerOnSpringBoot2Doc.java

package com.born2do.task;

import com.born2do.webmagic.downloader.selenium.SeleniumDownloader;

import com.overzealous.remark.Remark;

import org.springframework.scheduling.annotation.Scheduled;

import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.processor.PageProcessor;

import java.io.IOException;

import java.nio.file.Files;

import java.nio.file.Path;

import java.nio.file.Paths;

import java.util.List;

/**
 * Crawls the "SpringBoot2 Core Technology and Reactive Programming" teaching document
 * hosted on Yuque and stores it as a single Markdown file.
 *
 * <p>Note: the Yuque front end is fairly complex, its data is encrypted in several layers and
 * the page content is mainly rendered through Ajax. Code blocks and mind maps are not
 * post-processed here and have to be cleaned up manually.
 *
 * @author chenhy
 * @date 2021/3/22
 */
@Component
public class CrawlerOnSpringBoot2Doc implements PageProcessor {

    /** Entry page whose table of contents lists every chapter URL. */
    private static final String website = "https://www.yuque.com/atguigu/springboot";

    /** Target Markdown file; it is recreated on every run. */
    private static final String file = "D:\\SpringBoot2核心技术与响应式编程.md";

    private Site site = Site.me().setCharset("UTF8") // page encoding
            .setTimeOut(1000 * 30) // download timeout
            .setRetrySleepTime(1000 * 5) // pause between retries
            .setRetryTimes(10); // number of retries

    /**
     * Handles one downloaded page: the entry page only contributes chapter links to the crawl
     * queue, every other page has its title and body extracted and converted to Markdown.
     *
     * @param page the rendered page handed over by the downloader
     */
    @Override
    public void process(Page page) {
        if (website.equals(page.getUrl().toString())) {
            // Entry page: collect all chapter URLs and queue them for crawling.
            List<String> urls = page.getHtml().xpath("//span[@class='name']").links().all();
            page.addTargetRequests(urls);
            return;
        }

        // Chapter page: title followed by the document body. A selector that matches nothing
        // yields null, which must not end up as the literal text "null" in the output.
        String titleHtml = page.getHtml().xpath("//div[@class='index-module_title_1s0gC']").toString();
        String bodyHtml = page.getHtml().xpath("//div[@class='yuque-doc-content']").toString();
        String html = orEmpty(titleHtml) + orEmpty(bodyHtml);

        // Convert the extracted HTML fragment to Markdown.
        String markdown = new Remark().convert(html);
        page.putField("content", markdown);
        System.out.println(page.getUrl() + " download over!");
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled entry point: recreates the output file, then crawls the whole document with a
     * single Selenium-backed thread and appends every chapter through {@link MarkdownPipeLine}.
     *
     * @throws IOException if the output file cannot be deleted or created
     */
    @Scheduled(fixedDelay = 1000 * 60 * 60 * 24)
    private void mainProcess() throws IOException {
        Path filePath = Paths.get(file);
        // Start from an empty file so repeated runs do not append to stale content.
        Files.deleteIfExists(filePath);
        Files.createFile(filePath);

        Spider.create(this)
                .addUrl(website)
                .setDownloader(new SeleniumDownloader("D:\\chromedriver_win32\\chromedriver.exe"))
                .thread(1)
                .addPipeline(new MarkdownPipeLine(filePath))
                .run();
        System.out.println("process is over!");
    }

    /** Returns {@code value}, or the empty string when {@code value} is {@code null}. */
    private static String orEmpty(String value) {
        return value == null ? "" : value;
    }
}

MarkdownPipeLine.java

package com.born2do.task;

import org.springframework.stereotype.Component;

import us.codecraft.webmagic.ResultItems;

import us.codecraft.webmagic.Task;

import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.IOException;

import java.nio.charset.StandardCharsets;

import java.nio.file.Files;

import java.nio.file.Path;

import java.nio.file.StandardOpenOption;

import java.util.Map;

/**
 * Pipeline that appends the {@code "content"} field of every crawled page to one Markdown file.
 *
 * @author chenhy
 * @date 2021/3/21
 */
public class MarkdownPipeLine implements Pipeline {

    /** File the Markdown is appended to; it must already exist because only APPEND is used. */
    private final Path filePath;

    /**
     * @param filePath existing file that receives the appended Markdown
     */
    public MarkdownPipeLine(Path filePath) {
        this.filePath = filePath;
    }

    /**
     * Appends the page's {@code "content"} field followed by a line separator.
     * Pages that did not produce a {@code "content"} field are skipped.
     *
     * @param resultItems fields extracted by the page processor
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String content = resultItems.get("content");
        if (content == null) {
            // Pages without a "content" field (the processor only sets it for chapter pages)
            // would otherwise fail with a NullPointerException here.
            return;
        }

        // Write content and trailing newline in one call instead of two separate appends.
        String entry = content + System.lineSeparator();
        try {
            Files.write(filePath, entry.getBytes(StandardCharsets.UTF_8), StandardOpenOption.APPEND);
        } catch (IOException e) {
            // Best effort: a failed write must not abort the remaining crawl.
            e.printStackTrace();
        }
    }
}

SeleniumDownloader.java

package com.born2do.webmagic.downloader.selenium;

import org.openqa.selenium.By;

import org.openqa.selenium.Cookie;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.WebElement;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Request;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Task;

import us.codecraft.webmagic.downloader.Downloader;

import us.codecraft.webmagic.selector.Html;

import us.codecraft.webmagic.selector.PlainText;

import java.io.Closeable;

import java.io.IOException;

import java.util.Map;

/**
 * Downloader that renders pages in a real browser through Selenium. Currently only Chrome is
 * supported, and the matching Selenium driver binary has to be downloaded separately.
 *
 * @author code4crafter@gmail.com
 * Date: 13-7-26
 * Time: 1:37 PM
 */
public class SeleniumDownloader implements Downloader, Closeable {

    /** Created lazily on the first download; volatile so the double-checked init is safe. */
    private volatile WebDriverPool webDriverPool;

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Milliseconds to wait after navigation before the DOM is read. */
    private int sleepTime = 0;

    /** Capacity handed to the pool when it is created. */
    private int poolSize = 1;

    /**
     * Creates a downloader and registers the ChromeDriver binary location.
     *
     * @param chromeDriverPath path stored in the {@code webdriver.chrome.driver} system property
     */
    public SeleniumDownloader(String chromeDriverPath) {
        System.setProperty("webdriver.chrome.driver", chromeDriverPath);
    }

    /**
     * Creates a downloader without setting any driver path; the driver location must then be
     * provided through the corresponding system property by the caller.
     *
     * @author bob.li.0718@gmail.com
     */
    public SeleniumDownloader() {
    }

    /**
     * Sets how long to wait after navigation so that scripts can finish rendering.
     *
     * @param sleepTime wait time in milliseconds
     * @return this
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Loads the request URL in a pooled browser and returns the rendered DOM as a page.
     *
     * @param request request whose URL is opened
     * @param task    task providing the site settings (cookies)
     * @return the rendered page, or {@code null} if the thread was interrupted while waiting
     *         for a free browser
     */
    @Override
    public Page download(Request request, Task task) {
        checkInit();
        WebDriver webDriver;
        try {
            webDriver = webDriverPool.get();
        } catch (InterruptedException e) {
            logger.warn("interrupted", e);
            // Keep the interrupt visible to the caller instead of silently clearing it.
            Thread.currentThread().interrupt();
            return null;
        }

        try {
            logger.info("downloading page {}", request.getUrl());
            webDriver.get(request.getUrl());
            waitForRendering();
            applyCookies(webDriver, task.getSite());

            /*
             * TODO You can add mouse event or other processes
             *
             * @author: bob.li.0718@gmail.com
             */
            WebElement root = webDriver.findElement(By.xpath("/html"));
            String content = root.getAttribute("outerHTML");

            Page page = new Page();
            page.setRawText(content);
            page.setHtml(new Html(content, request.getUrl()));
            page.setUrl(new PlainText(request.getUrl()));
            page.setRequest(request);
            return page;
        } finally {
            // Always hand the browser back; otherwise one failed page would drain the pool and
            // every later download would block in WebDriverPool.get().
            webDriverPool.returnToPool(webDriver);
        }
    }

    /** Sleeps for {@link #sleepTime} milliseconds, restoring the interrupt flag if interrupted. */
    private void waitForRendering() {
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            logger.warn("interrupted while waiting for page rendering", e);
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Copies the site's cookies into the browser session. This runs after navigation, so the
     * cookies are set on the already loaded page.
     */
    private void applyCookies(WebDriver webDriver, Site site) {
        if (site == null || site.getCookies() == null) {
            return;
        }
        WebDriver.Options manage = webDriver.manage();
        for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
            manage.addCookie(new Cookie(cookieEntry.getKey(), cookieEntry.getValue()));
        }
    }

    /** Lazily creates the driver pool exactly once, even when called from several threads. */
    private void checkInit() {
        if (webDriverPool == null) {
            synchronized (this) {
                // Re-check under the lock: another thread may have created the pool meanwhile.
                if (webDriverPool == null) {
                    webDriverPool = new WebDriverPool(poolSize);
                }
            }
        }
    }

    /**
     * Sets the pool capacity. Only effective before the first download, because the pool is
     * created with the value current at that time.
     *
     * @param thread number of browser instances the pool may create
     */
    @Override
    public void setThread(int thread) {
        this.poolSize = thread;
    }

    /**
     * Quits all browsers created by this downloader. Does nothing if no page was ever downloaded.
     *
     * @throws IOException declared by {@link Closeable}; not thrown by this implementation
     */
    @Override
    public void close() throws IOException {
        WebDriverPool pool = webDriverPool;
        if (pool != null) {
            pool.closeAll();
        }
    }
}

WebDriverPool.java

package com.born2do.webmagic.downloader.selenium;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.chrome.ChromeDriver;

import org.openqa.selenium.firefox.FirefoxDriver;

import org.openqa.selenium.remote.DesiredCapabilities;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import java.io.FileReader;

import java.io.IOException;

import java.net.MalformedURLException;

import java.net.URL;

import java.util.ArrayList;

import java.util.Collections;

import java.util.List;

import java.util.Properties;

import java.util.concurrent.BlockingDeque;

import java.util.concurrent.LinkedBlockingDeque;

import java.util.concurrent.atomic.AtomicInteger;

/**
 * Bounded pool of Selenium {@link WebDriver} instances. Drivers are created lazily up to the
 * configured capacity; once the capacity is reached callers block until a driver is returned.
 *
 * @author code4crafter@gmail.com
 * Date: 13-7-26
 * Time: 1:41 PM
 */
class WebDriverPool {

    private Logger logger = LoggerFactory.getLogger(getClass());

    private final static int DEFAULT_CAPACITY = 5;

    private final int capacity;

    private final static int STAT_RUNNING = 1;

    private final static int STAT_CLOSED = 2;

    private AtomicInteger stat = new AtomicInteger(STAT_RUNNING);

    /** Driver created by the most recent successful {@link #configure()} call. */
    private WebDriver mDriver = null;

    private static final String DEFAULT_CONFIG_FILE = "/config.ini";

    private static final String DRIVER_FIREFOX = "firefox";

    private static final String DRIVER_CHROME = "chrome";

    protected static Properties sConfig;

    protected static DesiredCapabilities sCaps;

    /**
     * store webDrivers created
     */
    private List<WebDriver> webDriverList = Collections
            .synchronizedList(new ArrayList<WebDriver>());

    /**
     * store webDrivers available
     */
    private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();

    public WebDriverPool(int capacity) {
        this.capacity = capacity;
    }

    public WebDriverPool() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * Reads the driver configuration and starts a new browser, stored in {@link #mDriver}.
     * The config file is the one named by the {@code selenuim_config} system property (the
     * property name is kept as-is for compatibility) or, if unset, {@code /config.ini} from the
     * classpath. This part of the code originates from GhostDriver.
     * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
     *
     * @author bob.li.0718@gmail.com
     * @throws IOException if the config file is missing or unreadable, or names an unsupported
     *                     driver
     */
    public void configure() throws IOException {
        // Locate config file
        String configFile = System.getProperty("selenuim_config");
        if (configFile == null) {
            URL resource = getClass().getResource(DEFAULT_CONFIG_FILE);
            if (resource == null) {
                throw new IOException("Selenium config not found on classpath: " + DEFAULT_CONFIG_FILE);
            }
            configFile = resource.getPath();
        }

        // Read config file; the reader is closed even if loading fails
        Properties config = new Properties();
        try (FileReader reader = new FileReader(configFile)) {
            config.load(reader);
        }
        sConfig = config;

        // Prepare capabilities
        DesiredCapabilities caps = new DesiredCapabilities();
        caps.setJavascriptEnabled(true);
        caps.setCapability("takesScreenshot", false);
        sCaps = caps;

        // Start appropriate Driver
        String driver = config.getProperty("driver", DRIVER_CHROME);
        if (DRIVER_FIREFOX.equals(driver)) {
            mDriver = new FirefoxDriver(caps);
        } else if (DRIVER_CHROME.equals(driver)) {
            mDriver = new ChromeDriver(caps);
        } else {
            // Without this, a stale (or null) mDriver would be added to the pool by get().
            throw new IOException("Unsupported driver '" + driver + "' in " + configFile);
        }
    }

    /**
     * check whether input is a valid URL
     *
     * @author bob.li.0718@gmail.com
     * @param urlString urlString
     * @return true means yes, otherwise no.
     */
    private boolean isUrl(String urlString) {
        try {
            new URL(urlString);
            return true;
        } catch (MalformedURLException mue) {
            return false;
        }
    }

    /**
     * Takes a driver from the pool, creating a new one while the capacity is not exhausted and
     * otherwise blocking until one is returned.
     *
     * @return an idle driver; hand it back with {@link #returnToPool(WebDriver)}
     * @throws InterruptedException  if interrupted while waiting for a driver
     * @throws IllegalStateException if the pool is closed, or if no driver could be created and
     *                               none exists that could ever be returned
     */
    public WebDriver get() throws InterruptedException {
        checkRunning();
        WebDriver idle = innerQueue.poll();
        if (idle != null) {
            return idle;
        }
        if (webDriverList.size() < capacity) {
            synchronized (webDriverList) {
                if (webDriverList.size() < capacity) {
                    // add new WebDriver instance into pool
                    try {
                        configure();
                        innerQueue.add(mDriver);
                        webDriverList.add(mDriver);
                    } catch (IOException e) {
                        logger.error("Failed to create WebDriver", e);
                        if (webDriverList.isEmpty()) {
                            // Nothing will ever be returned to the queue; waiting would hang.
                            throw new IllegalStateException("Failed to create WebDriver", e);
                        }
                        // Otherwise fall through and wait for an existing driver.
                    }
                }
            }
        }
        return innerQueue.take();
    }

    /**
     * Makes a driver obtained from {@link #get()} available again.
     *
     * @param webDriver the driver to hand back
     */
    public void returnToPool(WebDriver webDriver) {
        checkRunning();
        innerQueue.add(webDriver);
    }

    /**
     * @throws IllegalStateException if the pool has been closed
     */
    protected void checkRunning() {
        if (stat.get() != STAT_RUNNING) {
            throw new IllegalStateException("Already closed!");
        }
    }

    /**
     * Marks the pool as closed and quits every driver it created. A driver that fails to quit is
     * logged and does not prevent the remaining ones from being quit.
     *
     * @throws IllegalStateException if the pool was already closed
     */
    public void closeAll() {
        if (!stat.compareAndSet(STAT_RUNNING, STAT_CLOSED)) {
            throw new IllegalStateException("Already closed!");
        }
        // Iterating a Collections.synchronizedList requires holding its lock manually.
        synchronized (webDriverList) {
            for (WebDriver webDriver : webDriverList) {
                logger.info("Quit webDriver" + webDriver);
                try {
                    webDriver.quit();
                } catch (RuntimeException e) {
                    logger.warn("Failed to quit webDriver" + webDriver, e);
                }
            }
        }
    }
}

config.ini

# What WebDriver to use for the tests

#driver=firefox

driver=chrome

#谷歌浏览器启动程序路径

#chrome_exec_path=D:\chromedriver_win32\chromedriver.exe

chrome_driver_loglevel=DEBUG

说明

Selenium 已不再支持 PhantomJS,以前还可以通过降低jar包版本的方式解决该问题,但是现在已经不行了,即使版本再低,也不会引入PhantomJS相关的jar包了,所以webmagic作者开发的webmagic-selenium已经不能再用了(好像已经很久都没有更新版本了。。。),我这里的处理是下载源码,在源码的基础上将PhantomJS的相关内容都已经剔除了。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值