前面介绍了使用request可以获取静态页面并解析其中的内容,但是对于ajax这种动态渲染的页面却无能为力,这时可以使用selenium模拟浏览器来抓取页面数据。
需要安装 Selenium WebDriver 和 ChromeDriver。
安装selenium
# Install the selenium package (the Python bindings imported by the script below) with pip.
pip install selenium
安装ChromeDriver
# wget is only used to download the archive. Current Homebrew rejects formula
# options such as --with-libressl, so install it without any option.
brew install wget
# Download the ChromeDriver archive; the driver version has to match the
# version of Chrome installed on this machine.
wget https://npm.taobao.org/mirrors/chromedriver/71.0.3578.33/chromedriver_mac64.zip
unzip chromedriver_mac64.zip
# Put the binary on PATH so Selenium can find it without extra configuration.
sudo mv chromedriver /usr/local/bin/chromedriver
# Make the driver executable for every user (the old u+x,o+x skipped the group).
sudo chmod a+x /usr/local/bin/chromedriver
python代码实例
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
# Print the link text of every entry in the first news list on qq.com, using a
# real Chrome instance so that content rendered by JavaScript is in the DOM.
driver = webdriver.Chrome()
try:
    # Implicit wait: element lookups keep polling for up to 10 seconds
    # before giving up, which gives dynamically rendered nodes time to appear.
    driver.implicitly_wait(10)
    driver.get("https://www.qq.com/")
    news_items = driver.find_elements(By.XPATH, "//*[@id='tab-news-01']/ul[1]/li")
    for item in news_items:
        # `text` is a property, not a method. find_element(By.TAG_NAME, ...)
        # replaces find_element_by_tag_name, which Selenium 4 removed.
        print(item.find_element(By.TAG_NAME, "a").text)
finally:
    # quit() ends the whole browser session (close() only closes the current
    # window); running it in `finally` avoids leaking Chrome on errors.
    driver.quit()
java代码实例
import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.util.List;
import java.util.concurrent.TimeUnit;
/**
 * Demo that opens qq.com in Chrome through Selenium and logs the link text of
 * every entry in the first news list.
 *
 * <p>A real browser is driven so that content rendered by JavaScript is present
 * in the DOM when the elements are looked up.
 */
@Slf4j
public class Test {

    /** Location of the chromedriver binary used to control Chrome. */
    private static final String CHROME_DRIVER_PATH = "/usr/local/bin/chromedriver";

    /** Page to scrape. */
    private static final String TARGET_URL = "https://www.qq.com/";

    /** XPath selecting the {@code <li>} entries of the first news list. */
    private static final String NEWS_ITEMS_XPATH = "//*[@id='tab-news-01']/ul[1]/li";

    /** Upper bound, in seconds, that element lookups poll before giving up. */
    private static final long IMPLICIT_WAIT_SECONDS = 1;

    /**
     * Starts Chrome, loads the page, logs the collected link texts and always
     * shuts the browser down again.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
        WebDriver driver = new ChromeDriver();
        try {
            configureImplicitWait(driver);
            driver.get(TARGET_URL);

            List<WebElement> newsItems = driver.findElements(By.xpath(NEWS_ITEMS_XPATH));
            StringBuilder titles = new StringBuilder();
            for (WebElement item : newsItems) {
                WebElement link = item.findElement(By.tagName("a"));
                titles.append(link.getText()).append('\n');
            }
            log.info(titles.toString());
        } finally {
            // Runs even when navigation or a lookup throws, so neither the
            // Chrome window nor the chromedriver process is left behind.
            driver.quit();
        }
    }

    /**
     * Sets the implicit wait on the driver. A failure here is logged and the
     * run continues with the driver's default timeout (best effort).
     */
    private static void configureImplicitWait(WebDriver driver) {
        try {
            driver.manage().timeouts().implicitlyWait(IMPLICIT_WAIT_SECONDS, TimeUnit.SECONDS);
        } catch (RuntimeException e) {
            log.warn("Could not set implicit wait to {}s, using driver default",
                    IMPLICIT_WAIT_SECONDS, e);
        }
    }
}
运行结果