爬虫–selenium
一、selenium简介
selenium原本用于网页自动化测试;由于它直接操作真实的浏览器,因此也可用于网页抓取,且不易被查封。
二、准备
- 下载Block-image_v1.0.crx ,用于禁止图片加载,这样可以加快访问速度(网上搜下即可下载);
- 下载chromedriver.exe , 即chrome驱动器;
- 下载chrome浏览器;
三、开发步骤
- 安装chrome浏览器;
- 运行chromedriver.exe驱动器;
- 添加maven依赖
<!-- Selenium dependency (version 3.0.1) required by the code example in this document. -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-server</artifactId>
<version>3.0.1</version>
</dependency>
- 代码实例
/**
 * Minimal Selenium crawler example: connects to a locally running chromedriver,
 * opens a news page in Chrome and prints the text of its headline link.
 */
public class ChromeCrawlerMain {

    /** Page that is opened by the crawler. */
    private static final String TARGET_URL = "http://www.ifeng.com/";

    /** Address of the running chromedriver process (9515 is chromedriver's default port). */
    private static final String DRIVER_URL = "http://localhost:9515";

    /** Chrome extension that blocks image loading to speed up page loads. */
    private static final String BLOCK_IMAGE_EXTENSION = "C:\\book\\Block-image_v1.0.crx";

    /** User-Agent header value the browser should send. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.32 Safari/537.36";

    /** CSS selector of the headline link on the target page. */
    private static final String HEADLINE_SELECTOR =
            "#headLineDefault > ul > ul:nth-child(1) > li.topNews > h1 > a";

    /** Timeout, in seconds, applied to script execution, element lookup and page load. */
    private static final long TIMEOUT_SECONDS = 10;

    /**
     * Program entry point; runs the crawler once.
     *
     * @param args unused
     * @throws Exception if the driver URL is malformed or the crawl fails
     */
    public static void main(String[] args) throws Exception {
        chromeCrawler();
    }

    /**
     * Opens {@link #TARGET_URL} through the remote chromedriver and prints the headline text
     * to standard output. The browser session is always closed before returning.
     *
     * @throws Exception if the driver URL is malformed, the session cannot be created,
     *                   the page times out, or the headline element is not found
     */
    static void chromeCrawler() throws Exception {
        ChromeOptions ops = new ChromeOptions();
        ops.addExtensions(new File(BLOCK_IMAGE_EXTENSION));
        // The User-Agent must be passed as a Chrome argument. Putting it into the
        // "browserName" capability would not change the header and would corrupt
        // the capability that identifies the browser as Chrome.
        ops.addArguments("--user-agent=" + USER_AGENT);

        DesiredCapabilities dc = DesiredCapabilities.chrome();
        dc.setCapability(ChromeOptions.CAPABILITY, ops);

        WebDriver driver = new RemoteWebDriver(new URL(DRIVER_URL), dc);
        try {
            driver.manage().timeouts().setScriptTimeout(TIMEOUT_SECONDS, TimeUnit.SECONDS);
            driver.manage().timeouts().implicitlyWait(TIMEOUT_SECONDS, TimeUnit.SECONDS);
            driver.manage().timeouts().pageLoadTimeout(TIMEOUT_SECONDS, TimeUnit.SECONDS);

            driver.get(TARGET_URL);

            // driver.getPageSource() is available here if the full HTML is needed instead.
            WebElement headlineEle = driver.findElement(By.cssSelector(HEADLINE_SELECTOR));
            String headline = headlineEle.getText();
            System.out.println(headline);
        } finally {
            // Always end the session; otherwise the browser window and the
            // chromedriver session stay alive after the program finishes or fails.
            driver.quit();
        }
    }
}