1.前提工作。
一、引入对应的依赖jar包
https://mvnrepository.com/
可以从以上网站找到对应的依赖坐标,然后将其添加到 Maven 项目的 pom.xml 中。
<!-- Selenium -->
<!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
二、准备浏览器的驱动器。
http://chromedriver.storage.googleapis.com/index.html
若版本太新找不到则参考该网址
https://chromedriver.com/
https://chromedriver.com/download
可以在这个网址下载对应的版本的驱动器,我使用的是谷歌的,所以下载谷歌对应版本的驱动器。
谷歌版本:
驱动器:(此处需要下载与浏览器版本对应的驱动,Windows 系统统一使用 win32 版本)
下载完成后解压驱动器到自定义路径。
我的是:
E:\selenium\chromedriver.exe
2.操作案例:
①打开某个网页并让网页滚动,抓取相对应的元素
package com.xp.climb.selenium;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
/**
 * Opens a Sina news article in Chrome, scrolls the page down in several steps
 * (pausing between steps so that more content can load), then parses the
 * rendered HTML with Jsoup and prints the title and URL of each news card.
 */
public class SinaNewsRolling {

    /** Vertical scroll targets, in pixels, applied one after another. */
    private static final int[] SCROLL_OFFSETS = {5000, 10000, 30000, 50000};

    /** Pause between two scroll steps; a longer pause lets more content load. */
    private static final long SCROLL_PAUSE_MILLIS = 5000L;

    /**
     * Runs the demo.
     *
     * @param args unused
     * @throws IOException declared for compatibility with the original signature
     * @throws InterruptedException if the thread is interrupted while pausing between scrolls
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // Location of the chromedriver binary.
        System.setProperty("webdriver.chrome.driver", "E:\\selenium\\chromedriver.exe");
        // Start a Chrome browser session.
        ChromeDriver driver = new ChromeDriver();
        try {
            // Open a news article page in Chrome.
            driver.get("http://k.sina.com.cn/article_6436034945_17f9e198100100krct.html?from=home");
            if (!scrollDown(driver)) {
                // The session may be unusable after a failed script call, so do not
                // try to read the page source; the finally block closes the browser.
                return;
            }
            printNewsCards(driver.getPageSource());
        } finally {
            // Always close the browser, even when scrolling or parsing fails.
            driver.quit();
        }
    }

    /**
     * Scrolls the page to each offset in {@link #SCROLL_OFFSETS}, printing the
     * step number after each scroll and sleeping between steps.
     *
     * @return {@code true} if every scroll succeeded, {@code false} if the browser reported an error
     */
    private static boolean scrollDown(ChromeDriver driver) throws InterruptedException {
        JavascriptExecutor js = driver;
        try {
            for (int step = 0; step < SCROLL_OFFSETS.length; step++) {
                // window.scrollTo takes (x, y); x stays 0 so the page only moves vertically.
                js.executeScript("scrollTo(0, " + SCROLL_OFFSETS[step] + ")");
                System.out.println(step + 1);
                if (step < SCROLL_OFFSETS.length - 1) {
                    Thread.sleep(SCROLL_PAUSE_MILLIS);
                }
            }
            return true;
        } catch (RuntimeException e) {
            System.out.println("Error at loading the page ...");
            return false;
        }
    }

    /** Parses the HTML and prints "title TAB url" for every news card in the card list. */
    private static void printNewsCards(String html) {
        Document doc = Jsoup.parse(html);
        Elements cards = doc.select("[id=tycard_list]")
                .select("div[class=ty-card ty-card-type1 clearfix]");
        for (Element card : cards) {
            Elements titleLinks = card.select("h3[class=ty-card-tt]").select("a");
            String newsTitle = titleLinks.text();
            String newsUrl = titleLinks.attr("href");
            System.out.println(newsTitle + "\t" + newsUrl);
        }
    }
}
②打开QQ邮箱并自动登录,点击收件箱,抓取相对应的元素
package com.xp.climb.selenium;
import java.io.IOException;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
public class LoginQqEmail {
public static void main(String[] args) throws IOException, InterruptedException {
//chromedriver配置
System.setProperty("webdriver.chrome.driver", "E:\\selenium\\chromedriver.exe");
//声明使用的是谷歌浏览器
ChromeDriver driver = new ChromeDriver();
//使用谷歌浏览器打开QQ邮箱网页
driver.get("https://mail.qq.com/");
//元素定位,提交用户名以及密码
driver.manage().timeouts().implicitlyWait(5,TimeUnit.SECONDS);
//快捷登录。
// driver.switchTo().frame("login_frame").findElement(By.id("img_out_995536807")).click();
driver.switchTo().frame("login_frame").findElement(By.id("switcher_plogin")).click();
Thread.sleep(5000);
driver.findElement(By.id("u")).clear(); //清空后输入
driver.findElement(By.id("u")).sendKeys("换成你的账号");
driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);
driver.findElement(By.id("p")).clear(); //清空后输入
driver.findElement(By.id("p")).sendKeys("换成你的密码");
// //元素定位,点击登陆按钮
driver.findElement(By.id("login_button")).click();
Thread.sleep(10*1000); //休息一段时间,使得网页充分加载。注意这里非常有必要
driver.findElement(By.id("folder_1")).click();
Thread.sleep(5*1000);
Set<Cookie> cookies = driver.manage().getCookies();
//获取登陆的cookies
String cookieStr = "";
for (Cookie cookie : cookies) {
cookieStr += cookie.getName() + "=" + cookie.getValue() + "; ";
}
System.out.println(cookieStr);
//基于Jsoup,使用cookies请求个人信息页面
Response orderResp = Jsoup //添加一些header信息
.connect("https://mail.qq.com/cgi-bin/frame_html?sid=Se7bEL8SNGKkATpa&r=36b065894a56b95efa607a2b42377adb")
// .header("Host", "www.renren.com")
.header("Connection", "keep-alive")
.header("Cache-Control", "max-age=0")
.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*;q=0.8")
// .header("Origin", "http://www.renren.com")
.header("Referer", "https://mail.qq.com/")
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0")
.header("Content-Type", "application/x-www-form-urlencoded")
.header("Accept-Encoding", "gzip, deflate, br")
.header("Upgrade-Insecure-Requests", "1")
.cookie("Cookie", cookieStr)
.execute();
//解析数据
Document doc = orderResp.parse();
// System.out.println(doc);
org.jsoup.select.Elements elements = doc.select("iframe[id=mainFrame]")
.select("div[class=tf no]");
for (Element element : elements) {
if (element.text().contains("博客园登录用户名")) {
System.out.println(element.text());
}
}
driver.quit(); // 关闭浏览器
}
}
③打开网页,截取验证码图片并保存到本地,在控制台中输入对应的验证码以通过验证,然后抓取相对应的元素
package com.xp.climb.selenium;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.concurrent.TimeUnit;
import javax.imageio.ImageIO;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.Point;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxBinary;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
/**
 * Opens the Sogou Weixin anti-spider page in Chrome, saves a cropped screenshot
 * of the captcha image to disk, reads the captcha text typed on the console,
 * submits it, and prints the title and URL of each result on the next page.
 */
public class ScreenshotTest {

    private static final String TARGET_URL = "http://weixin.sogou.com/antispider/?"
            + "from=%2fweixin%3Ftype%3d2%26query"
            + "%3dcomputer+%26ie%3dutf8%26s_from%"
            + "3dinput%26_sug_%3dy%26_sug_type_%3d";

    /** Upper bound on page-load attempts so a dead site cannot spawn browsers forever. */
    private static final int MAX_LOAD_ATTEMPTS = 3;

    private static final long PAGE_LOAD_TIMEOUT_SECONDS = 10L;

    /**
     * Runs the demo.
     *
     * @param args unused
     * @throws IOException if the screenshot cannot be read or written, or no captcha can be read
     * @throws InterruptedException if the thread is interrupted while waiting for the result page
     * @throws IllegalStateException if the page cannot be loaded or the captcha image is not
     *     inside the screenshot
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // Location of the chromedriver binary.
        System.setProperty("webdriver.chrome.driver", "E:\\selenium\\chromedriver.exe");
        ChromeDriver driver = openTargetPage();
        try {
            saveCaptchaImage(driver);

            driver.findElement(By.name("c")).sendKeys(readCaptchaFromConsole());
            driver.findElement(By.id("submit")).click();
            // Give the result page time to load completely; this pause is necessary.
            Thread.sleep(10 * 1000);

            printResults(driver.getPageSource());
        } finally {
            // Always close the browser, even when a step above fails.
            driver.quit();
        }
    }

    /**
     * Starts Chrome and loads {@link #TARGET_URL}, retrying with a fresh browser
     * when loading fails. A browser whose load failed is closed before the next attempt.
     */
    private static ChromeDriver openTargetPage() {
        RuntimeException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_LOAD_ATTEMPTS; attempt++) {
            ChromeDriver driver = new ChromeDriver();
            try {
                driver.manage().timeouts()
                        .pageLoadTimeout(PAGE_LOAD_TIMEOUT_SECONDS, TimeUnit.SECONDS);
                driver.get(TARGET_URL);
                return driver;
            } catch (RuntimeException e) {
                lastFailure = e;
                driver.quit();
            }
        }
        throw new IllegalStateException(
                "Could not load " + TARGET_URL + " after " + MAX_LOAD_ATTEMPTS + " attempts",
                lastFailure);
    }

    /** Takes a page screenshot, crops it to the captcha element and writes it as a PNG file. */
    private static void saveCaptchaImage(ChromeDriver driver) throws IOException {
        WebElement captchaImage = driver.findElement(By.id("seccodeImage"));
        // Screenshot of the page as currently rendered by the browser.
        File screenshot = driver.getScreenshotAs(OutputType.FILE);
        BufferedImage page = ImageIO.read(screenshot);
        if (page == null) {
            throw new IOException("Screenshot could not be decoded: " + screenshot);
        }
        ImageIO.write(cropToElement(page, captchaImage), "png", new File("E:/selenium/test.png"));
    }

    /**
     * Returns the part of {@code page} covered by {@code element}. The rectangle is
     * clamped to the image bounds, because {@code getSubimage} throws when the
     * requested region reaches outside the image.
     */
    private static BufferedImage cropToElement(BufferedImage page, WebElement element) {
        Point origin = element.getLocation();
        int left = Math.max(0, origin.getX());
        int top = Math.max(0, origin.getY());
        int width = Math.min(element.getSize().getWidth(), page.getWidth() - left);
        int height = Math.min(element.getSize().getHeight(), page.getHeight() - top);
        if (width <= 0 || height <= 0) {
            throw new IllegalStateException("Captcha element at " + origin
                    + " lies outside the " + page.getWidth() + "x" + page.getHeight()
                    + " screenshot");
        }
        return page.getSubimage(left, top, width, height);
    }

    /** Prompts on the console and returns the captcha text the user typed. */
    private static String readCaptchaFromConsole() throws IOException {
        System.out.println("请输入验证码:");
        // System.in is deliberately left open.
        BufferedReader console = new BufferedReader(new InputStreamReader(System.in));
        String captcha = console.readLine();
        if (captcha == null) {
            // End of input: there is nothing to type into the form.
            throw new IOException("No captcha entered: standard input is closed");
        }
        return captcha;
    }

    /** Parses the HTML and prints "title TAB url" for every result box. */
    private static void printResults(String html) {
        Document doc = Jsoup.parse(html);
        Elements boxes = doc.select("div[class=txt-box]");
        for (Element box : boxes) {
            Elements titleLinks = box.select("h3").select("a");
            String newsTitle = titleLinks.text();
            String newsUrl = titleLinks.attr("href");
            System.out.println(newsTitle + "\t" + newsUrl);
        }
    }
}