高考刚结束,堂侄面临志愿填报的问题。之前用jsoup模拟抓取过不少网站,但是抓取高考信息网(gkcx.eol.cn)的时候遇到了问题:部分接口返回的内容为密文。查看网站js,应该是采用了PBKDF2加密方式,而且加密逻辑也挺复杂,继续用jsoup直接抓取接口走进了死胡同。
google查询类似问题,发现了selenium可以用来实现浏览器自动化功能:模拟用户点击,操作浏览器的组件实现翻页等。
https://gkcx.eol.cn/linespecialty?province=&zytype=&schoolyear=2018&schoolpc=&luqutype=理科
模拟此页面的访问,发现问题如下:
1.部分专业默认展示的条目数不全,需要再次点击专业门类下对应的专业(根据cssSelector获取该专业对应的dom元素并点击,此处添加了超时等待,待页面刷新后重新获取dom数据)
2.需要点击末页获取总页数(模拟点击);
3.非末页的数据需要点击下一页查看分页数据
4.页面内容通过ajax异步加载,需要设置超时等待时间(等待新的dom加载完成后再读取数据)
首先定义一个wait,模拟超时等待
WebDriverWait wait = new WebDriverWait(webDriver, 15);
发生dom变化时
wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#openCondation")));//此处以#openCondation为例;cssSelector使用标准CSS选择器语法,与jquery选择器的常用写法基本一致(jquery扩展的选择器如:eq、:contains不可用)
5.数据存储方式(定为excel,采用poi工具进行存储)
windows版:
webdriver下载 chromedriver.exe
package com.wyg.gkspider.spider;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.safari.SafariDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * Selenium-driven scraper for the per-specialty admission score table on gkcx.eol.cn.
 *
 * <p>The scraper opens the "linespecialty" page in Chrome, selects a city, expands the
 * filter panel, then walks the paginated result table and turns every table row into a
 * {@code Map<String, String>}. {@link #main(String[])} exports the collected rows to an
 * Excel file through {@code ExcelUtil}.
 */
public class WebDriverTest {

    /** Fallback chromedriver location, used only when "webdriver.chrome.driver" is not already set. */
    private static final String DEFAULT_CHROME_DRIVER_PATH =
            "C:\\Users\\wuyinggui\\Downloads\\chromedriver.exe";

    /** Upper bound on full scrape attempts; each attempt starts a fresh browser. */
    private static final int MAX_ATTEMPTS = 3;

    /** A result page holding fewer rows than this is treated as the last page. */
    private static final int PAGE_SIZE = 20;

    /** Number of {@code <td>} cells read from each result row. */
    private static final int COLUMN_COUNT = 7;

    /** Timeout, in seconds, for the explicit waits on page elements. */
    private static final long EXPLICIT_WAIT_SECONDS = 60;

    /** Implicit wait, in seconds, applied once per browser session. */
    private static final long IMPLICIT_WAIT_SECONDS = 10;

    /**
     * Scrapes the specialty score table for the given filters and city.
     *
     * <p>Each attempt runs in its own Chrome session, which is always terminated with
     * {@code quit()}. If an attempt fails before the result table is reached, the scrape is
     * retried from scratch, at most {@value #MAX_ATTEMPTS} times in total. Rows are appended
     * to {@code result} only for an attempt that reached the table, so a retry never
     * duplicates rows.
     *
     * @param result     list to append the rows to; a new list is created when {@code null}
     * @param zytype     value of the {@code zytype} query parameter (URL-encoded as UTF-8)
     * @param schoolyear value of the {@code schoolyear} query parameter; also stored in every
     *                   row under the key {@code "year"}
     * @param luqutype   value of the {@code luqutype} query parameter (URL-encoded as UTF-8)
     * @param city       visible text of the city entry to select in the city chooser
     * @return {@code result} (or the newly created list) with one map per scraped table row;
     *         unchanged if every attempt failed
     * @throws NullPointerException if {@code zytype}, {@code luqutype} or {@code city} is null
     */
    public static List<Map> getSpecialScoreInfoList(List<Map> result, String zytype, String schoolyear,
                                                    String luqutype, String city) {
        // Fail fast on arguments that would otherwise only blow up after a browser was launched.
        Objects.requireNonNull(zytype, "zytype");
        Objects.requireNonNull(luqutype, "luqutype");
        Objects.requireNonNull(city, "city");
        if (result == null) {
            result = new LinkedList<>();
        }
        // Respect a driver path configured by the caller (-Dwebdriver.chrome.driver=...).
        if (System.getProperty("webdriver.chrome.driver") == null) {
            System.setProperty("webdriver.chrome.driver", DEFAULT_CHROME_DRIVER_PATH);
        }

        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            List<Map> rows = new LinkedList<>();
            WebDriver webDriver = new ChromeDriver();
            try {
                scrapeOnce(webDriver, rows, zytype, schoolyear, luqutype, city);
                result.addAll(rows);
                return result;
            } catch (Exception e) {
                System.err.println("scrape attempt " + attempt + " of " + MAX_ATTEMPTS + " failed");
                e.printStackTrace();
            } finally {
                quitQuietly(webDriver);
            }
        }
        return result;
    }

    /** Builds the result-page URL, encoding the non-ASCII filter values as UTF-8. */
    private static String buildUrl(String zytype, String schoolyear, String luqutype)
            throws UnsupportedEncodingException {
        return "https://gkcx.eol.cn/linespecialty?province=&zytype=" + URLEncoder.encode(zytype, "UTF-8")
                + "&schoolyear=" + schoolyear
                + "&schoolpc=&luqutype=" + URLEncoder.encode(luqutype, "UTF-8")
                + "&argschtype=&schoolflag=&recomschprop=";
    }

    /** Runs one complete scrape in the given browser session, appending every row to {@code rows}. */
    private static void scrapeOnce(WebDriver webDriver, List<Map> rows, String zytype, String schoolyear,
                                   String luqutype, String city) throws UnsupportedEncodingException {
        webDriver.manage().window().maximize();
        WebDriverWait wait = new WebDriverWait(webDriver, EXPLICIT_WAIT_SECONDS);
        // Staying in sync with the browser matters: lookups must give the page time to render.
        // The implicit wait is set once here instead of being re-set after every page turn.
        webDriver.manage().timeouts().implicitlyWait(IMPLICIT_WAIT_SECONDS, TimeUnit.SECONDS);
        webDriver.get(buildUrl(zytype, schoolyear, luqutype));

        selectCity(webDriver, wait, city);

        // Expand the filter panel; scroll first so the click lands on a visible element.
        WebElement openCondation =
                wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#openCondation")));
        ((JavascriptExecutor) webDriver).executeScript("arguments[0].scrollIntoView()", openCondation);
        openCondation.click();

        // The pager must exist before paging starts; a timeout here fails the whole attempt.
        wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector(".fypages")));
        collectAllPages(webDriver, wait, rows, schoolyear);
    }

    /** Opens the city chooser and clicks the entry whose text equals {@code city}, if there is one. */
    private static void selectCity(WebDriver webDriver, WebDriverWait wait, String city) {
        WebElement changeCity =
                wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("p[class='changecity']")));
        changeCity.click();
        List<WebElement> cityEntries =
                wait.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector(".content-city")));
        for (WebElement cityEntry : cityEntries) {
            if (city.equals(cityEntry.getText())) {
                ((JavascriptExecutor) webDriver).executeScript("arguments[0].scrollIntoView()", cityEntry);
                cityEntry.click();
                return;
            }
        }
        // No matching entry: scraping continues with whatever city the page currently shows.
        System.err.println("city not found in chooser: " + city);
    }

    /**
     * Reads the result table page by page until a short page, a missing next-page link, or an
     * error ends the walk. An error while paging keeps the rows collected so far.
     */
    private static void collectAllPages(WebDriver webDriver, WebDriverWait wait, List<Map> rows,
                                        String schoolyear) {
        int current = 1;
        while (true) {
            try {
                System.out.println("current page is :" + current);
                wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("tbody tr")));
                // NOTE(review): right after a page turn the previous page's rows may still be in the
                // DOM, so this can read stale rows — confirm against the live page.
                List<WebElement> tableRows = webDriver.findElements(By.cssSelector("tbody tr"));
                for (WebElement tableRow : tableRows) {
                    System.out.println(tableRow.getText());
                    List<WebElement> cells = tableRow.findElements(By.tagName("td"));
                    if (cells.size() < COLUMN_COUNT) {
                        // Not a data row (too few cells); skip it instead of aborting the scrape.
                        continue;
                    }
                    Map<String, String> row = new HashMap<>();
                    row.put("schoolName", cells.get(0).getText());
                    row.put("specialName", cells.get(1).getText());
                    row.put("address", cells.get(2).getText());
                    row.put("studentType", cells.get(3).getText());
                    row.put("year", schoolyear);
                    row.put("batchName", cells.get(4).getText());
                    row.put("average", cells.get(5).getText());
                    row.put("minScore", cells.get(6).getText());
                    rows.add(row);
                }
                wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector(".fypages")));
                if (tableRows.size() < PAGE_SIZE) {
                    break;
                }
                if (!clickNextPage(webDriver)) {
                    // Without a next-page link the loop would re-read this page forever.
                    break;
                }
                current++;
            } catch (Exception e) {
                e.printStackTrace();
                break;
            }
        }
    }

    /** Clicks the pager item labelled "下一页"; returns {@code false} when no such item exists. */
    private static boolean clickNextPage(WebDriver webDriver) {
        WebElement pager = webDriver.findElement(By.className("fypages"));
        for (WebElement pagerItem : pager.findElements(By.tagName("li"))) {
            if ("下一页".equals(pagerItem.getText())) {
                pagerItem.click();
                return true;
            }
        }
        return false;
    }

    /** Ends the browser session and driver process without letting a shutdown error escape. */
    private static void quitQuietly(WebDriver webDriver) {
        try {
            webDriver.quit();
        } catch (RuntimeException e) {
            // The session may already be gone after a failure; report it but do not mask the result.
            e.printStackTrace();
        }
    }

    /**
     * Scrapes the 2017 science-track ("理科") scores for 南阳市 and exports them to
     * {@code 2017年.xls} via {@code ExcelUtil.exportToExcel}.
     */
    public static void main(String[] args) {
        try {
            String year = "2017";
            List<Map> specialScoreInfoList = getSpecialScoreInfoList(null, "", year, "理科", "南阳市");
            ExcelUtil.exportToExcel(year + "年.xls", year, specialScoreInfoList,
                    Arrays.asList("schoolName", "specialName", "address", "studentType", "year", "batchName", "average", "minScore"),
                    Arrays.asList("学校名称", "专业名称", "招生地址", "考生类别", "年份", "录取批次", "平均分", "最低分"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}