利用selenium访问某网站实现模拟人机交互爬虫

   高考刚结束,堂侄面临志愿填报的问题。之前用jsoup模拟抓取过不少网站,但在抓取高考信息网(gkcx.eol.cn)时遇到了问题:部分接口返回的内容是密文。查看网站的js,应该是用到了PBKDF2,而且整个加密流程挺复杂,直接解密这条路走进了死胡同。

   google查询类似问题,发现了selenium可以用来实现浏览器自动化功能:模拟用户点击,操作浏览器的组件实现翻页等。

   https://gkcx.eol.cn/linespecialty?province=&zytype=&schoolyear=2018&schoolpc=&luqutype=理科

  

   模拟此页面的访问,发现问题如下:

   1.部分专业默认展示的条目数不全,需要再次点击专业门类下对应的专业(根据cssSelector获取该专业对应的dom元素并点击,此处添加了超时等待,等页面刷新后重新获取dom数据)

   2.需要点击末页获取总页数(模拟点击);

   3.非末页的数据需要点击下一页查看分页数据

   4.页面内容通过ajax异步加载,需要设置超时等待时间(等待新的dom加载完成后再读取)

       首先定义一个wait,用于带超时的显式等待(这里最多等待15秒)

    WebDriverWait wait = new WebDriverWait(webDriver, 15);

          发生dom变化时

         wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector(".fypages")));//cssSelector的语法与jquery选择器基本一致,此处以分页容器.fypages为例

   5.数据存储方式(定为excel,采用poi工具进行存储)

windows版:

webdriver下载 chromedriver.exe

package com.wyg.gkspider.spider;

import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.safari.SafariDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;


import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.TimeUnit;

/**
 * Selenium-driven scraper for the per-major admission score table on gkcx.eol.cn.
 *
 * <p>The scraper opens the "linespecialty" page in Chrome, picks a city, opens the
 * result table and then walks the pager, turning every table row into a
 * {@code Map<String, String>}. {@link #main(String[])} exports the collected rows
 * to an Excel file through the project's {@code ExcelUtil}.
 *
 * <p>NOTE(review): the code mixes an implicit wait with explicit {@link WebDriverWait}s,
 * exactly as the original did. Selenium's documentation advises against mixing the two;
 * this is left unchanged to avoid altering timing behaviour against the live site.
 */
public class WebDriverTest {

    /** Location of the chromedriver binary, passed to Selenium via a system property. */
    private static final String CHROME_DRIVER_PATH = "C:\\Users\\wuyinggui\\Downloads\\chromedriver.exe";

    /** How many fresh browser sessions are tried before giving up on page setup. */
    private static final int MAX_ATTEMPTS = 3;

    /** Timeout, in seconds, for both the explicit wait and the initial implicit wait. */
    private static final long WAIT_SECONDS = 60;

    /** A page showing fewer rows than this is treated as the last page. */
    private static final int PAGE_SIZE = 20;

    /** Number of {@code <td>} cells read from every data row. */
    private static final int COLUMN_COUNT = 7;

    /** Text of the pager item that advances to the next page. */
    private static final String NEXT_PAGE_LABEL = "下一页";

    /**
     * Scrapes all score rows for the given filters and appends them to {@code result}.
     *
     * <p>Page setup (navigation, city selection, opening the table) is retried in a new
     * browser session up to {@link #MAX_ATTEMPTS} times. Once paging has started, any
     * failure ends the scrape and the rows gathered so far are returned (best effort).
     *
     * @param result     list to append to; a new list is created when {@code null}
     * @param zytype     value for the {@code zytype} query parameter (URL-encoded here)
     * @param schoolyear value for the {@code schoolyear} query parameter; also stored
     *                   under the {@code "year"} key of every row
     * @param luqutype   value for the {@code luqutype} query parameter (URL-encoded here)
     * @param city       exact text of the city entry to click in the city picker
     * @return the list that received the rows (never {@code null})
     */
    public static List<Map> getSpecialScoreInfoList(List<Map> result,String zytype,String schoolyear,String luqutype,String city){
        if (result == null) {
            result = new LinkedList<>();
        }
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);

        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            WebDriver webDriver = new ChromeDriver();
            try {
                webDriver.manage().window().maximize();
                WebDriverWait wait = new WebDriverWait(webDriver, WAIT_SECONDS);
                // Element lookups must give the browser time to finish loading.
                webDriver.manage().timeouts().implicitlyWait(WAIT_SECONDS, TimeUnit.SECONDS);
                webDriver.get(buildUrl(zytype, schoolyear, luqutype));

                selectCity(webDriver, wait, city);
                openResultTable(webDriver, wait);
                collectAllPages(webDriver, wait, schoolyear, result);
                return result;
            } catch (Exception e) {
                // Setup failed before any row was collected; retry in a fresh session.
                e.printStackTrace();
            } finally {
                // Exactly one shutdown per session, and quit() rather than close() so the
                // chromedriver process does not outlive the attempt.
                quitQuietly(webDriver);
            }
        }
        return result;
    }

    /** Builds the page URL, encoding the free-text parameters explicitly as UTF-8. */
    private static String buildUrl(String zytype, String schoolyear, String luqutype)
            throws UnsupportedEncodingException {
        return "https://gkcx.eol.cn/linespecialty?province=&zytype=" + URLEncoder.encode(zytype, "UTF-8")
                + "&schoolyear=" + schoolyear
                + "&schoolpc=&luqutype=" + URLEncoder.encode(luqutype, "UTF-8")
                + "&argschtype=&schoolflag=&recomschprop=";
    }

    /** Opens the city picker and clicks the entry whose text equals {@code city}, if any. */
    private static void selectCity(WebDriver webDriver, WebDriverWait wait, String city) {
        By pickerToggle = By.cssSelector("p[class='changecity']");
        wait.until(ExpectedConditions.presenceOfElementLocated(pickerToggle));
        webDriver.findElement(pickerToggle).click();

        By cityEntries = By.cssSelector(".content-city");
        wait.until(ExpectedConditions.presenceOfAllElementsLocatedBy(cityEntries));
        for (WebElement entry : webDriver.findElements(cityEntries)) {
            if (city.equals(entry.getText())) {
                scrollIntoView(webDriver, entry);
                entry.click();
                break;
            }
        }
    }

    /** Scrolls to and clicks the {@code #openCondation} control that reveals the table. */
    private static void openResultTable(WebDriver webDriver, WebDriverWait wait) {
        By openButton = By.cssSelector("#openCondation");
        wait.until(ExpectedConditions.presenceOfElementLocated(openButton));
        WebElement button = webDriver.findElement(openButton);
        scrollIntoView(webDriver, button);
        button.click();
    }

    /**
     * Reads the current page and keeps clicking "next" until a short page is seen, the
     * "next" item is missing, or an error occurs. Rows are appended to {@code result}.
     */
    private static void collectAllPages(WebDriver webDriver, WebDriverWait wait,
                                        String schoolyear, List<Map> result) {
        By pager = By.cssSelector(".fypages");
        By rowLocator = By.cssSelector("tbody tr");
        wait.until(ExpectedConditions.presenceOfElementLocated(pager));

        int current = 1;
        while (true) {
            try {
                System.out.println("current page is :" + current);
                wait.until(ExpectedConditions.presenceOfElementLocated(rowLocator));
                List<WebElement> rows = webDriver.findElements(rowLocator);

                for (WebElement row : rows) {
                    System.out.println(row.getText());
                    List<WebElement> cells = row.findElements(By.tagName("td"));
                    if (cells.size() < COLUMN_COUNT) {
                        // Not a data row (e.g. a placeholder); skip it instead of
                        // aborting the whole scrape with an IndexOutOfBoundsException.
                        continue;
                    }
                    result.add(toRecord(cells, schoolyear));
                }

                wait.until(ExpectedConditions.presenceOfElementLocated(pager));
                if (rows.size() < PAGE_SIZE) {
                    break;
                }
                if (!clickNextPage(webDriver)) {
                    // Without this guard the same page would be re-read forever.
                    break;
                }

                // This only shortens the element-lookup timeout for later pages; it does
                // not pause. NOTE(review): nothing here waits for the table to refresh
                // after the click, so a slow ajax response could yield stale rows.
                webDriver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
                current++;
            } catch (Exception e) {
                // Best effort: keep what was collected so far and stop paging.
                e.printStackTrace();
                break;
            }
        }
    }

    /** Maps the first seven cells of a table row to the keys expected by the Excel export. */
    private static Map<String, String> toRecord(List<WebElement> cells, String schoolyear) {
        Map<String, String> record = new HashMap<>();
        record.put("schoolName", cells.get(0).getText());
        record.put("specialName", cells.get(1).getText());
        record.put("address", cells.get(2).getText());
        record.put("studentType", cells.get(3).getText());
        record.put("year", schoolyear);
        record.put("batchName", cells.get(4).getText());
        record.put("average", cells.get(5).getText());
        record.put("minScore", cells.get(6).getText());
        return record;
    }

    /** Clicks the pager's "next page" item; returns {@code false} when there is none. */
    private static boolean clickNextPage(WebDriver webDriver) {
        WebElement pager = webDriver.findElement(By.className("fypages"));
        for (WebElement item : pager.findElements(By.tagName("li"))) {
            if (NEXT_PAGE_LABEL.equals(item.getText())) {
                item.click();
                return true;
            }
        }
        return false;
    }

    /** Scrolls the element into the viewport so that a subsequent click can reach it. */
    private static void scrollIntoView(WebDriver webDriver, WebElement element) {
        ((JavascriptExecutor) webDriver).executeScript("arguments[0].scrollIntoView()", element);
    }

    /** Ends the browser session; a failure to shut down is reported but not propagated. */
    private static void quitQuietly(WebDriver webDriver) {
        try {
            webDriver.quit();
        } catch (RuntimeException e) {
            // A shutdown failure must not mask the scrape result or the original error.
            e.printStackTrace();
        }
    }

    /**
     * Scrapes the 2017 science-track ("理科") rows for the city "南阳市" and exports them
     * to {@code 2017年.xls}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String year = "2017";
            List<Map> specialScoreInfoList = getSpecialScoreInfoList(null, "", year, "理科","南阳市");
            ExcelUtil.exportToExcel(year+"年.xls", year,  specialScoreInfoList,
                    Arrays.asList("schoolName","specialName","address","studentType","year","batchName","average","minScore"),
                    Arrays.asList("学校名称","专业名称","招生地址","考生类别","年份","录取批次","平均分","最低分"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

   

  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值