JAVA爬虫爬取携程酒店数据selenium实现

  在爬取携程的时候碰到很多的壁垒,接下来分析所有过程

  1.根据以往经验最初想到用jsoup去解析每个HTML元素,然后拿到酒店数据,然后发现解析HTML根本拿不到id为hotel_list的div,所以也就无法通过静态的HTML去获取数据

  可以看到标签里面根本就没有数据,因为这里的数据是动态加载的,所以无法从静态HTML中拿取,接下来采用动态方式拿取

 

2.第一种方法就不行, 于是疯狂查博文,找到了携程动态数据的接口,在AjaxHotelList.aspx里我找到了酒店,里面有HTML的代码拼接,数据都在这里了,怎么拿取呢?

 

 模拟post请求,然后拿数据

 在发送请求的时候注意下图红框中的信息。请求头必须要加上来源信息(Referer)和浏览器信息(User-Agent)。发送的参数就是Form Data里的数据,可以只传部分数据。

 

请求发送后,很遗憾还是没有拿到数据,可能是一些加密的处理。

3.虽然模拟请求拿不到数据,但是大致方向还是找到了,还剩下一种办法,就是用selenium自动化测试框架模拟浏览器,从浏览器页面中拿取数据。(由于能力有限,并没有破解汉字识别验证码,这里用人工验证代替)

准备:

        下载 selenium

        下载Chromedriver(这里需要与自己的Chrome浏览器版本相对应,我下载的是当时最新的版本,后面放出链接)

 

上代码

import com.nf.xiecheng.entyty.Hotel;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.awt.*;
import java.awt.event.KeyEvent;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Drives a Chrome browser through Selenium to log in to hotels.ctrip.com,
 * run a hotel search and collect the hotels shown on the result page into
 * {@link #hotelList}.
 *
 * <p>The slider captcha on the login page is dragged with {@link java.awt.Robot};
 * the character-recognition captcha that may follow is left to a human, which
 * is what the 10 second pause in {@link #main(String[])} is for.
 */
public class SelectFlight {
    // Screen resolution of the machine running the browser (used for the slider captcha).
    private static final int MAX_X = 2560;
    private static final int MAX_Y = 1408;
    // The pointer is moved to (MAX_X - TARGET_X, MAX_Y - TARGET_Y), i.e. (733, 477),
    // which is where the slider handle sits once the WebDriver-launched browser is open.
    private static final int TARGET_X = MAX_X - 733;
    private static final int TARGET_Y = MAX_Y - 477;
    // Accumulates every hotel scraped by getHotelMassge.
    private static List<Hotel> hotelList = new ArrayList<Hotel>();

    /**
     * Runs the whole flow: open the site, log in, search, page forward,
     * scrape the visible hotel list and print it.
     *
     * @param args unused
     * @throws InterruptedException if one of the fixed waits is interrupted
     */
    public static void main(String args[]) throws InterruptedException {
        SelectFlight s = new SelectFlight();
        // Location of the chromedriver binary; adjust to wherever it was unpacked.
        System.setProperty("webdriver.chrome.driver","D:\\myporject\\IDEworkspace\\chromedriver.exe");
        WebDriver webDriver = new ChromeDriver();
        try {
            webDriver.get("https://hotels.ctrip.com");
            Thread.sleep(1000);
            // Go to the login page. By.className works on every Selenium version,
            // unlike the ChromeDriver-only findElementByClassName helper.
            WebElement login = webDriver.findElement(By.className("person-text"));
            login.click();
            // Fill in the login form.
            Thread.sleep(1000);
            // NOTE(review): credentials are hard-coded in source; they should be
            // supplied from configuration or the environment instead.
            WebElement phone = webDriver.findElement(By.id("nloginname"));
            phone.sendKeys("13647610831");
            WebElement passw = webDriver.findElement(By.id("npwd"));
            passw.sendKeys("a96968426");
            // Drag the slider captcha.
            s.Robotcheck();
            // Pause 10 seconds so a human can solve the character-recognition captcha.
            Thread.sleep(10000);
            // Submit the login form.
            WebElement nsubmit = webDriver.findElement(By.id("nsubmit"));
            nsubmit.click();
            // Start the hotel search.
            Thread.sleep(2000);
            WebElement btnSearch = webDriver.findElement(By.id("btnSearch"));
            btnSearch.click();
            Thread.sleep(5000);
            Thread.sleep(1000);
            // NOTE(review): "next page" is clicked twice before anything is scraped
            // (once here, once in getNextPage), so earlier result pages are never
            // collected. Kept as in the original; confirm whether this is intended.
            WebElement nextPage = webDriver.findElement(By.id("downHerf"));
            nextPage.click();
            s.getNextPage(webDriver, "downHerf");
            // Scrape the hotels on the page now being displayed.
            s.getHotelMassge(webDriver);
            for (Hotel ph : hotelList) {
                System.out.println(ph.toString());
            }
        } finally {
            // quit() closes every window and ends the driver process, even when a step above failed.
            webDriver.quit();
        }
    }

    /**
     * Clicks the element with the given id to advance to the next result page.
     *
     * @param webDriver driver attached to the result page
     * @param nextPage  id of the "next page" element
     * @return the element that was clicked
     */
    public WebElement getNextPage(WebDriver webDriver,String nextPage){
        WebElement nextWeb = webDriver.findElement(By.id(nextPage));
        nextWeb.click();
        return nextWeb;
    }

    /**
     * Reads every {@code hotel_item} under the {@code hotel_list} element and
     * appends one {@link Hotel} per item to {@link #hotelList}.
     *
     * @param webDriver driver attached to a hotel result page
     */
    public void getHotelMassge(WebDriver webDriver){
        WebElement hotelListElement = webDriver.findElement(By.id("hotel_list"));
        List<WebElement> hotelItems = hotelListElement.findElements(By.className("hotel_item"));
        System.err.println(hotelItems.size());
        for (WebElement hotel : hotelItems) {
            Hotel entry = new Hotel();
            // Hotel name: the title of the link inside the name element.
            WebElement nameElement = hotel.findElement(By.className("hotel_name"));
            WebElement nameLink = nameElement.findElement(By.tagName("a"));
            entry.setName(nameLink.getAttribute("title"));
            // Hotel id.
            entry.setId(nameElement.getAttribute("data-id"));
            // Address and area: the area is the comma-separated text of the address links.
            WebElement addressElement = hotel.findElement(By.className("hotel_item_htladdress"));
            List<WebElement> areaLinks = addressElement.findElements(By.tagName("a"));
            StringBuilder area = new StringBuilder();
            for (int i = 0; i < areaLinks.size(); i++) {
                // Separator goes between entries only; the original also repeated the last entry.
                if (i > 0) {
                    area.append(",");
                }
                area.append(areaLinks.get(i).getText());
            }
            entry.setArea(area.toString());
            entry.setAddress(addressElement.getText());
            // Customer rating: the title of the link inside the judge box.
            WebElement judgeBox = hotel.findElement(By.className("hotelitem_judge_box"));
            WebElement judgeLink = judgeBox.findElement(By.tagName("a"));
            entry.setEvaluate(judgeLink.getAttribute("title"));
            // Lowest price.
            WebElement priceBox = hotel.findElement(By.className("hotel_price_icon"));
            WebElement lowPrice = priceBox.findElement(By.className("J_price_lowList"));
            entry.setLowprice(Double.parseDouble(lowPrice.getText()));
            hotelList.add(entry);
        }
    }

    /**
     * Drags the slider captcha with the real mouse pointer: moves step by step
     * to (MAX_X - TARGET_X, MAX_Y - TARGET_Y), presses the left button, drags
     * 300 pixels to the right and releases.
     *
     * <p>If the thread is interrupted, the interrupt flag is restored and the
     * button is still released so the desktop is not left in a dragging state.
     */
    public  void Robotcheck(){
        final int sliderX = MAX_X - TARGET_X;
        final int sliderY = MAX_Y - TARGET_Y;
        Robot robot = null;
        boolean pressed = false;
        try {
            robot = new Robot();
            Point mousepoint = MouseInfo.getPointerInfo().getLocation();
            System.out.println(mousepoint.x+"  "+mousepoint.y);
            // Slide horizontally along the current row until the slider column is reached.
            for (int x = 0; x <= sliderX; x++) {
                robot.mouseMove(x, mousepoint.y);
                Thread.sleep(1);
            }
            // Then slide vertically down that column to the slider row.
            for (int y = 0; y <= sliderY; y++) {
                robot.mouseMove(sliderX, y);
                Thread.sleep(1);
            }
            robot.mouseMove(sliderX, sliderY);
            // BUTTON1_DOWN_MASK is the mask Robot documents for the left button.
            robot.mousePress(java.awt.event.InputEvent.BUTTON1_DOWN_MASK);
            pressed = true;
            for (int i = 0; i < 300; i++) {
                Thread.sleep(5);
                robot.mouseMove(sliderX + i, sliderY);
            }
        } catch (AWTException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the flag so callers can see the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            if (pressed) {
                robot.mouseRelease(java.awt.event.InputEvent.BUTTON1_DOWN_MASK);
            }
        }
    }
}
import com.nf.xiecheng.entyty.Hotel;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.awt.*;
import java.awt.event.KeyEvent;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class SelectFlight {
    private final int MAX_X=2560;//用于滑块验证,电脑分辨率
    private final int MAX_Y=1408;
    private final int TARGET_X=MAX_X-733;//用于滑块验证,webdriver启动后,游览器中滑块验证的坐标位置
    private final int TARGET_Y=MAX_Y-477;
    private static List<Hotel> hotelList = new ArrayList<Hotel>();
    public static  void main(String args[]) throws InterruptedException {
        SelectFlight s = new SelectFlight();
        System.setProperty("webdriver.chrome.driver","D:\\myporject\\IDEworkspace\\chromedriver.exe");//chromedriver驱动地址,自己所放入的目录
        WebDriver webDriver = new ChromeDriver();
        webDriver.get("https://hotels.ctrip.com");
        Thread.sleep(1000);
        //跳转登陆页面
        WebElement login = ((ChromeDriver) webDriver).findElementByClassName("person-text");
        login.click();
        //登陆信息
        Thread.sleep(1000);
        WebElement phone = webDriver.findElement(By.id("nloginname"));
        phone.sendKeys("13647610831");
        WebElement passw = webDriver.findElement(By.id("npwd"));
        passw.sendKeys("a96968426");
        //滑块验证
        s.Robotcheck();
        //睡眠10秒,用于人工验证汉字识别
        Thread.sleep(10000);
        //点击登陆
        WebElement nsubmit = webDriver.findElement(By.id("nsubmit"));
        nsubmit.click();
        //点击酒店搜索
        Thread.sleep(2000);
        WebElement btnSearch = webDriver.findElement(By.id("btnSearch"));
        btnSearch.click();
        Thread.sleep(5000);
        //进入主页
        String pageSource = webDriver.getPageSource();
        Thread.sleep(1000);
        WebElement nextPage = webDriver.findElement(By.id("downHerf"));
        nextPage.click();
        //下一页
        WebElement downHerfa = s.getNextPage(webDriver, "downHerf");
        s.getHotelMassge(webDriver);//获取酒店信息
        for (Hotel ph:hotelList
             ) {
            System.out.println(ph.toString());
        }
        webDriver.close();
        webDriver.quit();

    }

    //获取下一页
    public WebElement getNextPage(WebDriver webDriver,String nextPage){
        WebElement nextWeb = webDriver.findElement(By.id(nextPage));
        nextWeb.click();
        return nextWeb;
    }
    //填入酒店信息
    public void getHotelMassge(WebDriver webDriver){
        WebElement hotel_list = webDriver.findElement(By.id("hotel_list"));
        List<WebElement> hotel_item = hotel_list.findElements(By.className("hotel_item"));
        System.err.println(hotel_item.size());
        Iterator<WebElement> it = hotel_item.iterator();
        while (it.hasNext()){
            Hotel entry = new Hotel();
            WebElement hotel = it.next();
            //酒店名称
            WebElement hotel_name = hotel.findElement(By.className("hotel_name"));
            WebElement a = hotel_name.findElement(By.tagName("a"));
            entry.setName(a.getAttribute("title"));
            //id
            String id = hotel_name.getAttribute("data-id");
            entry.setId(id);
            //酒店地址
            WebElement hotel_item_htladdress = hotel.findElement(By.className("hotel_item_htladdress"));
            List<WebElement> a_area = hotel_item_htladdress.findElements(By.tagName("a"));
            StringBuffer areabuffer = new StringBuffer();
            for(int i = 0; i<a_area.size();i++){
                areabuffer.append(a_area.get(i).getText()+",");
                if(i==a_area.size()-1){
                    areabuffer.append(a_area.get(i).getText());
                }
            }
            entry.setArea(areabuffer.toString());
            entry.setAddress(hotel_item_htladdress.getText());
            //客户点评
            WebElement hotelitem_judge_box = hotel.findElement(By.className("hotelitem_judge_box"));
            WebElement judge = hotelitem_judge_box.findElement(By.tagName("a"));
            entry.setEvaluate(judge.getAttribute("title"));
            //价钱
            WebElement hotel_price_icon = hotel.findElement(By.className("hotel_price_icon"));
            WebElement j_price_lowList = hotel_price_icon.findElement(By.className("J_price_lowList"));
            entry.setLowprice(Double.parseDouble(j_price_lowList.getText()));
            hotelList.add(entry);
        }
    }

    //滑块验证
    public  void Robotcheck(){
        //判断鼠标是否定位到指定位置
        boolean x_ready = false;
        boolean y_ready = false;
        int x_move = 0;
        int y_move = 0;
        try {
            Robot robot = new Robot();
            Point mousepoint = MouseInfo.getPointerInfo().getLocation();
            System.out.println(mousepoint.x+"  "+mousepoint.y);
            for(int i = 0;i<=MAX_X;i++){
                robot.mouseMove(i,mousepoint.y);
                Thread.sleep(1);
                if(MAX_X - i == TARGET_X||TARGET_X + i == MAX_X){
                    x_ready = true;
                    x_move = i;
                    break;
                }
            }
            for(int j = 0; j<=MAX_Y;j++){
                robot.mouseMove(x_move,j);
                Thread.sleep(1);
                if(MAX_Y - j == TARGET_Y || TARGET_Y + j == MAX_Y){
                    y_ready = true;
                    y_move = j;
                    break;
                }
            }
            robot.mouseMove(x_move,y_move);
            robot.mousePress(KeyEvent.BUTTON1_MASK);
            for(int i = 0; i < 300; i++){
                Thread.sleep(5);
                robot.mouseMove(x_move+i,y_move);
            }
            robot.mouseRelease(KeyEvent.BUTTON1_MASK);
        } catch (AWTException e) {
            e.printStackTrace();
        }catch (InterruptedException e) {
            e.printStackTrace();
        }

    }
}

 

 

 

 

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值