Python: Scraping Hotel Information

This article walks through a concrete crawler example that scrapes hotel information from a specific website. Selenium and BeautifulSoup are used together: Selenium automates the page interactions, and BeautifulSoup parses the data. The main steps are setting the search criteria, paging through the results automatically, and parsing the hotel details.

Crawler: scraping hotel information
The goal is to scrape today's hotel listings and save them to a text file.
Functional decomposition:
    Search: type the destination and the check-in/check-out dates into the search boxes, then click the search button.
    Fetch one complete page of data. Qunar loads each results page in two stages: after the first batch renders, the page has to be scrolled to the bottom to trigger the second load (see the sketch after this list).
    With a complete, fully rendered HTML document in hand, use BeautifulSoup to extract the hotel details and save them.
    Once parsing is done, click "next page" and keep extracting.
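
The two-stage load in step 2 is the fragile part. Below is a minimal sketch of a more robust variant (load_full_page is just an illustrative name, and it assumes the hotel cards carry the hotel-card-detail-btn class used later in the script): keep scrolling until the card count stops growing, instead of relying on a single fixed sleep.

import time
from selenium.webdriver.common.by import By


def load_full_page(driver, card_class='hotel-card-detail-btn', timeout=10):
    """Scroll to the bottom until no new hotel cards appear, then return the HTML."""
    last_count = len(driver.find_elements(By.CLASS_NAME, card_class))
    deadline = time.time() + timeout
    while time.time() < deadline:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(1)  # give the lazy loader a moment to fetch the second batch
        count = len(driver.find_elements(By.CLASS_NAME, card_class))
        if count == last_count:
            break  # card count stable: assume the second load has finished
        last_count = count
    return driver.page_source

The complete script: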
import codecs
import datetime
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

url = 'xxxx'  # base URL of the target site (redacted in the original post)


class QunaSpider(object):

    def get_hotel(self, driver, to_city, fromdate, todate):
        # The absolute XPaths below were copied from the live page; they will
        # break whenever the site's layout changes.
        ele_toCity = driver.find_element(by=By.XPATH,
                                         value='/html/body/div[3]/div/section/section[1]/div[1]/div[2]/div/div/input')
        ele_fromDate = driver.find_element(by=By.XPATH,
                                           value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[1]/div/input')
        ele_toDate = driver.find_element(by=By.XPATH,
                                         value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[2]/div/input')
        # The search button is found by its class name.
        ele_search = driver.find_element(by=By.CLASS_NAME, value='main')

        # Type the destination; ENTER confirms the suggestion dropdown.
        ele_toCity.clear()
        ele_toCity.send_keys(to_city)
        time.sleep(1)
        ele_toCity.send_keys(Keys.ENTER)

        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)

        ele_toDate.clear()
        ele_toDate.send_keys(todate)

        ele_search.click()

        page_num = 0
        while True:
            try:
                # Wait until the results page has loaded: its title contains the city name.
                WebDriverWait(driver, 10).until(
                    EC.title_contains(to_city)
                )
            except Exception as e:
                print(e)
                break
            time.sleep(2)
            # Scroll to the bottom to trigger the second, lazy-loaded batch of results.
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            driver.execute_script(js)

            # page_source is already a str, so no from_encoding argument is needed.
            htm_const = driver.page_source
            soup = BeautifulSoup(htm_const, 'html.parser')
            # Each hotel card's detail link carries both the URL and the hotel name.
            infos = soup.find_all(class_='hotel-card-detail-btn')

            f = codecs.open(to_city + fromdate + '.html', 'a', 'utf-8')
            f.write(str(page_num) + '--' * 50)  # page separator line
            f.write('\r\n')
            for info in infos:
                href = url + info['href']
                title = info['title']
                f.write(href + "   " + title)
                f.write('\r\n')
            f.close()
            try:
                time.sleep(5)
                # Click "next page"; when there is none, the lookup times out and we stop.
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(driver.find_element(by=By.CLASS_NAME, value='next'))
                )
                next_page.click()
                page_num += 1
                time.sleep(8)
            except Exception as e:
                print(e)
                break

    def crawl(self, root_url, to_city):
        today = datetime.date.today().strftime('%Y-%m-%d')
        tomorrow = datetime.date.today() + datetime.timedelta(days=1)
        tomorrow = tomorrow.strftime('%Y-%m-%d')

        # Some sites can detect Selenium; the options below are one common way
        # to reduce the automation fingerprint.
        option = webdriver.ChromeOptions()
        # option.add_argument('--headless')
        option.add_experimental_option('useAutomationExtension', False)
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        driver = webdriver.Chrome(options=option)  # opens a browser window; chromedriver must be available on PATH
        # Inject JS (via the Chrome DevTools Protocol) that runs before any page
        # script, so navigator.webdriver reports undefined instead of true.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                    Object.defineProperty(navigator, 'webdriver', {
                      get: () => undefined
                    })
                  """
        })

        driver.implicitly_wait(10)
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()

        self.get_hotel(driver, to_city, today, tomorrow)


if __name__ == '__main__':
    spider = QunaSpider()
    spider.crawl('xxxxx', u'长沙')
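
Each record is written as the detail-page URL, three spaces, then the hotel name, with a numbered dash line separating pages. Reading the results back is therefore straightforward; a minimal sketch (read_hotels is just an illustrative name, and the path follows the to_city + fromdate + '.html' naming used above):

import codecs


def read_hotels(path):
    """Yield (url, title) pairs from a results file written by QunaSpider."""
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.endswith('--' * 50):
                continue  # skip blank lines and page separator lines
            url, _, title = line.partition('   ')  # URL, three spaces, hotel name
            if title:
                yield url, title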
