爬虫:爬取酒店信息
目标是爬取今天的酒店信息,并将这些信息存成文本文件。
功能分解:
搜索功能,在搜索框输入地点和入住时间,点击搜索按钮
获取一页完整的数据,由于去哪儿网一个页面数据分为两次加载,第一次加载数据,这时候需要将页面拉到底部,完成第二次数据加载。
获取一页完整且渲染过的HTML文档后,使用BeautifulSoup将其中的酒店信息提取出来进行存储
解析完成,点击下一页,继续抽取数据
import codecs
import datetime
import time
from cffi.backend_ctypes import unicode
from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Base URL prepended to each hotel card's relative href when building the
# stored link.  'xxxx' is a placeholder — fill in the real site domain.
url = 'xxxx'
class QunaSpider(object):
    """Scrape hotel listings from Qunar for a city and date range.

    Workflow: fill in the search form and submit it; on every result page
    scroll to the bottom (the site lazy-loads the second half of each page),
    parse the fully rendered HTML with BeautifulSoup, append each hotel's
    link and title to a text file, then click "next page" until none is left.
    """

    def get_hotel(self, driver, to_city, fromdate, todate):
        """Run a hotel search and scrape every result page.

        Args:
            driver: a Selenium WebDriver already showing the search page.
            to_city: destination city name typed into the search box.
            fromdate: check-in date string (YYYY-MM-DD).
            todate: check-out date string (YYYY-MM-DD).

        Side effects: appends scraped rows to '<to_city><fromdate>.html'.
        """
        # Locate the search-form controls.  Absolute XPaths are brittle;
        # they match the page layout at the time of writing.
        ele_toCity = driver.find_element(by=By.XPATH,
                                         value='/html/body/div[3]/div/section/section[1]/div[1]/div[2]/div/div/input')
        ele_fromDate = driver.find_element(by=By.XPATH,
                                           value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[1]/div/input')
        ele_toDate = driver.find_element(by=By.XPATH,
                                         value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[2]/div/input')
        ele_search = driver.find_element(by=By.CLASS_NAME, value='main')

        # Fill in the form; ENTER confirms the city-suggestion dropdown.
        ele_toCity.clear()
        ele_toCity.send_keys(to_city)
        time.sleep(1)
        ele_toCity.send_keys(Keys.ENTER)
        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)
        ele_toDate.clear()
        ele_toDate.send_keys(todate)
        ele_search.click()

        page_num = 0
        while True:
            try:
                # Wait for the result page: its title contains the city name.
                WebDriverWait(driver, 10).until(
                    EC.title_contains(to_city)
                )
            except Exception as e:
                print(e)
                break
            time.sleep(2)
            # Scroll to the bottom to trigger the second (lazy) data load.
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            driver.execute_script(js)
            htm_const = driver.page_source
            # page_source is already a decoded str, so no from_encoding is
            # needed (passing one makes BeautifulSoup warn and ignore it).
            soup = BeautifulSoup(htm_const, 'html.parser')
            infos = soup.findAll(class_='hotel-card-detail-btn')
            # Context manager guarantees the file is closed even if a
            # malformed hotel card raises mid-iteration.
            with open(to_city + fromdate + '.html', 'a', encoding='utf-8') as f:
                f.write(str(page_num) + '--' * 50)
                f.write('\r\n')
                for info in infos:
                    href = url + info['href']
                    title = info['title']
                    f.write(href + " " + title)
                    f.write('\r\n')
            try:
                time.sleep(5)
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(driver.find_element(by=By.CLASS_NAME, value='next'))
                )
                next_page.click()
                page_num += 1
                time.sleep(8)
            except Exception as e:
                # No visible "next" button: last result page reached.
                print(e)
                break

    def crawl(self, root_url, to_city):
        """Open a stealth Chrome session and scrape today's hotels.

        Args:
            root_url: URL of the Qunar hotel search page.
            to_city: destination city name.
        """
        today = datetime.date.today().strftime('%Y-%m-%d')
        tomorrow = datetime.date.today() + datetime.timedelta(days=1)
        tomorrow = tomorrow.strftime('%Y-%m-%d')

        # Hide the most common automation fingerprints so the site does not
        # detect and block the scripted browser.
        option = webdriver.ChromeOptions()
        option.add_experimental_option('useAutomationExtension', False)
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        driver = webdriver.Chrome(options=option)
        # Overwrite navigator.webdriver before any page script can read it.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
            """
        })
        driver.implicitly_wait(10)
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()
        self.get_hotel(driver, to_city, today, tomorrow)
if __name__ == '__main__':
    # Entry point: scrape hotels in Changsha (长沙) for tonight's stay.
    QunaSpider().crawl('xxxxx', u'长沙')