爬虫:爬取酒店信息
目标是爬取今天的酒店信息,并将这些信息存成文本文件。
功能分解:
搜索功能,在搜索框输入地点和入住时间,点击搜索按钮
获取一页完整的数据,由于去哪儿网一个页面数据分为两次加载,第一次加载数据,这时候需要将页面拉到底部,完成第二次数据加载。
获取一页完整且渲染过的HTML文档后,使用BeautifulSoup将其中的酒店信息提取出来进行存储
解析完成,点击下一页,继续抽取数据
import codecs
import datetime
import time
from cffi.backend_ctypes import unicode
from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Base URL prepended to each hotel card's relative href when building the
# stored link.  'xxxx' is a placeholder — fill in the real site domain.
url = 'xxxx'
class QunaSpider(object):
    """Scrape hotel listings from Qunar for a city and date range.

    Workflow: fill in the search form and submit it; on every result page
    scroll to the bottom (the site lazy-loads the second half of each page),
    parse the fully rendered HTML with BeautifulSoup, append each hotel's
    link and title to a text file, then click "next page" until none is left.
    """

    def get_hotel(self, driver, to_city, fromdate, todate):
        """Run a hotel search and scrape every result page.

        Args:
            driver: a Selenium WebDriver already showing the search page.
            to_city: destination city name typed into the search box.
            fromdate: check-in date string (YYYY-MM-DD).
            todate: check-out date string (YYYY-MM-DD).

        Side effects: appends scraped rows to '<to_city><fromdate>.html'.
        """
        # Locate the search-form controls.  Absolute XPaths are brittle;
        # they match the page layout at the time of writing.
        ele_toCity = driver.find_element(by=By.XPATH,
                                         value='/html/body/div[3]/div/section/section[1]/div[1]/div[2]/div/div/input')
        ele_fromDate = driver.find_element(by=By.XPATH,
                                           value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[1]/div/input')
        ele_toDate = driver.find_element(by=By.XPATH,
                                         value='/html/body/div[3]/div/section/section[1]/div[1]/div[3]/div/div[2]/div/input')
        ele_search = driver.find_element(by=By.CLASS_NAME, value='main')

        # Fill in the form; ENTER confirms the city-suggestion dropdown.
        ele_toCity.clear()
        ele_toCity.send_keys(to_city)
        time.sleep(1)
        ele_toCity.send_keys(Keys.ENTER)
        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)
        ele_toDate.clear()
        ele_toDate.send_keys(todate)
        ele_search.click()

        page_num = 0
        while True:
            try:
                # Wait for the result page: its title contains the city name.
                WebDriverWait(driver, 10).until(
                    EC.title_contains(to_city)
                )
            except Exception as e:
                print(e)
                break
            time.sleep(2)
            # Scroll to the bottom to trigger the second (lazy) data load.
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            driver.execute_script(js)
            htm_const = driver.page_source
            # page_source is already a decoded str, so no from_encoding is
            # needed (passing one makes BeautifulSoup warn and ignore it).
            soup = BeautifulSoup(htm_const, 'html.parser')
            infos = soup.findAll(class_='hotel-card-detail-btn')
            # Context manager guarantees the file is closed even if a
            # malformed hotel card raises mid-iteration.
            with open(to_city + fromdate + '.html', 'a', encoding='utf-8') as f:
                f.write(str(page_num) + '--' * 50)
                f.write('\r\n')
                for info in infos:
                    href = url + info['href']
                    title = info['title']
                    f.write(href + " " + title)
                    f.write('\r\n')
            try:
                time.sleep(5)
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(driver.find_element(by=By.CLASS_NAME, value='next'))
                )
                next_page.click()
                page_num += 1
                time.sleep(8)
            except Exception as e:
                # No visible "next" button: last result page reached.
                print(e)
                break

    def crawl(self, root_url, to_city):
        """Open a stealth Chrome session and scrape today's hotels.

        Args:
            root_url: URL of the Qunar hotel search page.
            to_city: destination city name.
        """
        today = datetime.date.today().strftime('%Y-%m-%d')
        tomorrow = datetime.date.today() + datetime.timedelta(days=1)
        tomorrow = tomorrow.strftime('%Y-%m-%d')

        # Hide the most common automation fingerprints so the site does not
        # detect and block the scripted browser.
        option = webdriver.ChromeOptions()
        option.add_experimental_option('useAutomationExtension', False)
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        driver = webdriver.Chrome(options=option)
        # Overwrite navigator.webdriver before any page script can read it.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
            """
        })
        driver.implicitly_wait(10)
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()
        self.get_hotel(driver, to_city, today, tomorrow)
if __name__ == '__main__':
    # Entry point: scrape hotels in Changsha (长沙) for tonight's stay.
    QunaSpider().crawl('xxxxx', u'长沙')