import time
import pandas as pd
from DrissionPage import ChromiumPage
# Scrape the Ctrip hotel list for one city and append the rows to a CSV file.
#
# The script opens the list page in a DrissionPage-controlled browser, listens
# for the 'json/HotelSearch' responses the page triggers, and writes one CSV
# row per hotel in this column order (no header row is written):
#   hotel name, price, score, review text, id, tags, lng, lat, address, area, poi

# Total number of hotels in this city; it can be read from the API response.
TOTAL_HOTELS = 10192
# Target handed to the DrissionPage listener to select the packets to capture.
LISTEN_TARGET = 'json/HotelSearch'
LIST_URL = 'https://hotels.ctrip.com/hotels/list?countryId=1&city=17&provinceId=0&checkin=2024/10/08&checkout=2024/10/09&optionId=17&optionType=City&directSearch=0&display=%E6%9D%AD%E5%B7%9E&crn=1&adult=1&children=0&searchBoxArg=t&travelPurpose=0&ctm_ref=ix_sb_dl&domestic=1&'
CSV_PATH = "/Users/xiaoming/Desktop/learning/python/data/hotel_xiecheng.csv"
# Pages 1..SCROLL_PAGES are advanced by scrolling to the bottom; later pages
# are advanced by clicking the '.btn-box' element instead.
# NOTE(review): presumably the site switches from infinite scroll to a
# "load more" button after the fourth batch -- confirm against the live page.
SCROLL_PAGES = 4
# Placeholder written to the CSV when a field is absent from the payload.
MISSING = 'none'


def get_field(hotel, section, key):
    """Return ``hotel[section][key]``, or ``MISSING`` if it cannot be read.

    Args:
        hotel: One entry of the hotel list from the search response.
        section: First-level key, e.g. ``'money'`` or ``'position'``.
        key: Second-level key inside that section.

    Returns:
        The stored value (which may itself be ``None``), or the string
        ``'none'`` when the section or key is absent or the section is not
        subscriptable by ``key``.
    """
    try:
        return hotel[section][key]
    except (KeyError, TypeError, IndexError):
        # Only lookup failures are treated as "field missing"; anything else
        # (including KeyboardInterrupt) propagates.
        return MISSING


def hotel_to_row(hotel):
    """Flatten one hotel entry into the list written as a CSV row.

    The name and id are read without a fallback, so an entry lacking
    ``base.hotelName`` or ``base.hotelId`` raises ``KeyError``; every other
    field falls back to ``'none'``.

    Args:
        hotel: One entry of the hotel list from the search response.

    Returns:
        ``[name, price, score, comment, id, tags, lng, lat, address, area,
        poi]``.

    Raises:
        KeyError: If ``base``, ``hotelName`` or ``hotelId`` is missing.
    """
    base = hotel['base']
    return [
        base['hotelName'],
        get_field(hotel, 'money', 'price'),
        get_field(hotel, 'score', 'number'),
        get_field(hotel, 'comment', 'content'),
        base['hotelId'],
        get_field(hotel, 'base', 'tags'),
        get_field(hotel, 'position', 'lng'),
        get_field(hotel, 'position', 'lat'),
        get_field(hotel, 'position', 'address'),
        get_field(hotel, 'position', 'area'),
        get_field(hotel, 'position', 'poi'),
    ]


def load_more(page):
    """Click the '.btn-box' element to request the next batch of hotels.

    Sleeps 2 seconds before the first attempt. If locating or clicking the
    element raises, sleeps a further 10 seconds and tries exactly once more;
    a failure of that second attempt propagates to the caller.

    Args:
        page: The browser page object the script is driving.
    """
    time.sleep(2)
    try:
        page.ele('css:.btn-box').click.multi()
    except Exception:
        # The button may not be present or clickable yet; give the page more
        # time and retry once. ``Exception`` (not a bare except) keeps Ctrl-C
        # working during the wait.
        time.sleep(10)
        page.ele('css:.btn-box').click.multi()


def main():
    """Drive the browser through the hotel list and append rows to the CSV.

    Stops once ``TOTAL_HOTELS`` hotels have been collected, or as soon as a
    response contains no hotels, so a stale total cannot cause an endless
    loop.
    """
    page = ChromiumPage()
    # Start listening before navigating so the first search response, which
    # is triggered by the page load itself, is captured.
    page.listen.start(LISTEN_TARGET)
    page.get(LIST_URL)

    count = 0
    page_no = 1
    while count < TOTAL_HOTELS:
        print(f'正在采集第页{page_no}数据')
        if page_no > SCROLL_PAGES:
            load_more(page)

        packet = page.listen.wait()
        hotel_list = packet.response.body['Response']['hotelList']['list']
        if not hotel_list:
            # No progress is possible: without this guard ``count`` would
            # never reach the total and the loop would spin forever.
            print(f'No hotels returned on page {page_no}; '
                  f'stopping at {count}/{TOTAL_HOTELS}')
            break

        rows = [hotel_to_row(hotel) for hotel in hotel_list]
        # Append mode without a header: each batch is added to the same file.
        pd.DataFrame(rows).to_csv(CSV_PATH, index=False, header=False, mode='a')

        if page_no <= SCROLL_PAGES:
            # Scrolling to the bottom triggers the next batch on early pages.
            page.scroll.to_bottom()
        page_no += 1
        count += len(hotel_list)


if __name__ == "__main__":
    main()
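# Crawler log post: all hotel data for one city on Ctrip (爬虫记录帖:携程某个城市的全部酒店数据)
# Latest recommended article published 2025-03-08 23:22:24 (最新推荐文章于 2025-03-08 23:22:24 发布)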