不说太多废话,就简单一句:你们要爬哪里可以把地点改一下,时间也改一下,爬取数量自己修改参数和代码,变化不大。有问题请留言,我不再重复分析(这里我爬取的是上海最近的酒店信息)
# coding=utf-8
import csv#用来储存文件的模块
import time
import requests
import json
import pandas as pd#excel出处理
# 区域店铺id ct_Poi cateName抓取,传入参数为区域id
# Fetch hotel-list pages for one region from the Qunar wxapp API and append
# them to a CSV. Parameter defaults preserve the original hard-coded values.
def crow_id(city, check_in_date="2020-10-29", check_out_date="2020-10-29",
            out_file='qunaer9.csv', delay=3.1):
    """Crawl every hotel-list page for *city* and append the rows to a CSV.

    Parameters:
        city: region name passed to the API (e.g. "上海").
        check_in_date / check_out_date: stay dates sent with EVERY page request.
            (The original code used different dates for page 1 vs. later pages,
            so pages came from different result sets — fixed here.)
        out_file: CSV path appended to (no header row, matching original output).
        delay: seconds to sleep between page requests to avoid hammering the API.

    Raises:
        requests.RequestException / KeyError on the first page only; errors on
        later pages are printed and skipped (best-effort, as in the original).
    """
    url = 'https://wxapp.qunar.com/api/hotel/hotellist'
    # Headers captured from the WeChat mini-program; the unionid/openid/cookies
    # are session credentials and may need refreshing when they expire.
    headers = {
        "wx-v": "",
        "content-type": "application/json",
        "Connection": "Keep-Alive",
        "Accept-Encoding": "gzip",
        "wx-q": "",
        "unionid": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
        "openid": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
        "wx-t": "",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; OPPO A57 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36 MicroMessenger/6.7.2.1340(0x2607023A) NetType/WIFI Language/zh_CN",
        "charset": "utf-8",
        "referer": "https://servicewechat.com/wx799d4d93a341b368/114/page-frame.html",
        "Host": "wxapp.qunar.com",
        "Cookie": "QN48=tc_437f21c62a765ca0_165c198a408_e56b; QN1=qunar; QN66=smart_app; QN1=O5cv+luWLPthsvB1BKl0Ag==",
        "Content-Length": "0",
    }
    # Single HTTP proxy (the original declared 16 but only ever used this one).
    # NOTE(review): public proxies like this rot quickly — replace as needed.
    proxies = {'http': 'http://114.113.126.83:80'}

    def _build_params(page):
        # One payload builder for every page — the original duplicated this
        # dict (with mismatched dates) for page 1 vs. pages 2+.
        return {
            "city": city,
            "cityUrl": "",
            "page": page,
            "extra": "{}",
            "sort": "",
            "keywords": "",
            "checkOutDate": check_out_date,
            "checkInDate": check_in_date,
            "locationAreaFilter": "",
            "comprehensiveFilter": "[]",
            "fixedComprehensiveFilter": "[]",
            "SDKVersion": "2.2.4",
            "wxUnionId": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
            "wxOpenId": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
            "bd_source": "smart_app",
            "bd_origin": "pt-onl-ots-ggjd",
        }

    def _fetch_page(page):
        # POST with the payload in the query string (params=), exactly as the
        # API expects; timeout added so a dead proxy cannot hang forever.
        r = requests.post(url, headers=headers, params=_build_params(page),
                          proxies=proxies, timeout=30)
        return r.json()['data']

    def _append_to_csv(hotel_data):
        # Append rows without a header, matching the original output format.
        pd.DataFrame(data=hotel_data['hotels']).to_csv(out_file, mode='a',
                                                       header=False)

    # First page also tells us how many pages exist in total.
    hotel = _fetch_page(1)
    total_pages = hotel['totalPage']
    print("当前总页数:", total_pages)
    print("Page:%d" % 1)
    print(len(hotel), total_pages)
    _append_to_csv(hotel)

    # Pages 2..total_pages inclusive. (The original loop over-ran by one and
    # requested page total_pages + 1.)
    for page in range(2, total_pages + 1):
        try:
            hotel = _fetch_page(page)
            print(len(hotel), page)
            _append_to_csv(hotel)
        except Exception as e:
            # Best-effort: log and move on so one bad page/proxy hiccup does
            # not abort the whole crawl (same policy as the original).
            print(e)
        finally:
            print("Page:%d" % page)
            time.sleep(delay)
if __name__ == '__main__':
    # Region configuration: map each region name to a list of city dicts.
    # Add more entries (or more cities per region) to crawl additional areas.
    a = {"areaObj": {
        "上海": [{"city": '上海'}]
    }}
    # Flatten every region's city list into one list of {"city": ...} dicts.
    # (Removed unused `l = 0` and `old = time.time()` from the original.)
    area_list = [d for cities in a['areaObj'].values() for d in cities]
    for area in area_list:
        print("开始抓取%s区域:" % area['city'])
        crow_id(area['city'])