爬虫代码:
import time
import csv
import re
from urllib import request
import json
# Output CSV for the scraped reviews. Append mode keeps rows from earlier
# runs. The handle is deliberately left open: `c` and `writer` are
# module-level names, presumably used by code further down (not visible here).
c = open(r'D:\安吉竹博园开元度假村.csv', 'a+', newline='', encoding='utf8')
fieldnames = ['user', 'time', 'score', 'content']
writer = csv.DictWriter(c, fieldnames=fieldnames)
# Write the header only when the file is new/empty. Writing it on every run
# in append mode would insert a duplicate header row among the data rows.
c.seek(0, 2)  # jump to end-of-file so tell() reports whether anything exists
if c.tell() == 0:
    writer.writeheader()
def getResponse(url, data=None):
    """Send a hotel-review query to ``url`` and return the open response.

    The default request payload was captured from the Ctrip page below
    (reportedly the mobile web page). 26683709 is the hotel ID, which also
    appears in the URL of the hotel's original page:

    https://m.ctrip.com/webapp/hotel/HotelDetail/dianping/26683709.html
    Original Ctrip page for the Anji Zhuboyuan Kaiyuan Resort:
    https://hotels.ctrip.com/hotel/26683709.html?isFull=F&masterhotelid=26683709&hcityid=659#ctm_ref=hod_sr_lst_dl_n_1_6

    Args:
        url: Endpoint the payload is sent to.
        data: Optional request payload. A dict (or any JSON-serializable
            object) is encoded as UTF-8 JSON; ``bytes``/``bytearray`` are
            sent unchanged. When omitted, the built-in payload (page index
            2) is used, which is what this function always sent before.

    Returns:
        The response object produced by ``urllib.request.urlopen``. The
        caller is responsible for reading and closing it.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    if data is None:
        # Built-in payload, kept value-for-value identical to the original.
        data = {
            "hotelId": 26683709,
            "pageIndex": 2,
            "tagId": 0,
            "pageSize": 10,
            "groupTypeBitMap": 2,
            "needStatisticInfo": 0,
            "order": 0,
            "basicRoomName": "",
            "travelType": -1,
            "head": {
                "cid": "09031174312350135405",
                "ctok": "",
                "cver": "1.0",
                "lang": "01",
                "sid": "8888",
                "syscode": "09",
                "auth": "93C8AE20D20009DC90E6E10BB588DE61E67EBBC236DE15433FDDADFD95636F28",
                "extension": [],
            },
        }

    if isinstance(data, (bytes, bytearray)):
        # Already encoded by the caller; send as-is.
        body = bytes(data)
    else:
        # json.dumps() turns the dict into a JSON string; urllib needs bytes.
        body = json.dumps(data).encode(encoding='utf-8')

    header_dict = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
        "Content-Type": "application/json",
    }
    url_request = request.Request(url=url, data=body, headers=header_dict)
    return request.urlopen(url_request)
datas = []#存放生成的多个请求头
for j in range(6):
#使用"pageIndex":str(j + 1)进行翻页
data1 = {"hotelId": 26683709, "pageIndex":str(j + 1), "tagId": 0, "pageSize": 10, "groupTypeBitMap": 2,"needStatisticInfo": 0, "order": 0, "basicRoomName