同程旅游景点门票的爬取实现
和往常一样,先获取按关键词检索得到的结果页面(如上图),从中提取各景点的url和id,再进入符合条件的景点详情页面做进一步分析。
这次我们通过开发者工具抓包发现,页面会请求一个返回景点门票数据的JSON接口,所以我们需要做的就是获取访问这个接口所需要的参数即可。
进一步观察发现,我们真正需要获取的参数其实只有一个:景点id。而景点id可以从检索结果页面得到,所以门票数据就可以成功爬取出来了:
def search_spots(self, keyword, city):
    """Search ly.com (同程旅游) for scenic spots and collect their ticket data.

    The keyword search page is fetched first. For every listed spot whose
    title is similar enough to ``keyword``, the ticket-price JSON endpoint is
    requested with the spot id and each ticket entry is flattened into a
    record. Records are stored in ``self.spotsInfo[title]`` as a dict that
    maps a ticket's ``ConsumersTypeName`` to the list of its records.

    Failures are best-effort: an error while fetching the search page ends
    the search, an error inside one spot only skips that spot. In both cases
    the exception is printed, not raised.

    :param keyword: search keyword, typically the scenic spot name
    :param city: city to search in; the characters '市', '县' and '省' are
        stripped before it is prepended to the keyword
    :return: None; ``self.done`` is set to True once the search has finished,
        whether or not it succeeded
    """
    city_name = city.replace('市', '').replace('县', '').replace('省', '')
    # Percent-encode the query so spaces, '&', '#' and non-ASCII text cannot
    # corrupt the query string.
    url = 'https://so.ly.com/scenery?q=' + parse.quote(city_name + keyword)

    def collect_spot(div):
        """Fetch and store the ticket records of one search-result entry."""
        sid = div.find('div', {'class': 's_info'})['sid']
        title = div.find('a', {'class': 'sce_name goFinal'})['title']
        # Filter out results unrelated to the keyword.
        if fuzz.token_sort_ratio(title, keyword) <= 20:
            return
        detail_url = 'https://so.ly.com/scenery/AjaxHelper/SceneryPriceFrame.aspx?action=GETNEWFRAMEFORLIST&showCount=2&ids='+str(sid)+'&isSimple=1&priceList=1&tabself=1&tabHotel=1&isGrap=1&nobookid=&isyry=1&YpState=1&lon=0&lat=0&isforegin=0&isNewSearch=true&iid=0.398138641670454'
        ticket_info = json.loads(self.getHtml(detail_url))
        buy_url = 'https://www.ly.com/scenery/BookSceneryTicket_'+str(sid)+'.html'

        tickets_by_type = {}
        for scenery_price in ticket_info['SceneryPrices']:
            for price_model in scenery_price['ChannelPriceModelEntityList']:
                for entity in price_model['ChannelPriceEntityList']:
                    # Detailed ticket description; PriceBookRemark arrives
                    # percent-encoded and is decoded here.
                    description = ''.join([
                        '入园方式:\n', entity['GetTicketMode'],
                        '预订说明:\n', parse.unquote(entity['PriceBookRemark']),
                        '预订时间:\n', entity['PriceTimeLimit'],
                        '包含项目:\n', entity['ContainedItems'],
                        '退改规则:\n', entity['RefundModifyRule'],
                        '商家信息:\n', entity['SupplierBaseInfo']['SupplierName'],
                    ])
                    consumer_type = entity['ConsumersTypeName']
                    # Group the records by consumer type.
                    tickets_by_type.setdefault(consumer_type, []).append({
                        'name': entity['TicketName'],
                        'type': consumer_type,
                        'price': entity['AmountAdvice'],
                        'url': buy_url,
                        'buy': '',
                        'from': '同程旅游',
                        'isReturnable': '',
                        'bookTime': entity['BookTime'],
                        'outTime': '',
                        'useTime': '',
                        'discription': description,
                    })
        # Stored only after the whole spot parsed cleanly, so a failure never
        # leaves a half-filled entry behind.
        self.spotsInfo[title] = tickets_by_type

    try:
        html = self.getHtml(url)
        soup = BeautifulSoup(html, "html.parser")
        for div in soup.find_all('div', {'class': 'list_l'}):
            try:
                collect_spot(div)
            except Exception as e:
                # One malformed spot must not abort the remaining ones.
                print(e)
    except Exception as e:
        print(e)
    finally:
        # Always signal completion so anything polling ``done`` is released.
        self.done = True