景点门票数据的整合和组织
主要实现的是景点门票的后四个平台(去哪儿、同程、途牛、携程)的数据的整合。
去哪儿:
def search_spots(self, keyword, city):
    '''
    Search Qunar (piao.qunar.com) for a spot's ticket offers (core method).

    Walks every result page, fetches each matching spot's ticket JSON and
    stores the grouped tickets in self.spotsInfo keyed by spot title.
    Sets self.done = True when the scan finishes.
    :param keyword: spot name to search for
    :param city: city name; a trailing 市/县/省 suffix is stripped for the query
    :return: None
    '''
    n_city = city.replace('市', '').replace('县', '').replace('省', '')
    # Resolve the city slug; retry once on failure, then give up quietly.
    try:
        region = self.getCityUrl(city)
    except Exception:
        try:
            region = self.getCityUrl(city)
        except Exception:
            return
    # BUGFIX: the original literal contained '®ion=' (mojibake of '&region='),
    # so the region parameter never reached the server.
    url = ('https://piao.qunar.com/ticket/list.htm?keyword=' + n_city + keyword
           + '&region=' + region + '&from=mpl_search_suggest&page=')
    page = 1
    try:
        while True:
            html = self.getHtml(url + str(page))
            soup = BeautifulSoup(html, "html.parser")
            links = soup.find_all('a', attrs={'data-click-type': 'l_title', 'class': 'name'})
            if len(links) == 0:
                # No more results: past the last page.
                break
            for link in links:
                # Ticket record fields: name, type, price, url, buy, from,
                # isReturnable, bookTime, outTime, useTime, discription.
                title = link.string
                # Skip results that barely match the query.
                if fuzz.token_sort_ratio(title, keyword) <= 20:
                    continue
                href = 'https://piao.qunar.com' + link['href']
                sight_id = self.get_ticket_id(href)
                tickets_url = ('https://piao.qunar.com/ticket/detail/getTickets.json?sightId='
                               + sight_id + '&from=detail&supplierId=')
                dicts = json.loads(self.getHtml(tickets_url))
                slist = {}
                # Unwrap the nested JSON: groups -> shelves -> first-level
                # ticket types -> ticket types -> individual tickets.
                for group in dicts['data']['groups']:
                    if group['typeName'] != '门票':
                        continue
                    for shelf in group['shelfVos']:
                        for first_lv in shelf['firstLvTicketTypeVos']:
                            for ticket_type in first_lv['ticketTypes']:
                                type_name = ticket_type['typeName']
                                for ti in ticket_type['tickets']:
                                    # Smallest unit: one bookable ticket.
                                    slist.setdefault(type_name, []).append(
                                        {'name': ti['title'], 'type': type_name,
                                         'price': ti['qunarPrice'], 'url': ti['bookingUrl'],
                                         'buy': '',
                                         'from': '去哪儿网 ' + ti['supplierName'],
                                         'isReturnable': ti['canRefundType'],
                                         'bookTime': ti['bookAtAnyTimeStr'],
                                         'outTime': '', 'useTime': '',
                                         'discription': ''})
                self.spotsInfo[title] = slist
            page = page + 1
    except Exception:
        traceback.print_exc()
    self.done = True
同程:
def search_spots(self, keyword, city):
    '''
    Search Tongcheng (so.ly.com) for a spot's ticket offers.

    Stores grouped tickets in self.spotsInfo keyed by spot title and
    sets self.done = True when finished.
    :param keyword: search keyword
    :param city: city to search in; a trailing 市/县/省 suffix is stripped
    :return: None
    '''
    n_city = city.replace('市', '').replace('县', '').replace('省', '')
    url = 'https://so.ly.com/scenery?q=' + n_city + keyword
    try:
        html = self.getHtml(url)
        soup = BeautifulSoup(html, "html.parser")
        for div in soup.find_all('div', {'class': 'list_l'}):
            sid = div.find('div', {'class': 's_info'})['sid']
            title = div.find('a', {'class': 'sce_name goFinal'})['title']
            # Drop results that barely match the keyword.
            if fuzz.token_sort_ratio(title, keyword) <= 20:
                continue
            detailurl = 'https://so.ly.com/scenery/AjaxHelper/SceneryPriceFrame.aspx?action=GETNEWFRAMEFORLIST&showCount=2&ids='+str(sid)+'&isSimple=1&priceList=1&tabself=1&tabHotel=1&isGrap=1&nobookid=&isyry=1&YpState=1&lon=0&lat=0&isforegin=0&isNewSearch=true&iid=0.398138641670454'
            ticketInfo = json.loads(self.getHtml(detailurl))
            buyurl = 'https://www.ly.com/scenery/BookSceneryTicket_' + str(sid) + '.html'
            slist = {}
            # Unwrap: SceneryPrices -> channel price models -> price entities.
            for scenery_price in ticketInfo['SceneryPrices']:
                for channel in scenery_price['ChannelPriceModelEntityList']:
                    for entity in channel['ChannelPriceEntityList']:
                        ttitle = entity['TicketName']
                        price = entity['AmountAdvice']
                        # Assemble the human-readable ticket description.
                        dis = '入园方式:\n' + entity['GetTicketMode']
                        dis = dis + '预订说明:\n' + parse.unquote(entity['PriceBookRemark'])
                        dis = dis + '预订时间:\n' + entity['PriceTimeLimit']
                        dis = dis + '包含项目:\n' + entity['ContainedItems']
                        dis = dis + '退改规则:\n' + entity['RefundModifyRule']
                        ttypename = entity['ConsumersTypeName']
                        booktime = entity['BookTime']
                        dis = dis + '商家信息:\n' + entity['SupplierBaseInfo']['SupplierName']
                        # Merge into the per-type ticket list.
                        slist.setdefault(ttypename, []).append(
                            {'name': ttitle, 'type': ttypename, 'price': price, 'url': buyurl,
                             'buy': '', 'from': '同程旅游', 'isReturnable': '',
                             'bookTime': booktime, 'outTime': '', 'useTime': '',
                             'discription': dis})
            self.spotsInfo[title] = slist
    except Exception as e:
        print(e)
    self.done = True
途牛:
def search_spots(self, keyword, city):
    '''
    Search Tuniu (s.tuniu.com) for a spot and its ticket offers (core method).

    Walks every result page; per spot, stores ticket rows in self.spotsInfo
    and spot metadata (address, opening hours, rating, links) in self.spots.
    Sets self.done = True when finished.
    :param keyword: search keyword
    :param city: city name; a trailing 市/县/省 suffix is stripped for the query
    :return: None
    '''
    # Resolve the city slug; retry once on failure, then give up quietly.
    try:
        cityUrl = self.getCityUrl(city)
    except Exception:
        try:
            cityUrl = self.getCityUrl(city)
        except Exception:
            return
    n_city = city.replace('市', '').replace('县', '').replace('省', '')
    url = 'https://s.tuniu.com/search_complex/ticket-' + cityUrl + '-0-' + n_city + keyword + '/'
    page = 1
    try:
        while True:
            html = self.getHtml(url + str(page))
            soup = BeautifulSoup(html, "html.parser")
            container = soup.find('div', {'class': 'thelist'})
            if container is None:
                # No result list on this page: past the last page.
                break
            for s in container.find_all('li'):
                slist = {}
                detail = s.find('dl', {'class': 'detail'})
                title = detail.find('p', {'class': 'title ticket'}).text.replace('\n', '').replace(' ', '')
                # Drop results that barely match the keyword.
                if fuzz.token_sort_ratio(title, keyword) <= 20:
                    continue
                # Spot address (strip full-width spaces).
                address = detail.find('dd').string.replace(u'\u3000', u'')
                # Opening hours.
                opentime = detail.find('dd', {'class': 'port'}).text.replace('\n', '')
                price = s.find('div', {'class': 'priceinfo'}).find('span', {'class': 'tnPrice'}).text.replace('\n', '').replace(' ', '')
                # Customer satisfaction score.
                manyi = s.find('p', {'class': 'manyi_inner'}).text.replace('\n', '').replace(' ', '')
                info_div = s.find('div', {'class': 'theinfo ticket clearfix'})
                # Spot detail page link and thumbnail image link.
                href = 'https:' + info_div.find('a', {'class': 'img'})['href']
                imgsrc = 'https:' + info_div.find('img')['data-src']
                # Individual ticket rows for this spot.
                for t in s.find('div', {'class': 'ticketlist'}).find_all('div', {'class': 'each-item'}):
                    ticketTitle = t.find('span', {'class': 'ticketTitle'}).text
                    ticketPrice = t.find('span', {'class': 'tnPrice'}).text
                    ticketBookUrl = t.find('form')['action']
                    slist.setdefault('', []).append(
                        {'name': ticketTitle, 'type': '', 'price': ticketPrice, 'url': 'https:' + ticketBookUrl,
                         'buy': '', 'from': '途牛旅游', 'isReturnable': '',
                         'bookTime': '', 'outTime': '', 'useTime': '',
                         'discription': ''})
                self.spotsInfo[title] = slist
                self.spots[title] = {'title': title, 'address': address, 'opentime': opentime, 'manyi': manyi, 'href': href, 'imgsrc': imgsrc}
            page = page + 1
    except Exception:
        # Deliberate best-effort scrape: any parse/network error just ends the scan.
        pass
    self.done = True
携程:
def get_ticket(self):
    '''
    Fetch and parse the ticket list for one Ctrip (piao.ctrip.com) spot.

    Extracts the JSON assigned to window.__INITIAL_STATE__ in the spot's
    ticket page and stores the tickets, grouped by sale-unit type, in
    self.spotsInfo[self.title].
    :return: None
    '''
    spot_id = self.detail_url.split('/')[-1]
    ticket_url = f'https://piao.ctrip.com/ticket/dest/{spot_id}?onlyContent=true&onlyShelf=true'
    # NOTE(review): verify=False disables TLS certificate checking — confirm intentional.
    ticket_res = requests.get(ticket_url, verify=False, headers=self.headers).text
    ticket_res = ticket_res.replace('\n', '').replace(' ', '')
    # Slice out the JSON between "window.__INITIAL_STATE__=" and
    # "window.__APP_SETTINGS__". len('window.__INITIAL_STATE__') == 24,
    # so +25 also skips the '=' sign.
    start = ticket_res.find('window.__INITIAL_STATE__') + 25
    end = ticket_res.find('window.__APP_SETTINGS__')
    info = json.loads(ticket_res[start:end])
    slist = {}
    for ticketinfo in info['detailInfo']['ressHash'].values():
        # Unwrap one ticket record.
        title = ticketinfo['name']
        price = ticketinfo['price']
        ticket_type = ticketinfo['saleunitinfo']['propleproperty']
        fromw = '携程旅游 ' + ticketinfo['brandname']
        # Merge into the per-type ticket list.
        slist.setdefault(ticket_type, []).append(
            {'name': title, 'type': ticket_type, 'price': price, 'url': self.detail_url,
             'buy': '', 'from': fromw, 'isReturnable': '',
             'bookTime': '', 'outTime': '', 'useTime': '',
             'discription': ''})
    self.spotsInfo[self.title] = slist
八个平台的数据合并:
def merge2(spi, xc, name):
    '''
    Merge one platform's spot data into an accumulated result dict.

    :param spi: dict of {spot title: ticket data}. When ``name`` is empty it
                is an already-merged accumulator of {title: {platform: data}}
                and is updated in place; otherwise it holds one platform's
                raw data and a fresh result dict is built.
    :param xc: platform object exposing ``.name`` and ``.spotsInfo``; entries
               matched against ``spi`` are popped from ``xc.spotsInfo``.
    :param name: platform name for ``spi``'s data, or '' when already merged.
    :return: dict mapping spot title -> {platform name: ticket data}
    '''
    if len(name) == 0:
        spotsInfo = spi      # accumulator: extend in place
    else:
        spotsInfo = {}       # fresh result, nest data under platform names
    for info in spi.keys():
        spotsInfo.setdefault(info, {})
        # Explicit membership test instead of the original try/except probe,
        # so unrelated errors are no longer silently swallowed.
        if info in xc.spotsInfo:
            # Spot known to both sides: keep both platforms' data.
            if len(name) != 0:
                spotsInfo[info][name] = spi[info]
            spotsInfo[info][xc.name] = xc.spotsInfo.pop(info)
        elif len(name) != 0:
            # Spot only known to spi's platform.
            spotsInfo[info][name] = spi[info]
    # Spots only the other platform knows about.
    for info in xc.spotsInfo.keys():
        spotsInfo.setdefault(info, {})
        spotsInfo[info][xc.name] = xc.spotsInfo[info]
    print('done')
    return spotsInfo
def merge(feizhu, xiecheng, tuniu, quna, lvmama, dahe, klook, tongcheng):
    '''
    Merge all eight platforms' spot data into one dict.

    Each argument is a platform object exposing ``.name`` and ``.spotsInfo``.
    The first two platforms seed the accumulator, then the remaining six are
    folded in one at a time via merge2.
    :return: {spot title: {platform name: ticket data}}
    '''
    # Renamed from the original 'list' to avoid shadowing the builtin, and
    # replaced the 'merged' flag + order-dependent double pop with an
    # explicit seed step.
    platforms = [feizhu, xiecheng, tuniu, quna, lvmama, dahe, klook, tongcheng]
    first = platforms.pop(0)
    spotsInfo = merge2(first.spotsInfo, platforms.pop(0), first.name)
    while platforms:
        spotsInfo = merge2(spotsInfo, platforms.pop(0), '')
    return spotsInfo