驴妈妈旅游网站景点门票的爬取
通过分析网站的请求可以发现,驴妈妈网站并没有通过接口返回 JSON 格式的数据,所以这里采用的方式是直接爬取网页源代码并解析 HTML。
爬取的网页地址是:
'http://s.lvmama.com/ticket/H9K110000?keyword=' + keyword + '&tabType=route#list'
其中 keyword 是想要查询的关键词(下面的代码会先把去掉"市/县/省"后缀的城市名拼接在关键词前面)。
爬取该网页获取结果列表,结果包括景点的名称和详细url,紧接着进入到详细页面,解析源码,找到对应信息的标签获取即可。
def search_spots(self, keyword, city):
    """Scrape lvmama.com ticket listings matching *keyword* in *city*.

    The search-result page is fetched through ``self.getHtml``; for every
    scenic spot whose title is similar enough to *keyword*, the detail page
    and the per-ticket "more info" fragment are fetched and parsed.

    On success ``self.spotsInfo`` is set to a dict of the form
    ``{spot_title: {ticket_type_key: [ticket_dict, ...]}}``.  On any
    exception the error is printed and ``self.spotsInfo`` is left untouched.
    ``self.done`` is set to ``True`` in both cases.

    :param keyword: search keyword (scenic spot name).
    :param city: city to search in; the suffixes '市', '县' and '省' are
        stripped before it is prepended to the keyword.
    :return: None; results are stored on ``self``.
    """
    def squash(text):
        """Return *text* without spaces, newlines and tabs."""
        return text.replace(' ', '').replace('\n', '').replace('\t', '')

    city_name = city.replace('市', '').replace('县', '').replace('省', '')
    # Exact-match search results page.
    url = ('http://s.lvmama.com/ticket/H9K110000?keyword='
           + city_name + keyword + '&tabType=route#list')
    # Compiled once: the product id is embedded in the detail page source.
    product_id_re = re.compile(r'product_id: "([^"]*)"')
    try:
        html = self.getHtml(url)
        # Keep only the part before the "recommended for you" section.
        # partition() leaves the page intact when the marker is absent
        # (slicing with find() == -1 would drop the last character).
        html = html.partition('以下为您推荐')[0]
        soup = BeautifulSoup(html, "html.parser")
        scenics = soup.find_all('div', {'class': 'product-regular clearfix'})
        # Maps scenic-spot title -> tickets grouped by type key.
        spots = {}
        for scenic in scenics:
            tickets_by_type = {}
            a = scenic.find('a', {'class': 'product-picture'})
            if a is None:
                # Result card without a picture link: nothing to follow.
                continue
            # Skip results whose title barely resembles the keyword.
            if fuzz.token_sort_ratio(a['title'], keyword) <= 20:
                continue
            detail = self.getHtml(a['href'])
            detail_soup = BeautifulSoup(detail, "html.parser")
            tables = detail_soup.find_all('table', {'class': 'ptable table-full'})
            # Extract the product id; fall back to '' instead of leaving
            # the name unbound when the page does not contain it.
            id_match = product_id_re.search(detail)
            productid = id_match.group(1) if id_match else ''
            for table in tables:
                items = table.find_all('dl', {'class': 'ptditem'})
                # The first <dl> of each table is skipped (as in the
                # original code; presumably a header row - TODO confirm).
                for item in items[1:]:
                    # Price.
                    price = squash(
                        item.find('dd', {'class': 'pdlvprice'}).find('i').string)
                    # Booking-time description (optional).
                    try:
                        booktime = squash(
                            item.find('dd', {'class': 'pdAdvbookingTime'}).string)
                    except AttributeError:
                        # Tag missing, or it has no single string child.
                        booktime = ''
                    name_tag = item.find('dt', {'class': 'pdname'})
                    title = squash(name_tag.text)
                    try:
                        link = name_tag.find('a')
                        ticket_type = squash(link['key'])
                        data = squash(link['data'])
                    except (AttributeError, KeyError, TypeError):
                        # No link, or the link lacks the key/data attributes.
                        ticket_type = ''
                        data = ''
                    detail_url = ('http://ticket.lvmama.com/scenic_front/scenic/'
                                  'asyncLoadingTicketDetail.do?suppGoodsId=' + data
                                  + '&branchType=&productId=' + productid
                                  + '&bizCategoryId=11&key=')
                    more_info = self.getHtml(detail_url)
                    more_soup = BeautifulSoup(more_info, 'html.parser')
                    # Reset per ticket so a missing entry never raises
                    # NameError or leaks the previous ticket's value.
                    isreturn = ''
                    usetime = ''
                    for entry in more_soup.find_all('li'):
                        entry_text = entry.text
                        if '退票规则' in entry_text:
                            marker = entry.find('i')
                            if marker is not None:
                                isreturn = marker.text
                        if '有效期限' in entry_text:
                            usetime = entry_text.replace('有效期限', '')
                    # NOTE(review): the source's indentation was lost; this is
                    # assumed to run once per ticket, outside the <li> loop.
                    dis = more_soup.text.replace('收起', '')
                    # Merge into the per-type list for this scenic spot.
                    tickets_by_type.setdefault(ticket_type, []).append({
                        'name': title,
                        'type': ticket_type,
                        'price': price,
                        'url': a['href'],
                        'buy': '',
                        'from': '驴妈妈',
                        'isReturnable': isreturn,
                        'bookTime': booktime,
                        'outTime': '',
                        'useTime': usetime,
                        'discription': dis,
                    })
            spots[a['title']] = tickets_by_type
        self.spotsInfo = spots
    except Exception as e:
        # Best-effort scrape: report the failure and fall through so that
        # the completion flag below is still set.
        print(e)
    # NOTE(review): placement inferred from the flattened source - assumed to
    # run whether or not the scrape succeeded; confirm against the original.
    self.done = True
通过上述方法,就可以实现通过关键词和城市查询出景点门票的功能。