#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-09-10 13:39:22
# Project: qunaer
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    """pyspider handler that crawls travel books listed on travel.qunar.com.

    The crawl starts at the travel-book list page, follows every book link
    and the pagination link, and extracts a small record from each book's
    detail page.
    """

    # No project-wide crawl options are overridden.
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the travel-book list page."""
        self.crawl(
            'http://travel.qunar.com/travelbook/list.htm',
            callback=self.index_page,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every travel book on a list page, then the next list page.

        Args:
            response: pyspider response for a list page; ``response.doc`` is
                queried with CSS selectors.
        """
        for link in response.doc('li > .tit > a').items():
            href = link.attr.href
            # Skip anchors without an href instead of handing None to crawl().
            if href:
                # Detail pages are fetched with fetch_type="js".
                self.crawl(href, callback=self.detail_page, fetch_type="js")

        # No ".next" link is found when there is no further page; only
        # follow it when it actually exists.
        next_url = response.doc(".next").attr.href
        if next_url:
            self.crawl(next_url, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of a single travel book.

        Args:
            response: pyspider response for a travel-book detail page.

        Returns:
            dict: the page URL plus title, per-capita cost, trip length,
            departure date, cover image URL and body text. Keys other than
            ``"url"`` are kept in Chinese.
        """
        doc = response.doc
        # "元" (yuan) and "天" (days) are unit suffixes appended to the raw text.
        costing = doc("li.f_item.howmuch > p > span.data").text() + "元"
        title = doc("#booktitle").text()
        days = doc(" li.f_item.howlong > p > span.data").text() + "天"
        departure_date = doc(" li.f_item.when > p > span.data").text()
        cover_img = doc(".cover_img").attr.src
        text = doc(".imglst").text()
        return {
            # A stray "cover_img" literal used to sit in front of this key;
            # implicit string concatenation turned it into "cover_imgurl".
            "url": response.url,
            "标题": title,  # title
            "人均费用": costing,  # per-capita cost
            "天数": days,  # number of days
            "出发日期": departure_date,  # departure date
            "封面": cover_img,  # cover image URL
            "正文": text,  # body text
        }