原理
根据规则从页面中提取到“下一页”或“其他分页”链接
用到模块
from pyquery import PyQuery as pq
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
完整代码
# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class SegSpider(CrawlSpider):
    """Crawl segmentfault.com HTML5 question listings.

    Follows "next page" / pagination links on the tag listing pages via the
    first rule, and hands each question detail page (``/q/<id>``) to
    ``parse_item``, which yields a dict with the page URL and title.
    """
    name = "seg"
    allowed_domains = ["segmentfault.com"]
    start_urls = (
        'http://segmentfault.com/t/html5?type=newest&page=1',
    )
    rules = (
        # Pagination links: no callback, so CrawlSpider just keeps following
        # them. Raw strings are used because '\&' is an invalid escape in a
        # plain string literal, and '?' must stay escaped for the regex.
        Rule(SgmlLinkExtractor(allow=(r'/t/html5\?type=newest&page=\d', ))),
        # Question detail pages: scraped by parse_item.
        Rule(SgmlLinkExtractor(allow=(r'/q/\d+', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract the URL and <title> text from a question page.

        :param response: the downloaded page response
        :yields: dict with keys 'url' (response URL) and 'title'
                 (text of the page's <title> element, via PyQuery)
        """
        doc = pq(response.body)
        item = dict()
        item['url'] = response.url
        item['title'] = doc('title').text()
        yield item