import scrapy import re from scrapy.http import Request from urllib import parse class JobboleSpider(scrapy.Spider): name = 'jobbole' allowed_domains = ['blog.jobbole.com'] start_urls = ['http://python.jobbole.com/all-posts/'] def parse(self, response): """ 1. 解析文章列表页中的文章url并且交给scrapy下载后完成解析 2. 获取下一页的url并交给scrapy下载,完成后交给parse """ # 获取当前页面所有文章的url并且交给scrapy进行下载 post_nodes = response.css("#archive .floated