import scrapy


class CnblSpider(scrapy.Spider):
    """Scrape the digg (vote) count of each post on the cnblogs.com "pick" pages."""

    name = 'cnbl'
    allowed_domains = ['cnblogs.com']
    # NOTE(review): '#p%s' is a URL *fragment*. Fragments are never sent to the
    # server, and Scrapy's duplicate filter canonicalizes them away, so these
    # 20 URLs all collapse into a single request for http://cnblogs.com/pick/.
    # If the site paginates via a query string or an AJAX endpoint, these URLs
    # must be rewritten — TODO confirm the real pagination URL.
    start_urls = ['http://cnblogs.com/pick/#p%s' % p for p in range(1, 21)]

    def parse(self, response):
        """Yield one item per post containing its digg (vote) count.

        The original code only printed the count, so the spider produced no
        items; yielding a dict makes the data reach Scrapy's item pipelines
        and feed exports while keeping the console output.
        """
        for blog in response.xpath('//div[@class="post_item"]'):
            digg = blog.xpath(
                'div[@class="digg"]/div[@class="diggit"]/span/text()'
            ).extract_first()
            print(digg)  # preserve the original console output
            yield {'digg': digg}
Notes: the start URLs are built by string formatting; the XPath `//div[@class="post_item"]` selects every `div` whose class is `post_item`; `extract_first()` returns the first matched result (or `None` if nothing matched).
import scrapy


class JulySpider(scrapy.Spider):
    """Spider for the julyedu.com course catalogue index page."""

    name = 'july'
    allowed_domains = ['julyedu.com']
    start_urls = ['https://www.julyedu.com/category/index']

    def parse(self, response):
        """Print each course's title and two info lines, and yield the title."""
        for box in response.xpath('//div[@class="course_info_box"]'):
            title = box.xpath('a/h4/text()').extract_first()
            tip_first = box.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first()
            tip_second = box.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first()
            print(title)
            print(tip_first)
            print(tip_second)
            yield {'title': title}
import scrapy


class QuoSpider(scrapy.Spider):
    """Spider for quotes.toscrape.com that walks every paginated listing page."""

    name = 'quo'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Print and yield each quote's text and author, then follow pagination."""
        for quote in response.xpath('//div[@class="quote"]'):
            text = quote.xpath('span[1]/text()').extract_first()
            author = quote.xpath('span[2]/small/text()').extract_first()
            print(text)
            print(author)
            yield {'text': text, 'author': author}

        # Follow the "next" button if the page has one; urljoin resolves the
        # relative href against the current page's URL before re-entering parse.
        next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)