import scrapy


class CnblSpider(scrapy.Spider):
    """Scrape the digg (vote) count of each post on the cnblogs.com "pick" pages."""

    name = 'cnbl'
    allowed_domains = ['cnblogs.com']
    # NOTE(review): '#p%s' is a URL *fragment*. Fragments are never sent to the
    # server, and Scrapy's duplicate filter canonicalizes them away, so these
    # 20 URLs all collapse into a single request for http://cnblogs.com/pick/.
    # If the site paginates via a query string or an AJAX endpoint, these URLs
    # must be rewritten — TODO confirm the real pagination URL.
    start_urls = ['http://cnblogs.com/pick/#p%s' % p for p in range(1, 21)]

    def parse(self, response):
        """Yield one item per post containing its digg (vote) count.

        The original code only printed the count, so the spider produced no
        items; yielding a dict makes the data reach Scrapy's item pipelines
        and feed exports while keeping the console output.
        """
        for blog in response.xpath('//div[@class="post_item"]'):
            digg = blog.xpath(
                'div[@class="digg"]/div[@class="diggit"]/span/text()'
            ).extract_first()
            print(digg)  # preserve the original console output
            yield {'digg': digg}
Notes: the start URLs are built by string formatting; the XPath `//div[@class="post_item"]` selects every `div` whose class is `post_item`; `extract_first()` returns the first matched result (or `None` if nothing matched).
import scrapy


class JulySpider(scrapy.Spider):
    """Spider for the julyedu.com course catalogue index page."""

    name = 'july'
    allowed_domains = ['julyedu.com']
    start_urls = ['https://www.julyedu.com/category/index']

    def parse(self, response):
        """Print each course's title and two info lines, and yield the title."""
        for box in response.xpath('//div[@class="course_info_box"]'):
            title = box.xpath('a/h4/text()').extract_first()
            tip_first = box.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first()
            tip_second = box.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first()
            print(title)
            print(tip_first)
            print(tip_second)
            yield {'title': title}
import scrapy


class QuoSpider(scrapy.Spider):
    """Spider for quotes.toscrape.com that walks every paginated listing page."""

    name = 'quo'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Print and yield each quote's text and author, then follow pagination."""
        for quote in response.xpath('//div[@class="quote"]'):
            text = quote.xpath('span[1]/text()').extract_first()
            author = quote.xpath('span[2]/small/text()').extract_first()
            print(text)
            print(author)
            yield {'text': text, 'author': author}

        # Follow the "next" button if the page has one; urljoin resolves the
        # relative href against the current page's URL before re-entering parse.
        next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)