scrapy爬虫--10分钟入门

最新推荐文章于 2024-09-27 10:11:28 发布

weixin_30530523

最新推荐文章于 2024-09-27 10:11:28 发布

阅读量68

点赞数

文章标签： python、爬虫、开发工具

原文链接：http://www.cnblogs.com/wujf-myblog/p/10815321.html

版权

# -*- coding: utf-8 -*-
# @Time    : 2019/4/18 9:10
# @Author  : wujf
# @Email   : 1028540310@qq.com
# @File    : mingyan_spider.py
# @Software: PyCharm

import scrapy

class mingyan(scrapy.Spider):
    """Spider that crawls lab.scrapyd.cn and saves quotes to per-author files.

    Starting from ``start_urls``, every ``div.quote`` element on a page is
    appended to ``<author>-语录.txt`` in the current working directory, then
    the "next page" link is followed until no such link is present.
    """

    # Alternative way to seed the crawl: build the initial requests by hand.
    #
    # name = "mingyan2"
    #
    # def start_requests(self):
    #     # URLs to crawl
    #     urls = [
    #         'http://lab.scrapyd.cn/page/1/',
    #         'http://lab.scrapyd.cn/page/2/'
    #     ]
    #     for url in urls:
    #         yield scrapy.Request(url=url, callback=self.parse)

    # Shorthand form: list the seed URLs in ``start_urls`` instead of
    # writing ``start_requests``.
    name = 'itemSpider'
    start_urls = [
        'http://lab.scrapyd.cn'
    ]

    @staticmethod
    def _safe_filename_part(value):
        """Return *value* as text with path separators replaced by ``_``.

        The value comes from scraped HTML, so it must not be able to point
        the output file at another directory.
        """
        return str(value).replace('/', '_').replace('\\', '_')

    def parse(self, response):
        """Append each quote on the page to its author's file, then paginate.

        Args:
            response: The Scrapy response for a quote listing page.

        Yields:
            scrapy.Request: A request for the next page (handled by this
            same method) when a ``li.next a`` link is present.
        """
        for quote in response.css('div.quote'):
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = ','.join(quote.css('.tags .tag::text').extract())

            # extract_first() returns None when nothing matches; skip the
            # entry instead of crashing on ``None + str`` below.
            if text is None:
                self.logger.warning('Quote without text on %s, skipped',
                                    response.url)
                continue

            file_name = '%s-语录.txt' % self._safe_filename_part(author)
            # Explicit UTF-8 so the Chinese text is written correctly
            # regardless of the platform's default locale encoding.
            with open(file_name, 'a', encoding='utf-8') as f:
                # Trailing newline keeps successive appended records from
                # running together on one line.
                f.write(text + '\n' + '标签：' + tags + '\n')

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            # urljoin() resolves a relative href such as "page/1" against
            # the URL of the current response, giving an absolute URL.
            next_page = response.urljoin(next_page)
            # Yielding the request hands it back to Scrapy for scheduling.
            yield scrapy.Request(next_page, callback=self.parse)

        # Earlier variant that stored the raw HTML of each page instead:
        #
        # page = response.url.split('/')[-2]
        # filename = 'mingyan-%s.html'%page
        #
        # with open(filename,'wb') as f:
        #     f.write(response.body)
        # self.log('保存文件：%s'%filename)

转载于:https://www.cnblogs.com/wujf-myblog/p/10815321.html