scrapy

Environment

Scrapy 2.3.0

python3.7

Scrapy 0.24.6 Chinese getting-started tutorial

Debugging with the shell

Commands are split into global and project-only; after cd-ing into a project directory the project-only commands become available as well (for example, crawl, check, list, edit and parse only work inside a project, while startproject, genspider, shell, fetch, view and version work anywhere).

Command-line tool usage

Runoob (菜鸟教程) tutorial -- XPath usage

#start the shell
scrapy shell [url|file]

#help
scrapy shell -h

#settings
settings.__dict__ #all current settings, including headers such as the user agent
settings.get('USER_AGENT')
'Scrapy/2.3.0 (+https://scrapy.org)'

#Test with qiushibaike.com; the site blocks the default crawler UA, so add a browser USER_AGENT. Mind the capitalization and the underscore in the setting name.
scrapy shell --set=USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/40.0.2214.111 Chrome/40.0.2214.111 Safari/537.36" https://www.qiushibaike.com/
>>> response
<200 https://www.qiushibaike.com/>
>>> response.headers
{b'Server': [b'openresty'],
 b'Date': [b'Mon, 21 Sep 2020 03:16:14 GMT'],
 b'Content-Type': [b'text/html; charset=UTF-8'],
 b'Set-Cookie': [b'_xsrf=2|46b435ed|e701954a6c5afd19da708aac09740f52|1600658174; Path=/'],
 b'Vary': [b'User-Agent, Accept-Encoding'],
 b'Etag': [b'"87f963dd02a197ab0c2cf4f2395d84f9631edaa0"']}
#xpath returns a list of Selectors (the In/Out examples below are taken from the 0.24.6 tutorial's dmoz page, not from qiushibaike)
In [1]: response.xpath('//title')
Out[1]: [<Selector xpath='//title' data=u'<title>Open Directory - Computers: Progr'>]

In [2]: response.xpath('//title').extract()
Out[2]: [u'<title>Open Directory - Computers: Programming: Languages: Python: Books</title>']

In [3]: response.xpath('//title/text()')
Out[3]: [<Selector xpath='//title/text()' data=u'Open Directory - Computers: Programming:'>]

In [4]: response.xpath('//title/text()').extract()
Out[4]: [u'Open Directory - Computers: Programming: Languages: Python: Books']

In [5]: response.xpath('//title/text()').re(r'(\w+):')
Out[5]: [u'Computers', u'Programming', u'Languages', u'Python']
>>> response.xpath('//span[@class="next"]/../@href').extract() #extract the attribute value
['/8hr/page/2/']

/html/head/title: selects the <title> element inside the <head> of the HTML document
/html/head/title/text(): selects the text of the <title> element above
//td: selects all <td> elements
//div[@class="mine"]: selects all <div> elements that have a class="mine" attribute
#CSS selectors
response.css('title::text').extract()
response.css('base::attr(href)').extract()
#robots.txt check
#Some sites allow crawling but only for certain crawlers; check the site's robots.txt to see which user agents are permitted
#scrapy  shell --set=USER_AGENT="Baiduspider"   http://www.sohu.com/a/337634404_100032610 
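
To see which crawlers a site permits, its robots.txt can also be fetched right inside the shell. A minimal sketch (only the standard /robots.txt path is assumed; the rest uses helpers the scrapy shell provides):

fetch('http://www.sohu.com/robots.txt')   #shell helper; rebinds `response` to the fetched page
print(response.text)                      #shows the User-agent / Allow / Disallow rules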



Text

Qiushibaike (糗事百科)

#settings.py

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/40.0.2214.111 Chrome/40.0.2214.111 Safari/537.36"
}
ITEM_PIPELINES = {
   'study.pipelines.StudyPipeline': 300, # the smaller the number, the earlier the pipeline runs
}

#pipelines.py
#Custom pipeline; Scrapy calls its methods in order
from itemadapter import ItemAdapter
class StudyPipeline:
    def __init__(self):
        self.f = open('./ret.txt', 'w', encoding='utf-8')  #output file for the scraped posts

    def open_spider(self, spider):
        print('start!')
    def process_item(self, item, spider):
        txt=item['author']+'\t'+item['body']+'\t'+\
            item['laugh']+' laughs'+'\t'+\
            item['comments']+' comments'+'\n'
        self.f.write(txt)

        return item
    def close_spider(self,spider):
        self.f.close()
        print("over!")
#qiushi_spider.py
import scrapy
from study.items import QiuShiItem

class QiuShi(scrapy.Spider):
    name='qiushibaike'
    allowed_domains=['qiushibaike.com']

    start_urls=['http://www.qiushibaike.com']


    def parse(self, response):
        for sel in response.xpath('//div[@class="recmd-right"]'):
            item = QiuShiItem()  #create a fresh item for every post
            item['author'] =sel.xpath('.//span[@class="recmd-name"]/text()').extract()[0]
            item['body'] = sel.xpath('./a/text()').extract()[0]
            item['laugh']= sel.xpath('.//div[@class="recmd-num"]/span[1]/text()').extract()[0]
            item['comments']=sel.xpath('.//div[@class="recmd-num"]/span[last()-1]/text()').extract()[0]
            # item=QiuShi.encodeItem(item)
            yield item
        next_page=response.xpath('//span[@class="next"]/../@href').extract_first()  #None when there is no next page, so the check below works
        # print(next_page)
        if next_page:
            next=self.start_urls[0] +next_page
            yield scrapy.Request(next,callback=self.parse)
        else:
            return
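
To run the spider, execute "scrapy crawl qiushibaike" from the project directory (or use a small main.py like the one in the two-pipelines section below); adding "-o ret.json" would additionally export the items through Scrapy's feed exporter.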

Images

By default the saved file name is computed from a hash (SHA1) of the image URL; override the file_path() method to change the naming.

#settings.py
ITEM_PIPELINES = {
   'pictest.pipelines.PictestPipeline': 300,
}
IMAGES_STORE='./pics'  #directory where downloaded images are saved
IMAGES_URLS_FIELD='pic_url'  #item field that holds the image URLs (default name: 'image_urls')
USER_AGENT = 'Baiduspider'  #sohu.com allows the Baidu crawler

#pipelines.py
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem

class PictestPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for url in item['pic_url']:
            yield Request(url)

    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path :
            raise DropItem('Item contains no images')
        item['image_paths'] = image_path
        return item
    #def file_path(self, request, response=None, info=None):
    #    PictestPipeline.pic_name += 1   #would need a class attribute pic_name = 0 (see the sketch below)
    #    return str(PictestPipeline.pic_name) + '.jpg'

#results >>(True, {'url': 'http://5b0988e595225.cdn.sohucs.com/images/20190830/1780022790ac42c4939c6ba7774cd9ed.jpeg', 'path': 'full/80db58946bd9c706b959529b2140e8e8fecc0360.jpg', 'checksum': '209e953b433db95eb84ed51650117ebc', 'status': 'downloaded'})
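
A working version of the commented-out numbering idea above, as a sketch only (the class name and the pic_name counter are mine, not part of the original project):

#Sketch: save downloaded files as 1.jpg, 2.jpg, ... instead of the SHA1 hash
from scrapy.pipelines.images import ImagesPipeline

class NumberedImagesPipeline(ImagesPipeline):
    pic_name = 0  #class-level counter shared by all requests

    def file_path(self, request, response=None, info=None):
        NumberedImagesPipeline.pic_name += 1
        return '%d.jpg' % NumberedImagesPipeline.pic_name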

#items.py
import scrapy
class PictestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pic_url=scrapy.Field()  #custom field for the image URLs; the default name is 'image_urls', overridden via IMAGES_URLS_FIELD in settings
    images_result_field=scrapy.Field()  #unused here; ImagesPipeline stores its results in 'images' by default (IMAGES_RESULT_FIELD setting)
    image_paths=scrapy.Field()  #filled in by item_completed() of the custom pipeline

#pic_spider.py
import  scrapy
from pictest.items import PictestItem
class Pic(scrapy.Spider):
    name='pictest'
    allowed_domains = ['sohu.com']
    start_urls = ['http://www.sohu.com/a/337634404_100032610']
    def parse(self, response):
        item = PictestItem()
        item['pic_url']=response.xpath('//article//img/@src').extract()
        yield item
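
Run it with "scrapy crawl pictest" from the project directory; with the settings above the images end up under ./pics/full/, each named by the SHA1 hash of its URL (as in the results line shown earlier) unless file_path() is overridden.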

Two pipelines

1. When using the Scrapy framework, make sure you first understand the execution order. Once the code is written and the program starts running:

  • The SPIDER's yield sends a request to the ENGINE
  • The ENGINE passes the request on to the SCHEDULER without any processing
  • The SCHEDULER (the URL scheduler) produces requests and hands them to the ENGINE
  • The ENGINE takes a request and sends it, filtered layer by layer through the MIDDLEWARES, to the DOWNLOADER
  • Once the DOWNLOADER has fetched the response data from the network, it again passes layer by layer through the MIDDLEWARES back to the ENGINE
  • The ENGINE returns the response data to the SPIDER, whose parse() method processes it and parses out items or new requests
  • The parsed items or requests are sent to the ENGINE
  • The ENGINE sends items to the ITEM PIPELINES and requests to the SCHEDULER
    Note: the whole program only stops when the scheduler holds no more requests (which also means Scrapy re-downloads URLs whose download failed; the retry settings are sketched below).
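The retry behaviour mentioned in the note is handled by Scrapy's built-in RetryMiddleware; a sketch of the settings that control it (the values shown are, to my knowledge, the defaults and are listed only for illustration):

#settings.py (sketch -- retry knobs of the built-in RetryMiddleware)
RETRY_ENABLED = True    #retry failed downloads at all
RETRY_TIMES = 2         #maximum retries per request, on top of the first attempt
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]  #response codes that trigger a retry
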
#main.py -- run the spider from an IDE for debugging
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from scrapy.cmdline import execute
import os
import sys

#add the project's absolute path to sys.path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'douban250'])

#settings.py

USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/40.0.2214.111 Chrome/40.0.2214.111 Safari/537.36"
# Obey robots.txt rules -- Douban's robots.txt blocks crawlers from the images, so disable it
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
   'douban250.pipelines.Douban250Pipeline': 100,
   'douban250.pipelines.Douban250AvatarPipeline': 200,
}
DOWNLOAD_DELAY = 5 #seconds to wait between requests
IMAGES_STORE='./pic'

#items.py
import scrapy
class Douban250Item(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    director=scrapy.Field()
    kinds=scrapy.Field()
    time=scrapy.Field()
    detail=scrapy.Field()
    intro=scrapy.Field()
    score=scrapy.Field()
    persons=scrapy.Field()
    image_urls=scrapy.Field()
    image_path=scrapy.Field()

#pipelines.py
from itemadapter import ItemAdapter
from scrapy.pipelines.images import  ImagesPipeline
from scrapy.http import Request
from scrapy.exceptions import DropItem
from  .settings import IMAGES_STORE

class Douban250Pipeline:

    def process_item(self, item, spider):

        fileName='%s/%s.txt'%(IMAGES_STORE,item['name'])
        txt="{} \n{}\n类型:{}\n简介:{}\n{}分  {}\n{}"\
            .format(item['name'],item['director'].strip(),item['kinds'].strip(),
                    item['detail'],item['score'],
                    item['persons'],item['intro'].strip())
        with open(fileName,'w',encoding='utf-8') as f:  #the text contains Chinese, so write UTF-8 explicitly
            f.write(txt)
        return item


class Douban250AvatarPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
       url=item['image_urls']

       yield Request(url,cb_kwargs={'name':item['name']})

    def item_completed(self, results, item, info):
        images_path = [x['path'] for ok,x in results if ok]
        if not images_path:
            raise DropItem("item contains no images")
        item['image_path'] = images_path
        return item

    def file_path(self, request, response=None, info=None):
        return '%s.jpg'%(request.cb_kwargs['name'])



#douban_spider.py
from scrapy import Spider,Request
from douban250.items import Douban250Item

class DouBan(Spider):
    name = 'douban250'
    start_urls = ['https://movie.douban.com/top250?start=50&filter=']
    def parse(self, response, **kwargs):
        for one in response.xpath('//div[@class="item"]'):
            item=Douban250Item()  #a fresh item per movie, so each Request below carries its own copy
            item['name']=one.xpath('.//div[@class="pic"]/a/img/@alt').extract_first()
            item['image_urls']=one.xpath('.//div[@class="pic"]/a/img/@src').extract_first()
            item['director'],item['kinds']= one.xpath('.//div[@class="bd"]/p[@class=""]/text()').extract()
            # item['time']=''
            item['detail']= one.xpath('.//span[@class="inq"]/text()').extract_first()
            item['score'], item['persons']= one.xpath('.//div[@class="star"]/span/text()').extract()
            # item['persons']=one.xpath('.//div[@class="star"]/span/text()')[1::2].extract()
            intro_link=one.xpath('.//div[@class="pic"]/a/@href').extract_first()

            # yield hands the Request to: SCHEDULER -> ENGINE -> MIDDLEWARES -> DOWNLOADER -> response
            # -> MIDDLEWARES -> ENGINE -> SPIDER (parse_intro) -> ENGINE -> item -> ITEM PIPELINES

            yield Request(intro_link,callback=self.parse_intro,meta={'item':item})
        next_page=response.xpath('//span[@class="next"]/a/@href').extract_first()
        # print(next_page)
        if next_page:
            
            yield Request(response.urljoin(next_page),callback=self.parse)
        else:
            return
    def parse_intro(self,response,**kwargs):
        item=response.meta['item']
        item['intro']=response.xpath('//span[@property="v:summary"]/text()').extract_first()
        yield item
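
Passing the partly-filled item through meta works, but cb_kwargs (already used by the avatar pipeline above, and available since Scrapy 1.7) is the cleaner way to hand data to the next callback. A sketch that is not part of the original project, reduced to the name field only:

#Sketch: the same parse -> parse_intro hand-off via cb_kwargs
from scrapy import Spider, Request

class DouBanCbKwargs(Spider):
    name = 'douban250_cbkwargs'   #hypothetical spider name, for illustration only
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response, **kwargs):
        for one in response.xpath('//div[@class="item"]'):
            item = {'name': one.xpath('.//div[@class="pic"]/a/img/@alt').extract_first()}
            intro_link = one.xpath('.//div[@class="pic"]/a/@href').extract_first()
            #the item travels as a keyword argument of the next callback instead of through meta
            yield Request(intro_link, callback=self.parse_intro, cb_kwargs={'item': item})

    def parse_intro(self, response, item):
        item['intro'] = response.xpath('//span[@property="v:summary"]/text()').extract_first()
        yield item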





 
