pythonwindows管道_python文件管道下载图集

最新推荐文章于 2023-05-22 22:46:21 发布

weixin_39843698

最新推荐文章于 2023-05-22 22:46:21 发布

阅读量65

点赞数

文章标签： pythonwindows管道

# -*- coding: utf-8 -*-

import re

from time import sleep

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

class AngelSpider(CrawlSpider):
    """Collect every image URL of a gallery on angelimg.spbeen.com into one item.

    The crawl rule picks gallery links (``/ang/<digits>``) from the start
    page and hands them to :meth:`parse_item`.  ``parse_item`` then walks the
    gallery's "next" links itself, carrying a single item through
    ``Request.meta`` so that all image URLs of the gallery end up in the same
    ``file_urls`` list.  The item is yielded once no "next" link is found.
    """

    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['http://angelimg.spbeen.com/']
    # Kept for backward compatibility; URLs are now resolved with
    # ``response.urljoin`` instead of string concatenation.
    base_url = "http://angelimg.spbeen.com"

    # Used as ``dir_name`` when a gallery has no usable title, so that
    # downstream code can always rely on the key being present.
    default_dir_name = 'unknown'

    # Strips everything except CJK ideographs, ASCII digits and ASCII letters
    # from the gallery title before it is used as a directory name.
    _dir_name_cleaner = re.compile(
        u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])"
    )

    rules = (
        # Dots are escaped so they only match a literal '.'.
        Rule(
            LinkExtractor(allow=r'^http://angelimg\.spbeen\.com/ang/\d+$'),
            callback='parse_item',
            follow=False,
        ),
    )

    def parse_item(self, response):
        """Parse one gallery page and either continue to the next page or emit the item.

        :param response: the downloaded gallery page.  ``response.meta['item']``
            holds the item being accumulated when this is not the first page.
        :returns: a generator yielding either a follow-up ``scrapy.Request``
            for the next page or the finished item ``dict`` with the keys
            ``files``, ``file_urls`` and ``dir_name``.
        """
        self.logger.debug('Parsing gallery page %s', response.url)

        # Reuse the item carried over from the previous page, if any.
        item = response.meta.get('item')
        if not item:
            item = {'files': [], 'file_urls': []}

        # The title may be missing on malformed pages; only overwrite the
        # directory name when a non-empty cleaned title is available.
        title = response.xpath('.//div[@class="article"]/h2/text()').extract_first()
        if title:
            dir_name = self._dir_name_cleaner.sub("", title.split('【')[0])
            if dir_name:
                item['dir_name'] = dir_name
        item.setdefault('dir_name', self.default_dir_name)

        # Skip pages without an image instead of queuing a ``None`` URL.
        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        if img_url:
            item['file_urls'].append(response.urljoin(img_url))

        # Request the next page if there is one; otherwise hand the
        # accumulated item over to the pipelines.
        next_url = response.xpath(
            './/div[@class="page"]//a[contains(@class,"next")]/@href'
        ).extract_first()
        if next_url:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_item,
                meta={'item': item},
            )
        else:
            yield item

pipelines.py：自定义管道继承 Scrapy 的文件管道（FilesPipeline）

# -*- coding: utf-8 -*-

# Define your item pipelines here

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import hashlib

import os

from scrapy.pipelines.files import FilesPipeline

class AngelimgPipeline(object):
    """Pass-through item pipeline: returns every item unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as-is.

        :param item: the scraped item.
        :param spider: the spider that produced the item (unused).
        :returns: the same ``item`` object.
        """
        return item

from scrapy.http import Request

from scrapy.utils.python import to_bytes

class DealFilePathPipeline(FilesPipeline):
    """FilesPipeline that stores each download in a per-item sub-directory.

    Files of an item that carries a ``dir_name`` are stored as
    ``full2/<dir_name>/<sha1-of-url><ext>``.  When no directory name is
    available the path falls back to ``full/<sha1-of-url><ext>``.
    """

    def get_media_requests(self, item, info):
        """Build one download request per URL listed in the item.

        The item is attached to each request's ``meta`` so that
        :meth:`file_path` can read ``dir_name`` from it.

        :param item: the scraped item; its URL list is read from the field
            named by ``self.files_urls_field``.
        :param info: pipeline bookkeeping object (unused here).
        :returns: list of ``Request`` objects; empty or ``None`` URLs are skipped.
        """
        urls = item.get(self.files_urls_field, [])
        return [Request(url, meta={'item': item}) for url in urls if url]

    @staticmethod
    def _warn_file_key_deprecated():
        """Emit the deprecation warning for the legacy ``file_key(url)`` API."""
        from scrapy.exceptions import ScrapyDeprecationWarning
        import warnings

        warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                      'file_path(request, response=None, info=None) instead',
                      category=ScrapyDeprecationWarning, stacklevel=1)

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path (relative to ``FILES_STORE``) for a download.

        :param request: the download ``Request``; a plain URL string is also
            accepted for the deprecated ``file_key(url)`` call path.
        :param response: the download response (unused).
        :param info: pipeline bookkeeping object (unused).
        :param item: the item being processed, when the caller supplies it;
            otherwise the item stored in ``request.meta`` is used.
        :returns: ``'full2/<dir_name>/<sha1><ext>'`` when a directory name is
            known, else ``'full/<sha1><ext>'``.
        """
        # Legacy call path: file_key() forwards a bare URL string, which has
        # no ``meta`` to read an item from.
        if isinstance(request, Request):
            url = request.url
            meta_item = request.meta.get('item')
        else:
            self._warn_file_key_deprecated()
            url = request
            meta_item = None

        # Honour a subclass that still overrides the deprecated file_key().
        if not hasattr(self.file_key, '_base'):
            self._warn_file_key_deprecated()
            return self.file_key(url)

        if item is None:
            item = meta_item or {}

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        media_ext = os.path.splitext(url)[1]

        dir_name = item.get('dir_name')
        if dir_name:
            return 'full2/{}/{}{}'.format(dir_name, media_guid, media_ext)
        # No directory name available: use the flat default layout.
        return 'full/%s%s' % (media_guid, media_ext)

    # deprecated
    def file_key(self, url):
        """Deprecated alias that forwards a bare URL to :meth:`file_path`."""
        return self.file_path(url)

    # Marker read by file_path() to tell this base implementation apart from
    # an override in a subclass.
    file_key._base = True

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for angelImg project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'angelImg'

SPIDER_MODULES = ['angelImg.spiders']
NEWSPIDER_MODULE = 'angelImg.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'angelImg (+http://www.yourdomain.com)'

# Obey robots.txt rules (disabled for this project)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    #'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Referer": "http://angelimg.spbeen.com/",
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'angelImg.pipelines.AngelimgPipeline': 300,
    'angelImg.pipelines.DealFilePathPipeline': 200,
    #'scrapy.pipelines.files.FilesPipeline': 2
}

# Root directory under which the files pipeline stores downloads
FILES_STORE = 'file_doload'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'