# 1. spiders — spider business logic (爬虫业务处理)
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import YmxItem
class SbSpider(scrapy.Spider):
    """Spider for www.mzitu.com.

    Walks the gallery list on the index page, follows each gallery's
    pagination, and yields one YmxItem per image page.
    """
    name = 'sb'
    base_url = 'https://www.mzitu.com/'
    start_urls = [base_url]

    def parse(self, response):
        """Extract every gallery link from the index page and follow it."""
        gallery_urls = response.xpath('//ul[@id="pins"]/li/a/@href').extract()
        for url in gallery_urls:
            yield scrapy.Request(url=url, callback=self.parse_a)

    def parse_a(self, response):
        """Read the page count from the pagination bar and request each page.

        Bug fix: the original called int() directly on extract_first(),
        which raises TypeError when the pagination element is missing and
        ValueError when the text is not numeric; both cases are now guarded.
        """
        last_page = response.xpath(
            '//div[@class="pagenavi"]/a[5]/span/text()').extract_first()
        if last_page is None or not last_page.isdigit():
            # No usable pagination bar: treat this as a single-page gallery.
            yield from self.scrapy_b(response)
            return
        for page in range(1, int(last_page) + 1):
            # Gallery pages are addressed as <gallery-url>/<page-number>.
            yield scrapy.Request(url=response.url + '/' + str(page),
                                 callback=self.scrapy_b)

    def scrapy_b(self, response):
        """Build the item for one image page: image URL plus title."""
        item = YmxItem()
        item['img_url'] = response.xpath(
            '//div[@class="main-image"]/p/a/img/@src').extract_first()
        item['title'] = response.xpath(
            '//div[@class="main-image"]//a/img/@alt').extract_first()
        yield item
# 2. Item container + pipeline (容器 + 管道)
class YmxItem(scrapy.Item):
    """Item carrying one scraped image: its source URL and its title."""
    title = scrapy.Field()    # alt text of the image, used in the saved filename
    img_url = scrapy.Field()  # URL of the image file to download
import time

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class YmxPipeline(ImagesPipeline):
    """Image download pipeline.

    Fetches item['img_url'] and stores the file as
    '<title>-<epoch-seconds>.<ext>'.
    """

    def get_media_requests(self, item, info):
        """Schedule the image download for one item.

        Bug fix: the item field is 'img_url' (as declared on YmxItem and
        filled by the spider); the original read the nonexistent
        'image_url' key and raised KeyError for every item. The title is
        passed via meta so file_path can use it in the filename.
        """
        yield scrapy.Request(item['img_url'], meta={'title': item['title']})

    def file_path(self, request, response=None, info=None):
        """Build the storage filename '<title>-<timestamp>.<ext>'.

        The epoch-seconds timestamp keeps files whose (possibly Chinese)
        titles collide from overwriting each other.
        """
        url = request.url
        return '{}-{}.{}'.format(request.meta['title'],
                                 int(time.time()),
                                 url.split('.')[-1])

    def item_completed(self, results, item, info):
        """Drop the item when no image was downloaded successfully.

        `results` is a list of (success, detail) tuples; on success the
        detail dict carries 'url' and 'path' keys.
        """
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item
# 3. settings.py configuration (settings 配置)
# Raw string: the original non-raw literal relied on '\爬' etc. not being
# escape sequences, which emits a SyntaxWarning on modern Python.
IMAGES_STORE = r'D:\爬虫数据中心\图片\图片'  # directory where ImagesPipeline stores files
# 4. Middleware: user-agent rotation + proxy pool (中间件配置用户代理 + 代理池)
# Omitted here — configure it yourself in middlewares.py/settings.py (此处省略, 自行配置).