Install the dependencies (Scrapy's ImagesPipeline needs Pillow):
pip install pillow
Create the spider project
scrapy startproject images_spider
cd images_spider
scrapy genspider five_three_images 53pic.com
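The genspider command only creates a bare skeleton; depending on your Scrapy version it looks roughly like the sketch below, and the rest of this post fills it in:

import scrapy


class FiveThreeImagesSpider(scrapy.Spider):
    name = 'five_three_images'
    allowed_domains = ['53pic.com']
    start_urls = ['http://53pic.com/']

    def parse(self, response):
        pass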
settings.py
Add these settings to the config:
LOG_LEVEL = "WARNING"
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2  # add a download delay; Scrapy is asynchronous, so slow it down a bit
ITEM_PIPELINES = {
    'images_spider.pipelines.ImagesSpiderPipeline': 300,
}
# Image save directory -> used by ImagesPipeline
IMAGES_STORE = 'beautiful_girl'
# File save directory -> used by FilesPipeline
FILES_STORE = 'beautiful_girl'
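For comparison only (this is not what this project does): if you did not need custom filenames, the stock ImagesPipeline could be enabled directly. By default it reads a list field named image_urls from the item and writes its results back to images; a minimal sketch of those settings:

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300,
}
IMAGES_STORE = 'beautiful_girl'
# the stock pipeline expects item['image_urls'] to be a list of image URLs;
# this project instead subclasses ImagesPipeline (see pipelines.py below) to control the filenames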
The spider file
five_three_images.py
import scrapy
from bs4 import BeautifulSoup
from images_spider.items import ImagesSpiderItem
"""
我用bs熟悉了,所以下面解析用的bs,基本的代码我就不解释了
"""
class FiveThreeImagesSpider(scrapy.Spider):
name = 'five_three_images'
allowed_domains = ['53pic.com']
start_urls = ['https://www.53pic.com/bizhi/meinv/index.html']
def parse(self, response, **kwargs):
soup = BeautifulSoup(response.text, 'lxml')
div_images_list = soup.find('div', attrs={'class': 'work-list-box'}).find_all('div',
attrs={'class': 'card-box'})
for div_images in div_images_list:
images_info = div_images.find('a')
images_href = images_info['href']
images_detail_href = response.urljoin(images_href)
print(f'爬取图片名称:{images_info["title"]},爬取图片url:{images_detail_href}')
yield scrapy.Request(
url=images_detail_href, # scrapy的url拼接
method='get',
callback=self.parse_detail,
)
# 这个地方的逻辑就是让他返回request,按照scrapy的框架的走法,让他绕一圈,最后让他回调到下面那个parse_detail函数上
# break # 测试爬第1页1个,就测试爬第2页第1个,测试爬第3页第1个,然后退出
"""
这下面的逻辑就是翻页,从下一页这个标签去找href,计算出页数,然后就可以控制抓取几页的图片,(也可以猛一点,只要有next_page就一直抓,直到没有为止),后边找出页面,还是让他返回request,交给自己 parse 这个函数,我这里没写请求的url ,可以优化加上
关于Request()的参数:
url: 请求地址
method: 请求方式
callback: 回调函数
errback: 报错回调
dont_filter: 默认False, 表示"不过滤", 该请求会重新进行发送
headers: 请求头.
cookies: cookie信息
"""
next_page = soup.find('a', text='下一页')['href']
page_number = next_page.split('/')[-1].split('.')[0].split('_')[1]
next_url = response.urljoin(next_page)
if int(page_number) <= 3: # 这样可以控制页数就爬3页
yield scrapy.Request(
url=next_url, # scrapy的url拼接
method='get',
callback=self.parse,
)
"""
这个函数没啥,就是解析图片信息,返回item,我们再造一个item
"""
def parse_detail(self, response, **kwargs):
soup = BeautifulSoup(response.text, 'lxml')
images_source_title = soup.find('h2').text
images_detail_info = soup.find('div', attrs={'class': 'cl mbv'}).find('img')
images_source_src = images_detail_info['src']
images_items = ImagesSpiderItem()
images_items['images_source_title'] = images_source_title
images_items['images_source_src'] = images_source_src
yield images_items
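As referenced in the docstring above, here is a minimal sketch of two methods that could be added to the spider class to exercise the optional Request() parameters. The handle_error method, the header, the cookie value and the page path are placeholders invented for illustration, not part of the original spider:

    def parse_with_options(self, response, **kwargs):
        yield scrapy.Request(
            url=response.urljoin('/bizhi/meinv/index_2.html'),  # placeholder next-page path
            method='get',
            callback=self.parse,
            errback=self.handle_error,  # hypothetical error callback, defined below
            dont_filter=False,          # keep the default duplicate filtering
            headers={'Referer': response.url},
            cookies={'demo': 'placeholder'},
        )

    def handle_error(self, failure):
        self.logger.warning(f'request failed: {failure.request.url}')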
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ImagesSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    images_source_title = scrapy.Field()
    images_source_src = scrapy.Field()
    images_path = scrapy.Field()  # this one gets used later, in the pipeline
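A quick note on why images_path has to be declared here even though the spider never sets it: a scrapy.Item only accepts keys that were declared as Field(). A small sketch of that behaviour (not from the original post):

from images_spider.items import ImagesSpiderItem

item = ImagesSpiderItem()
item['images_source_title'] = 'demo'             # fine: declared field
item['images_path'] = 'beautiful_girl/demo.jpg'  # fine: declared above for the pipeline to fill in
# item['something_else'] = 1  # would raise KeyError: unknown field, because it was never declared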
And finally, the pipeline
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import scrapy
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline

from images_spider import settings


# We need Scrapy's own media pipeline to do part of the work for us:
# some of this functionality only works if Scrapy's ImagesPipeline is involved.
class ImagesSpiderPipeline(ImagesPipeline):  # inherit from ImagesPipeline
    # def process_item(self, item, spider):
    #     return item

    def get_media_requests(self, item, info):  # hand over the image URL
        images_url = item['images_source_src']
        # If an item carries several images, loop and yield one Request per URL.
        # Just sending the request is enough; there is no need to handle the response,
        # ImagesPipeline saves the file automatically.
        # meta passes the item along with the request; without it the image info
        # would not reach file_path below.
        yield scrapy.Request(url=images_url, meta={"item": item})

    def file_path(self, request, response=None, info=None, *, item=None):  # decide the save path
        item = request.meta["item"]
        images_title = item['images_source_title']
        return f'{images_title}.jpg'

    # Last issue: other pipelines may need to know where the image was saved,
    # so write the full path back onto the item.
    def item_completed(self, results, item, info):
        status, r = results[0]
        file_path = settings.IMAGES_STORE + '/' + r['path']
        item['images_path'] = file_path
        return item
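As the last comment says, pipelines later in the chain can pick the saved path back up from the item. A minimal sketch of such a follow-up pipeline; the class name is made up for illustration, and it would also need adding to ITEM_PIPELINES with a priority higher than 300:

class ImagesPathLoggingPipeline:
    # runs after ImagesSpiderPipeline, so item['images_path'] is already filled in
    def process_item(self, item, spider):
        spider.logger.info(f"saved {item['images_source_title']} to {item['images_path']}")
        return item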
Run the spider
scrapy crawl five_three_images
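If you also want the scraped fields (title, source URL, saved path) dumped to a file, Scrapy's feed export can do that straight from the command line; for example, on recent Scrapy versions (older ones only have -o):

scrapy crawl five_three_images -O images.json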
As for the final result, I'll just show it briefly here.