I wrote this because I once saw an article about it on some WeChat official account, but even after following the account I couldn't get the source code, so I wrote my own scraper for this site's images. Personally I'd say the pictures on the site are just so-so.
Getting started
Environment: Python 3; it runs fine on both Windows and Linux.
First, create the crawler project and spider.
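If you are starting from scratch, a sketch of the usual Scrapy commands; the project name pa_mzitu is assumed to match the imports in the code below, and the -t crawl template matches the CrawlSpider used here:
scrapy startproject pa_mzitu
cd pa_mzitu
scrapy genspider -t crawl mydomain www.mzitu.com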
Then configure the settings file.
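A minimal sketch of the settings.py entries this project needs; the pipeline path follows from the code below, while the delay and storage directory are assumptions you can tune:
# settings.py (excerpt)
ROBOTSTXT_OBEY = False      # the site's robots.txt would otherwise block the crawl
DOWNLOAD_DELAY = 1          # be gentle with the site; this value is an assumption
# Enable the custom image pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'pa_mzitu.pipelines.PaMzituPipeline': 300,
}
# Root directory where ImagesPipeline stores the downloaded files
IMAGES_STORE = './images'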
Spider code:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from pa_mzitu.items import PaMzituItem


class MydomainSpider(CrawlSpider):
    name = 'mydomain'
    allowed_domains = ['www.mzitu.com']
    start_urls = ['https://www.mzitu.com']
    # Follow every paginated list page plus the home page itself
    page_links = LinkExtractor(allow=r"/page/\d+/")
    page_home = LinkExtractor(allow=r"https://www.mzitu.com/$")
    # Link extraction rules; callback parses the response of each extracted link
    rules = [
        Rule(link_extractor=page_links, callback="parse_page", follow=True),
        Rule(link_extractor=page_home, callback="parse_page", follow=True),
    ]

    def parse_page(self, response):
        # Extract the page number from the URL
        page_name = re.findall(r'/page/(\d+)/', response.url)
        if page_name:
            page_name = "page_%s" % page_name[0]
        else:
            page_name = "page_1"
        # Match the links to the gallery detail pages
        xps = response.xpath("//ul[@id='pins']/li/a/@href").extract()
        for img_url in xps:
            yield scrapy.Request(img_url, callback=self.parse_item,
                                 meta={'page_name': page_name})

    def parse_item(self, response):
        # Link to the next image inside the gallery
        next_img_url = response.xpath("//div[@class='pagenavi']/a[last()]/@href").extract()[0]
        # Folder name of the list page, passed along via meta
        page_name = response.meta['page_name']
        if re.search(r'/\d+/\d+', next_img_url):
            yield scrapy.Request(url=next_img_url, callback=self.parse_item,
                                 meta={'page_name': page_name})
        item = PaMzituItem()
        # Folder name (the gallery title)
        file_name = response.xpath("//div[@class='main-image']/p/a/img/@alt").extract()[0]
        # Image URL
        img_url = response.xpath("//div[@class='main-image']/p/a/img/@src").extract()[0]
        item['page_name'] = page_name
        item['file_name'] = file_name
        item['img_url'] = img_url
        item['img_referer'] = response.url
        yield item
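As a quick sanity check on the page-number logic in parse_page, a standalone snippet; the URLs here are illustrative:
import re
print(re.findall(r'/page/(\d+)/', 'https://www.mzitu.com/page/3/'))  # ['3']
print(re.findall(r'/page/(\d+)/', 'https://www.mzitu.com/'))         # [], falls back to "page_1"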
items.py code:
# -*- coding: utf-8 -*-
import scrapy


class PaMzituItem(scrapy.Item):
    # Image URL
    img_url = scrapy.Field()
    # Referer (the page the image came from)
    img_referer = scrapy.Field()
    # Folder name (gallery title)
    file_name = scrapy.Field()
    # Folder name of the list page
    page_name = scrapy.Field()
pipelines.py code. Downloading the images uses Scrapy's built-in ImagesPipeline:
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class PaMzituPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Download the image; if the item carried a collection of URLs,
        # you would loop and yield one request per URL.
        # The meta data comes from the spider and is passed on to file_path below.
        headers = {
            "Referer": item['img_referer'],
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        }
        yield scrapy.Request(url=item['img_url'], headers=headers,
                             meta={'file_name': item['file_name'],
                                   'page_name': item['page_name']})

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples;
        # the boolean tells whether the download succeeded
        if not results[0][0]:
            raise DropItem('Download failed')
        return item
    # Rename the file; without overriding this method the image is saved
    # under its content hash, i.e. a meaningless string of characters
    def file_path(self, request, response=None, info=None):
        # Receive the names passed in via meta above
        page_name = request.meta['page_name']
        file_name = request.meta['file_name']
        # Use the last segment of the URL as the image file name
        image_name = request.url.split('/')[-1]
        # Strip characters that Windows forbids in folder names,
        # otherwise the directory cannot be created
        folder_strip_page = re.sub(r'[\\/:*?"<>|]', '', str(page_name))
        folder_strip = re.sub(r'[\\/:*?"<>|]', '', str(file_name))
        # Key to per-folder storage: list-page folder / gallery folder / image name
        return u'{0}/{1}/{2}'.format(folder_strip_page, folder_strip, image_name)
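To make the naming concrete, a small standalone sketch of what file_path produces; the gallery title and file name below are made up:
import re

def sanitize(name):
    # Same character class as in file_path above
    return re.sub(r'[\\/:*?"<>|]', '', str(name))

# Stored under IMAGES_STORE as page_1/Some Gallery/01.jpg
print(u'{0}/{1}/{2}'.format(sanitize('page_1'), sanitize('Some: Gallery?'), '01.jpg'))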
Once all of this is in place, just run the spider to start crawling. If you get a missing-module error, install the missing dependency.
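For reference, the usual dependencies are Scrapy itself and Pillow, which ImagesPipeline needs for image handling:
pip install scrapy pillow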
Run (using the spider name defined above):
scrapy crawl mydomain
A screenshot of the result is attached.
I'll put the code up on github some other day. If I've gotten anything wrong, corrections are welcome. That's all for today.