一个用 Scrapy 下载图片的爬虫 demo
spider
import scrapy
from scrapydownloader.items import ScrapydownloaderItem
class DownloaderSpider(scrapy.Spider):
    """Collect image URLs and titles from the chinaz picture index page.

    Every thumbnail found on the start page is emitted as a
    ``ScrapydownloaderItem`` carrying ``img_name`` and ``img_src``.
    """

    name = 'downloader'
    # allowed_domains = ['xxx.com']
    start_urls = ['https://sc.chinaz.com/tupian/']

    def parse(self, response):
        """Yield one item per thumbnail that exposes a lazy-load URL.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Yields:
            ScrapydownloaderItem: ``img_name`` holds the image's ``alt``
            text and ``img_src`` holds the absolute image URL.
        """
        self.logger.info('Status Code: %s', response.status)
        img_list = response.xpath('//div[@class="container"]/div[2]/div')
        for img in img_list:
            # The page lazy-loads its images: ``src`` only becomes a
            # requestable address once the image scrolls into view, so the
            # URL is read from the ``data-original`` attribute instead.
            img_url = img.xpath("./img/@data-original").extract_first()
            img_name = img.xpath("./img/@alt").extract_first()
            if not img_url:
                # extract_first() returns None when the attribute is absent;
                # skip the entry rather than emit a URL like "https:None".
                self.logger.debug('Skipping entry without data-original: %s',
                                  img_name)
                continue
            self.logger.debug('img_url: %s, img_name: %s', img_url, img_name)

            item = ScrapydownloaderItem()
            item['img_name'] = img_name
            # urljoin resolves protocol-relative ("//host/path") and relative
            # URLs against the page URL, and leaves absolute URLs untouched.
            item['img_src'] = response.urljoin(img_url)
            yield item
items也贴出来
import scrapy
class ScrapydownloaderItem(scrapy.Item):
    """Container for one image discovered by the spider."""

    # URL of the image file to download.
    img_src = scrapy.Field()
    # Title of the image (the spider fills it from the ``alt`` attribute).
    img_name = scrapy.Field()
pipelines
from scrapy.pipelines.images import ImagesPipeline
import scrapy
from scrapy.exceptions import DropItem
class ImgsPipLine(ImagesPipeline):
    """Image pipeline that downloads ``item['img_src']`` and names the file
    after the last segment of the image URL's path.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image URL.

        Args:
            item: The scraped item; its ``img_src`` field is the image URL.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).
        """
        yield scrapy.Request(url=item['img_src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for a downloaded image.

        Without this override the pipeline names files after a hash of the
        URL. The name is taken from the URL *path*, so a query string or
        fragment never ends up in the file name.

        Args:
            request: The image download request.
            response: The download response, if available.
            info: Pipeline bookkeeping object supplied by Scrapy.
            item: The item being processed. Keyword-only; accepted because
                current Scrapy versions pass it to ``file_path``.

        Returns:
            str: The file name to store the image under.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urlparse

        imgname = urlparse(request.url).path.split('/')[-1]
        if not imgname:
            # A URL ending in "/" has no usable file name; fall back to the
            # default hashed path instead of returning an empty string.
            return super().file_path(request, response=response, info=info)
        print("imgname :%s" % imgname)
        return imgname

    def item_completed(self, results, item, info):
        """Pass the item on to the next pipeline, or drop it if no image
        was downloaded.

        Args:
            results: Iterable of ``(ok, value)`` pairs, one per requested
                image; ``value['path']`` is read for successful downloads.
            item: The item whose images were requested.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Returns:
            The unchanged ``item``.

        Raises:
            DropItem: If none of the item's downloads succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        print("image_paths: %s" % image_paths)
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
Middleware 中添加了随机 User-Agent 和 IP 代理两种伪装。
IP 代理池中的地址是爬取到的免费代理,存储在 Redis 中。
import random
# UserAgent 随机获取
'''
Scrapy内置的Downloader Middleware为Scrapy提供了基础的功能,
但在项目实战中我们往往需要单独定义Downloader Middleware。不用担心,这个过程非常简单,我们只需要实现某几个方法即可。
每个Downloader Middleware都是定义了一个或多个方法的类,核心的方法有如下三个。
process_request(request, spider)。
process_response(request, response, spider)。
process_exception(request, exception, spider)。
'''
class RandomUserAgentMiddleware:
    """Downloader middleware that gives every outgoing request a randomly
    chosen User-Agent and a Referer equal to the request's own URL.
    """

    # Pool of browser identification strings to pick from.
    _USER_AGENT_POOL = (
        'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2820.59 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2762.73 Safari/537.36',
    )

    def __init__(self):
        # Each instance gets its own mutable list, as callers may read or
        # modify ``user_agents``.
        self.user_agents = list(self._USER_AGENT_POOL)

    def process_request(self, request, spider):
        """Decorate an outgoing request before it is sent.

        Sets the ``referer`` header to the request's own URL (so the request
        looks like in-site navigation, as an anti-anti-crawler measure) and
        picks a random ``User-Agent`` from ``self.user_agents``.

        Returns:
            None.
        """
        page_url = request.url
        if page_url:
            request.headers['referer'] = page_url
        chosen_agent = random.choice(self.user_agents)
        request.headers['User-Agent'] = chosen_agent

    def process_response(self, request, response, spider):
        """Hand the response back unchanged."""
        return response
import requests
'''
IP 代理取自部署在 docker 中的代理池服务,每次请求随机获取一个代理
'''
class IpProxyDownloaderMiddleware:
    """Downloader middleware that routes each request through a proxy
    obtained from a proxy-pool HTTP service.

    Attributes:
        proxy_pool_url: Endpoint queried for one proxy address per request.
        timeout: Seconds to wait for the pool service before giving up.
    """

    proxy_pool_url = 'http://192.168.0.102:5555/random'
    timeout = 5

    def process_request(self, request, spider):
        """Fetch a proxy address and attach it to ``request.meta['proxy']``.

        Args:
            request: The outgoing request to be proxied.
            spider: The running spider; its logger is used for warnings.

        Returns:
            None. If the pool returns an empty body, the request is left
            without a proxy and a warning is logged.

        Raises:
            requests.RequestException: If the pool service cannot be
                reached in time or answers with an HTTP error status.
        """
        # requests.get is a blocking call; the timeout keeps an unreachable
        # pool service from stalling the request indefinitely.
        reply = requests.get(self.proxy_pool_url, timeout=self.timeout)
        # Without this check an error page body would be used as the proxy.
        reply.raise_for_status()

        # Strip surrounding whitespace/newlines so the proxy URL is valid.
        pro_addr = reply.text.strip()
        if not pro_addr:
            spider.logger.warning(
                'Proxy pool %s returned no address; sending %s without proxy',
                self.proxy_pool_url, request.url)
            return None

        # Only add a scheme when the pool returned a bare "host:port".
        if '://' not in pro_addr:
            pro_addr = 'http://' + pro_addr
        request.meta['proxy'] = pro_addr
settings的配置
# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Downloader middlewares to enable, with their order values.
DOWNLOADER_MIDDLEWARES = {
    "scrapydownloader.middlewares.RandomUserAgentMiddleware": 543,
    # Uncomment the next line to route requests through the IP proxy.
    # "scrapydownloader.middlewares.IpProxyDownloaderMiddleware": 544,
}

# Item pipelines to enable, with their order values.
ITEM_PIPELINES = {
    "scrapydownloader.pipelines.ImgsPipLine": 300,
}

# Directory where downloaded images are saved.
IMAGES_STORE = "./scrapydownloaderimg"