用scrapy爬取天堂图片库:
文件目录
items.py文件:定义字段
import scrapy
class ScraPictureItem(scrapy.Item):
    """Item holding one picture URL scraped from ivsky.com."""
    # Absolute URL of a single image (filled in by PicSpider.parse_item).
    pic_url = scrapy.Field()
spiders/pic.py:爬虫的主程序
# -*- coding: utf-8 -*-
import scrapy
from Scra_Picture.items import ScraPictureItem
class PicSpider(scrapy.Spider):
    """Crawl the nature-landscape section of ivsky.com.

    Listing pages (index_1 ... index_9) are parsed for album links; each
    album page yields one ScraPictureItem per image it contains.
    """
    name = 'PIC'

    def start_requests(self):
        """Generate requests for listing pages 1 through 9."""
        for page in range(1, 10):
            page_url = 'https://www.ivsky.com/tupian/ziranfengguang/index_{}.html'.format(page)
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Parse a listing page and follow each album link to its detail page."""
        for li in response.xpath('/html/body/div[3]/div[2]/ul/li'):
            html_url = li.xpath('./div/a/@href').extract_first()
            # extract_first() returns None when the <a> has no href;
            # the original string concatenation would raise TypeError.
            if not html_url:
                continue
            # urljoin resolves site-relative hrefs against https://www.ivsky.com
            yield scrapy.Request(url=response.urljoin(html_url),
                                 callback=self.parse_item)

    def parse_item(self, response):
        """Parse an album page and yield one item per image URL found."""
        for li_item in response.xpath('/html/body/div[3]/div[4]/ul/li'):
            li_url = li_item.xpath('./div/a/img/@src').extract_first()
            if not li_url:  # skip <li> entries that carry no <img src>
                continue
            item = ScraPictureItem()
            # src attributes may be protocol-relative (//img.ivsky.com/...);
            # prepend the scheme so the downloader gets a complete URL.
            item['pic_url'] = li_url if 'http:' in li_url else 'http:' + li_url
            yield item
settings.py文件:对项目进行配置
# Identify as a desktop Chrome/QQBrowser so the site serves normal pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 ' \
'(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400'
# Obey robots.txt rules (disabled so the crawl is not blocked by the site's policy)
ROBOTSTXT_OBEY = False
# The crawl needs no session state; disabling cookies reduces request overhead.
COOKIES_ENABLED = False
# Route every yielded item through the image-download pipeline (priority 300).
ITEM_PIPELINES = {
'Scra_Picture.pipelines.ScraPicturePipeline': 300,
}
# Directory where ImagesPipeline saves the downloaded files.
IMAGES_STORE = "D:/爬虫/scra_picture/JPG/"
管道下载pipelines.py文件:必须继承图片类
import scrapy
# NOTE: this import appeared twice in the original file; once is enough.
from scrapy.pipelines.images import ImagesPipeline


class ScraPicturePipeline(ImagesPipeline):
    """Download pipeline: must subclass ImagesPipeline so Scrapy stores
    each item's image under IMAGES_STORE."""

    def get_media_requests(self, item, info):
        # One download request per item; ImagesPipeline handles the
        # actual fetch and file naming.
        yield scrapy.Request(item['pic_url'])
为避免在命令行里运行程序
在项目下创建start.py文件启动项目:
from scrapy import cmdline

# Launch the spider programmatically instead of typing the command in a
# shell; the guard prevents the crawl from starting on accidental import.
if __name__ == '__main__':
    cmdline.execute("scrapy crawl PIC".split())
爬取结果:
#如果网站更新,程序可能不可用,因为爬虫具有一定的时效性