# items.py
import scrapy
class SoItem(scrapy.Item):
    """Container for one scraped image record."""

    # Absolute URL of the image; consumed by the download pipeline.
    img_url = scrapy.Field()
# so.py
# -*- coding: utf-8 -*-
import scrapy
import json
from ..items import SoItem
class SoSpider(scrapy.Spider):
    """Spider that pages through image.so.com's JSON "beauty" channel API."""

    name = 'so'
    allowed_domains = ['image.so.com']
    # start_urls = ['http://image.so.com/']
    # `sn` is the result offset; the endpoint returns 30 entries per page.
    url = 'http://image.so.com/zjl?ch=beauty&sn={}&listtype=new&temp=1'

    def start_requests(self):
        """Request the first four result pages (sn = 0, 30, 60, 90)."""
        for sn in range(0, 91, 30):
            full_url = self.url.format(sn)
            yield scrapy.Request(
                url=full_url,
                callback=self.parse_image
            )

    def parse_image(self, response):
        """Parse one JSON API response and yield an item per image entry.

        Fix: the original created a single SoItem before the loop and
        mutated/yielded that same instance for every entry, so any consumer
        holding references (e.g. a buffering pipeline) saw only the last
        URL.  A fresh item is now created for each entry.
        """
        data = json.loads(response.text)
        for img_dict in data['list']:
            item = SoItem()
            item['img_url'] = img_dict['qhimg_url']
            yield item
# pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import scrapy
class SoPipeline(ImagesPipeline):
    """Image pipeline that downloads the file referenced by each item."""

    def get_media_requests(self, item, info):
        """Emit one download request for the item's `img_url`."""
        download_request = scrapy.Request(url=item['img_url'])
        yield download_request
# run.py
from scrapy import cmdline

# Guard the entry point so importing this module does not launch the
# crawler as a side effect (the original executed at import time).
if __name__ == '__main__':
    cmdline.execute('scrapy crawl so'.split())