- 踩点:通过抓包工具,通过点击下一页观察地址栏的变化
以抓取校花网为例
http://www.xiaohuar.com/list-1-1.html
当点击下一页时变成
http://www.xiaohuar.com/list-1-2.html
还需观察其结束的位置
http://www.xiaohuar.com/list-1-43.html
# -*- coding: utf-8 -*-
import scrapy
import os
from pic.items import PicItem
class XhSpider(scrapy.Spider):
    """Spider for xiaohuar.com list pages (list-1-1.html .. list-1-43.html)."""
    name = 'xh'
    # allowed_domains must contain bare domains only — including the
    # "http://" scheme makes Scrapy's offsite filter reject every request.
    allowed_domains = ['www.xiaohuar.com']
    url = "http://www.xiaohuar.com/list-1-"
    end = ".html"
    offset = 1  # pages start at list-1-1.html; increment to crawl subsequent pages
    start_urls = [url + str(offset) + end]
在编写 def parse(self, response) 函数时要进行循环,代码片段如下:
if self.offset < 44:
self.offset += 1
yield scrapy.Request(self.url + str(self.offset)+self.end, callback=self.parse,dont_filter=True)
对于scrapy.Request函数有三个参数(url,callback=xxx,dont_filter=xxx)
url:资源地址
callback:回调函数,指定该请求的响应交由哪个函数处理
dont_filter:当你要 request 的地址和 allowed_domains 里面的设置冲突、
从而会被过滤掉时,将其设为 True 可以停用过滤功能;如果不冲突,可以不写
项目代码如下
XhSpider
# -*- coding: utf-8 -*-
import scrapy
import os
from pic.items import PicItem
class XhSpider(scrapy.Spider):
    """Crawl xiaohuar.com list pages 1..43 and yield one PicItem per image.

    Each item carries the image's display name (``name``) and its absolute
    URL (``addr``); pagination is driven by the ``offset`` counter.
    """
    name = 'xh'
    # allowed_domains must be bare domain names; a scheme ("http://")
    # here makes the offsite middleware filter out every request.
    allowed_domains = ['www.xiaohuar.com']
    url = "http://www.xiaohuar.com/list-1-"
    end = ".html"
    offset = 1  # valid pages are list-1-1.html .. list-1-43.html
    start_urls = [url + str(offset) + end]

    def parse(self, response):
        """Extract every image on the page, then queue the next page."""
        allPics = response.xpath('//div[@class="img"]/a')
        for pic in allPics:
            # Pull out the picture's caption and source URL; skip entries
            # that are missing either attribute instead of crashing.
            name = pic.xpath('./img/@alt').extract_first()
            addr = pic.xpath('./img/@src').extract_first()
            if name is None or addr is None:
                continue
            item = PicItem()
            item['name'] = name
            # src is site-relative, so prefix the host to get a full URL.
            item['addr'] = 'http://www.xiaohuar.com' + addr
            # Hand the scraped data to the item pipeline.
            yield item
        # Follow pagination up to the last page (43); dont_filter keeps the
        # duplicate/offsite filter from dropping the generated URL.
        if self.offset < 43:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset) + self.end,
                                 callback=self.parse, dont_filter=True)
Item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class PicItem(scrapy.Item):
    """Container for one scraped picture."""
    name = scrapy.Field()  # picture caption (from the img alt text)
    addr = scrapy.Field()  # absolute URL of the image file
pipelines
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import urllib.request
import os
class PicPipeline(object):
    """Download each scraped image and save it to SAVE_DIR as <name>.jpg."""

    # Target directory for downloaded images.
    SAVE_DIR = r'D:\MyDownloads\Download\tu'

    def process_item(self, item, spider):
        """Fetch item['addr'] with a browser User-Agent and write it to disk.

        Returns the item so any later pipelines still receive it
        (Scrapy's process_item contract).
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
        req = urllib.request.Request(url=item['addr'], headers=headers)
        # Use a context manager so the HTTP response (and its socket)
        # is always closed, even if read() fails.
        with urllib.request.urlopen(req) as res:
            data = res.read()
        # Create the target directory on first use instead of crashing.
        os.makedirs(self.SAVE_DIR, exist_ok=True)
        file_name = os.path.join(self.SAVE_DIR, item['name'] + '.jpg')
        with open(file_name, 'wb') as fp:
            fp.write(data)
        return item