新建一个scrapy工程
(python35) ubuntu@ubuntu:~/scrapy_project$ scrapy startproject huaban
添加一个spider
(python35) ubuntu@ubuntu:~/scrapy_project/huaban/huaban/spiders$ scrapy genspider huaban_pets huaban.com
目录结构如下:
(python35) ubuntu@ubuntu:~/scrapy_project/huaban$ tree -I "*.pyc"
.
├── huaban
│ ├── __init__.py
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── __pycache__
│ ├── settings.py
│ └── spiders
│ ├── huaban_pets.py
│ ├── __init__.py
│ └── __pycache__
└── scrapy.cfg
编辑items.py文件
# -*- coding: utf-8 -*-
import scrapy
class HuabanItem(scrapy.Item):
    """Item holding a single scraped image: just the original image URL."""
    # Absolute URL of the full-size image (the '_fw236' thumbnail suffix
    # is stripped by the spider before this field is filled in).
    img_url = scrapy.Field()
编辑huaban_pets.py
# -*- coding: utf-8 -*-
import scrapy

from huaban.items import HuabanItem


class HuabanPetsSpider(scrapy.Spider):
    """Spider that collects original-size image URLs from huaban.com's pets board."""
    name = 'huaban_pets'
    allowed_domains = ['huaban.com']
    start_urls = ['http://huaban.com/favorite/pets/']

    def parse(self, response):
        """Extract every waterfall image src and yield one HuabanItem per image.

        A thumbnail src looks like
        //img.hb.aicdn.com/<hash>_fw236 — removing the trailing
        '_fw236' (6 characters) gives the original image URL.
        """
        for img_src in response.xpath('//*[@id="waterfall"]/div/a/img/@src').extract():
            # BUG FIX: the original instantiated the undefined
            # HuabanmeinvItem (NameError at runtime); the item class
            # declared in items.py is HuabanItem, now imported above.
            item = HuabanItem()
            # src is protocol-relative; prepend 'http:' and drop '_fw236'.
            item['img_url'] = 'http:' + img_src[:-6]
            yield item
编写一个中间件,使用 PhantomJS 获取网页源码
在middlewares.py添加如下内容:
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


class JSPageMiddleware(object):
    """Downloader middleware that renders JavaScript-heavy pages with PhantomJS."""

    def process_request(self, request, spider):
        """Fetch request.url with headless PhantomJS and return rendered HTML.

        Returns an HtmlResponse (which short-circuits Scrapy's normal
        download) for the huaban_pets spider; returns None for any other
        spider, or on a WebDriver failure, so Scrapy handles the request
        itself.
        """
        # BUG FIX: the original compared against 'hbmeinv', which never
        # matches the spider defined in this project (name = 'huaban_pets'),
        # so the middleware was effectively dead code.
        if spider.name != 'huaban_pets':
            return None

        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Skip image loading: page fetches are much faster without it.
        dcap["phantomjs.page.settings.loadImages"] = False
        # NOTE(review): the original path was
        # '/home/ubuntu/scrapy_project/huabanphantomjs' — presumably a
        # missing slash before 'phantomjs'; confirm the binary's location.
        browser = webdriver.PhantomJS(
            executable_path=r'/home/ubuntu/scrapy_project/huaban/phantomjs',
            desired_capabilities=dcap)
        try:
            browser.get(request.url)
            return HtmlResponse(url=browser.current_url,
                                body=browser.page_source,
                                encoding='utf-8',
                                request=request)
        except WebDriverException:
            # Narrowed from the original bare 'except:', which also
            # swallowed KeyboardInterrupt/SystemExit.
            print("get page failed!")
        finally:
            # Always release the browser process, success or failure.
            browser.quit()
        # Implicit None here replaces the original dangling 'el return'
        # (a syntax error); None tells Scrapy to download the page itself.
在pipelines.py中添加如下内容下载网页图片:
# -*- coding: utf-8 -*-
# BUG FIX: in Python 3 a bare 'import urllib' does not expose the
# urllib.request submodule, so urllib.request.urlretrieve raised
# AttributeError; the submodule must be imported explicitly.
import urllib.request


class HuabanmeinvPipeline(object):
    """Pipeline that downloads each scraped image into the project's image/ dir."""

    def process_item(self, item, spider):
        """Download item['img_url'] and save it named after the URL's last path segment.

        Returns the item unchanged so later pipelines can process it.
        """
        url = item['img_url']
        # Everything after the final '/' becomes the file's base name.
        filename = r'/home/ubuntu/scrapy_project/huaban/image/%s.jpg' % url[url.rfind('/') + 1:]
        urllib.request.urlretrieve(url, filename=filename)
        return item
在settings.py中启用添加的中间件并设置请求头
# Default headers sent with every request (identify as an old IE to the site).
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN',
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# BUG FIX: the project created by 'scrapy startproject huaban' is the
# 'huaban' package; the original 'huabanmeinv.*' dotted paths would make
# Scrapy fail with ModuleNotFoundError when loading these components.
DOWNLOADER_MIDDLEWARES = {
    'huaban.middlewares.JSPageMiddleware': 543,
}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'huaban.pipelines.HuabanmeinvPipeline': 300,
}
开始爬取
ubuntu@ubuntu:~/scrapy_project/huaban/huaban/spiders$ scrapy runspider huaban_pets.py
爬取结束后,就可以在/home/ubuntu/scrapy_project/huaban/image目录下看到爬取的图片了,例如: