# settings (project configuration)
# Item pipelines: the project's own GirlsPipeline is enabled with order 300.
# Mapping a component to None disables it, so the built-in ImagesPipeline
# is switched off here.
ITEM_PIPELINES = {
    'girls.pipelines.GirlsPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': None,
}

# Storage directory setting read by Scrapy's ImagesPipeline.
# NOTE(review): ImagesPipeline is disabled above, so this value is presumably
# unused at the moment - confirm before removing.
IMAGES_STORE = r'F:\myScrapy\girls\girls\spiders\img'

# Headers sent with every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36',
}
# spider
# The selectors used here are CSS selectors.
# -*- coding: utf-8 -*-
import scrapy
from ..items import *
class GetgirlSpider(scrapy.Spider):
    """Spider that starts from one tag page on 2717.com and yields GirlsItem objects.

    Callback chain: ``parse`` -> ``get_href`` -> ``get_pic``. Only ``get_pic``
    produces items; the other two callbacks only schedule further requests.
    """

    name = 'getgirl'
    allowed_domains = ['2717.com']
    start_urls = ['https://www.2717.com/tag/1756.html']

    def parse(self, response):
        """Schedule requests for the links found on a tag page.

        Links under ``ul.w110.oh.Tag_list`` are handed to ``get_href``; links
        under ``div.TagPage`` are fed back into ``parse`` (presumably these
        are the pagination links of the tag listing - verify against the site).
        """
        for href_pic in response.css("ul.w110.oh.Tag_list li>a::attr(href)").getall():
            yield response.follow(href_pic, self.get_href)
        for href in response.css("div.TagPage li>a::attr(href)").getall():
            yield response.follow(href, self.parse)

    def get_href(self, response):
        """Follow every ``.html`` link inside ``ul.articleV4Page.l`` to ``get_pic``."""
        for href in response.css("ul.articleV4Page.l li>a::attr(href)").getall():
            # Links without ".html" in them are skipped.
            if ".html" in href:
                yield response.follow(href, self.get_pic)

    def get_pic(self, response):
        """Yield one GirlsItem for the first image inside a centered paragraph.

        ``src`` is the image URL resolved against the page URL and ``title``
        is the image's ``alt`` text (``None`` when the attribute is absent).
        Nothing is yielded when the page has no matching image.
        """
        src = response.css("p[align=center] img::attr(src)").get()
        if not src:
            # An item without an image URL cannot be downloaded later on,
            # so skip the page instead of emitting a broken item.
            self.logger.warning("No image found on %s", response.url)
            return

        item = GirlsItem()
        # urljoin leaves absolute URLs untouched and fixes relative ones.
        item['src'] = response.urljoin(src)
        item['title'] = response.css("p[align=center] img::attr(alt)").get()
        yield item
# items
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class GirlsItem(scrapy.Item):
    """Container for the data scraped for a single picture."""

    # Value of the image's ``src`` attribute, as set by the spider's get_pic callback.
    src = scrapy.Field()
    # Value of the image's ``alt`` attribute, as set by the spider's get_pic callback.
    title = scrapy.Field()
    # Declared but not assigned by any code shown in this file.
    # NOTE(review): presumably meant for the saved image's local path - confirm.
    img_path = scrapy.Field()
# pipelines: Scrapy's default pipeline is not used
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
import time
import urllib
import os
class GirlsPipeline(object):
    """Item pipeline that downloads each item's image and stores it as a ``.jpg`` file."""

    # Directory the images are written to. Kept as a class attribute so a
    # subclass or instance can redirect the output without touching the code.
    image_dir = "F:/myScrapy/girls/girls/spiders/PIC/"

    def process_item(self, item, spider):
        """Download ``item['src']`` and save the bytes under ``image_dir``.

        The file is named after the current timestamp (``time.time()``) with
        a ``.jpg`` suffix.

        Args:
            item: mapping-like item whose ``'src'`` entry holds the image URL.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``, so later pipelines can keep processing it.

        Raises:
            DropItem: if the item carries no image URL.
            urllib.error.URLError: if the download fails.
            OSError: if the target directory or file cannot be written.

        NOTE(review): ``urlopen`` is a synchronous call, so the crawl
        presumably stalls while each image downloads - confirm whether
        that is acceptable for this project.
        """
        # ``import urllib`` alone does not guarantee that the ``request``
        # submodule is loaded, so import it explicitly where it is used.
        import urllib.request

        src = item.get('src')
        if not src:
            raise DropItem("item has no image URL: %r" % (item,))

        # Fetch first, write second: a failed request must not leave an
        # empty file behind, and the response is closed deterministically.
        with urllib.request.urlopen(src) as resp:
            data = resp.read()

        os.makedirs(self.image_dir, exist_ok=True)
        filename = os.path.join(self.image_dir, str(time.time()) + ".jpg")
        with open(filename, "wb") as f:
            f.write(data)
        return item