# -*- coding: utf-8 -*-
import scrapy
from car_spi.items import CarSpiItem
class CarSpider(scrapy.Spider):
    """Spider that collects image URLs from one car-series picture page.

    For every ``div.uibox`` section on the page it yields a ``CarSpiItem``
    carrying the section title and the absolute URLs of the images found
    inside that section.
    """

    name = 'car'
    # Host name only. The previous value had literal double quotes inside the
    # string, which can never match a request host, so any follow-up request
    # would have been dropped as offsite.
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/5146.html#pvareaid=2042214']

    def parse(self, response):
        """Yield one ``CarSpiItem`` per picture section in *response*.

        ``title`` is ``None`` when a section has no title link, because
        ``.get()`` returns ``None`` if the XPath matches nothing.
        """
        for ui_box in response.xpath('//div[@class="uibox"]'):
            title = ui_box.xpath('.//div[@class="uibox-title"]/a/text()').get()
            sources = ui_box.xpath('.//ul/li/a/img/@src').getall()
            # Resolve every src against the response URL so the item always
            # holds absolute URLs, stored as a real list.
            urls = [response.urljoin(src) for src in sources]
            yield CarSpiItem(title=title, urls=urls)
pipelines:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
from urllib import request
class CarSpiPipeline(object):
    """Item pipeline that downloads each image URL of an item to disk.

    Files are written to ``<base>/images/<title>/``, where ``<base>`` is the
    parent of the directory that contains this module.
    """

    def __init__(self):
        """Compute the image root directory and create it if it is missing."""
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.path, exist_ok=True)

    def process_item(self, item, spider):
        """Download every URL in ``item['urls']`` into a per-title folder.

        Args:
            item: Mapping with a ``title`` (str or None) and ``urls``
                (iterable of absolute URL strings).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises on a failed
            download; errors are not swallowed here.
        """
        title = item['title']
        if title is None:
            # A missing title would make os.path.join raise TypeError.
            title = 'untitled'
        title_path = os.path.join(self.path, title)
        os.makedirs(title_path, exist_ok=True)

        for url in item['urls']:
            # Isolate the last path segment first so that the name never
            # contains '/', then keep the part after the final underscore,
            # as before.
            image_name = url.rsplit('/', 1)[-1].split('_')[-1]
            request.urlretrieve(url, os.path.join(title_path, image_name))
        return item
items:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class CarSpiItem(scrapy.Item):
    """Scraped record with two fields: ``title`` and ``urls``."""

    # Declared fields; any other key is rejected by scrapy.Item.
    title = scrapy.Field()
    urls = scrapy.Field()
settings:
# -*- coding: utf-8 -*-
# Scrapy settings for car_spi project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
imp