First, open the link https://www.autohome.com.cn/202/#levelsource=000000000_0&pvareaid=101594 and click the "图片实拍" (photo gallery) tab.
Then create a new Scrapy project; the rest of the work happens inside it.
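A sketch of the setup commands, assuming the project is named bmw and the spider bmw5 (the names the code below relies on):

scrapy startproject bmw
cd bmw
scrapy genspider bmw5 car.autohome.com.cn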
bmw5.py:
# -*- coding: utf-8 -*-
import scrapy
from bmw.items import BmwItem


class Bmw5Spider(scrapy.Spider):
    name = "bmw5"
    allowed_domains = ["car.autohome.com.cn"]
    start_urls = ['https://car.autohome.com.cn/pic/series/202.html#pvareaid=3454438']

    def parse(self, response):
        # skip the first uibox, which is not an image category
        uiboxs = response.xpath('//div[@class="uibox"]')[1:]
        for uibox in uiboxs:
            category = uibox.xpath('.//div[@class="uibox-title"]/a/text()').get()
            urls = uibox.xpath('.//ul/li/a/img/@src').getall()
            # for url in urls:
            #     url = response.urljoin(url)  # fills in the "https:" missing from each src
            #     print(url)
            # map() applies response.urljoin to every url in the list; map()
            # returns a map object, so wrap it in list() to get a list back
            urls = list(map(lambda url: response.urljoin(url), urls))
            item = BmwItem(category=category, urls=urls)
            yield item
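The src attributes on the page are protocol-relative (they start with //, with no https:), which is why response.urljoin is needed. A quick illustration with the standard library, using a hypothetical src value:

from urllib.parse import urljoin

page = 'https://car.autohome.com.cn/pic/series/202.html'
src = '//car2.autoimg.cn/cardfs/product/g26/t_sample.jpg'  # hypothetical src value
print(urljoin(page, src))
# -> https://car2.autoimg.cn/cardfs/product/g26/t_sample.jpg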
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from urllib import request


class BmwPipeline(object):
    def __init__(self):
        # go up one level from the directory containing this file and
        # point at an "images" directory there, creating it if needed
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    def process_item(self, item, spider):
        category = item['category']
        urls = item['urls']
        # one sub-directory per image category
        category_path = os.path.join(self.path, category)
        if not os.path.exists(category_path):
            os.mkdir(category_path)
        for url in urls:
            # use the last "_"-separated chunk of the url as the file name
            image_name = url.split('_')[-1]
            request.urlretrieve(url, os.path.join(category_path, image_name))
        return item
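As the template comment says, the pipeline must be registered in settings.py before it receives any items; 300 is the conventional priority value from the Scrapy docs:

ITEM_PIPELINES = {
    'bmw.pipelines.BmwPipeline': 300,
}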
items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class BmwItem(scrapy.Item):
    category = scrapy.Field()
    urls = scrapy.Field()
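With all three files in place, the spider is run from the project root with the standard Scrapy command:

scrapy crawl bmw5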
However, this code only downloads the thumbnails, not the high-resolution images, and it also misses many pictures, since the series overview page shows only a handful per category. Our goal is the high-resolution images, so the first step is to analyze how a thumbnail URL differs from the corresponding high-resolution URL.
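Comparing a thumbnail src with its full-size counterpart reveals the pattern the code below exploits: the thumbnail URL carries an extra t_ marker in the file name that the high-resolution URL lacks. A hypothetical pair for illustration (the real paths differ):

thumb = '//car2.autoimg.cn/cardfs/product/g26/t_autohomecar__sample.jpg'  # hypothetical
full  = '//car2.autoimg.cn/cardfs/product/g26/autohomecar__sample.jpg'    # same path, "t_" removed

To follow the per-category picture pages as well, the spider is rewritten as a CrawlSpider, which crawls every link matching a Rule automatically. The new bmw5.py: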
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bmw.items import BmwItem


class Bmw5Spider(CrawlSpider):
    name = "bmw5"
    allowed_domains = ["car.autohome.com.cn"]
    start_urls = ['https://car.autohome.com.cn/pic/series/202.html#pvareaid=3454438']
    rules = (
        # follow every picture page of series 202 and hand it to parse_page
        Rule(LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/202.+'),
             callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        category = response.xpath('//div[@class="uibox"]/div/text()').get()
        # the div carries several classes, so match it with contains()
        src = response.xpath('//div[contains(@class,"uibox-con")]/ul/li//img/@src').getall()
        # strip the "t_" thumbnail marker from every url to get the high-res image
        srcs = list(map(lambda x: x.replace("t_", ""), src))
        yield BmwItem(category=category,
                      urls=list(map(lambda x: response.urljoin(x), srcs)))
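One caveat with str.replace: it removes every occurrence of "t_" in the string, not just the thumbnail marker in the file name. A slightly more defensive helper (a sketch, not part of the original code) strips only the last occurrence:

def to_fullsize(url):
    # the "t_" marker sits in the file name, i.e. the last path segment,
    # so remove only the final occurrence and leave earlier ones alone
    head, sep, tail = url.rpartition('t_')
    return head + tail if sep else url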