# 爬取易车RAV4的数据 (crawl Yiche RAV4 bare-car price data)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
class YichespiderPipeline(object):
    """Item pipeline that writes scraped car records to ``carSpider.json``
    as a valid UTF-8 JSON array, one record per line."""

    def __init__(self):
        # Binary read/write mode so close_spider can seek backwards and
        # overwrite the trailing ",\n" separator with the closing bracket.
        self.json_file = open('carSpider.json', 'wb+')
        self.json_file.write('[\n'.encode('UTF-8'))
        # True once at least one item has been written; lets an empty crawl
        # still produce valid JSON ("[]") instead of a dangling separator.
        self.has_items = False

    def process_item(self, item, spider):
        """Serialize one scraped item as a JSON object followed by ",\\n".

        Returns the item unchanged — Scrapy requires this so pipelines
        later in ITEM_PIPELINES keep receiving it (the original version
        returned None, silently dropping items for downstream pipelines).
        """
        text = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.json_file.write(text.encode("UTF-8"))
        self.has_items = True
        return item

    def close_spider(self, spider):
        """Close the JSON array and the file when the spider finishes."""
        print('-------------------------关闭爬虫-------------------------------------')
        if self.has_items:
            # Step back over the trailing ",\n" and close the array.
            self.json_file.seek(-2, 1)
            self.json_file.write('\n]\n'.encode("UTF-8"))
        else:
            # No items were scraped: rewrite the file as an empty array.
            self.json_file.seek(0)
            self.json_file.write('[]\n'.encode("UTF-8"))
        # Drop any leftover bytes past the final write position.
        self.json_file.truncate()
        self.json_file.close()
# -*- coding: utf-8 -*-
import scrapy
from YicheSpider.items import YichespiderItem
class CarSpiderSpider(scrapy.Spider):
    """Crawl bare-car ("luochejia") price listings for the FAW Toyota RAV4
    from luochejia.yiche.com, yielding one YichespiderItem per listing and
    following the "next page" pagination link."""

    name = 'car_spider'
    allowed_domains = ['luochejia.yiche.com']
    # 从哪个页面开始爬 — start from page 1; later pages are discovered via
    # the pagination link in parse() rather than pre-generated.
    # urls = ['http://luochejia.yiche.com/yiqifengtianrav4/price/?page=%s' % i for i in range(1, 62)]
    # start_urls = urls
    start_urls = ['http://luochejia.yiche.com/yiqifengtianrav4/price/?page=1']

    # Relative pagination hrefs are joined onto this host.
    BASE_URL = 'http://luochejia.yiche.com'

    def parse(self, response):
        # Each "price-list-box" div holds one purchase record.
        for car_primary in response.xpath('//div[@class="price-list-box"]'):
            # Hoist the shared "con-box" selector instead of repeating it
            # in every field expression.
            con = car_primary.xpath('./div[@class="con-box"]')
            item = YichespiderItem()
            # 款式 — trim/style name
            item['design'] = con.xpath('./div[@class="tit"]/text()').extract_first()
            # NOTE(review): buy_date and buy_addr use the identical XPath, so
            # both fields receive the same first <p class="other"> text node.
            # One of them almost certainly should select a different node —
            # confirm against the actual page markup.
            # 购买时间
            item['buy_date'] = con.xpath('./p[@class="other"]/text()').extract_first()
            # 地址
            item['buy_addr'] = con.xpath('./p[@class="other"]/text()').extract_first()
            # 裸车价 — actual bare-car price paid
            item['real_price'] = con.xpath(
                './div[@class="price"]/p[@class="luochejia"]/em/text()').extract_first()
            # 指导价 — manufacturer's guide price
            item['original_price'] = con.xpath(
                './div[@class="price"]/p[@class="zhidaojia"]/text()').extract_first()
            yield item

        # Pagination: page 1 exposes a single "next" anchor; later pages
        # expose prev/next, where the "next" link is the second anchor.
        # (Removed the original's dead `countNum` variable — it was set to 0
        # and immediately tested `== 0`, so the condition was always true;
        # the two duplicated branches are merged into one.)
        next_page = response.xpath(
            '//div[@class="pagination mbt20"]/div/a[@class="next-on"]/@href').extract()
        if next_page:
            new_link = next_page[1] if len(next_page) > 1 else next_page[0]
            print('下一页地址:' + self.BASE_URL + new_link)
            yield scrapy.Request(self.BASE_URL + new_link, callback=self.parse)