scrapy爬取易车网100+款汽车详细信息

spider.py编码

import scrapy
import re

class QicheSpider(scrapy.Spider):
    """Crawl bitauto.com price-range listing pages and follow each car's
    detail page to collect its name, images, trim prices and hot user
    comments."""
    name = 'qiche'
    allowed_domains = ['car.bitauto.com']
    start_urls = ['http://car.bitauto.com/']

    @staticmethod
    def _first_stripped(selector, xpath):
        """Return the first xpath match stripped of whitespace, or None when
        the node is absent.

        Bug fix: the original called ``.extract_first().strip()`` directly,
        which raises AttributeError whenever a page lacks the node.
        """
        value = selector.xpath(xpath).extract_first()
        return value.strip() if value is not None else None

    def parse(self, response):
        # One <div> per price range on the index page; each contains the
        # links to that range's car detail pages.
        for div in response.xpath("//div[@class='rank-list-price-item']"):
            price = div.xpath(".//div[@class='price-item-header']/text()").extract_first()
            for href in div.xpath(".//div[@class='cx-item']/a/@href").extract():
                # Build a fresh meta dict per request: Scrapy mutates
                # request.meta in place (depth, retry counters, ...), so the
                # original code, which shared one dict across all requests of
                # a price range, leaked state between them.
                yield scrapy.Request(
                    url="http://car.bitauto.com" + href,
                    callback=self.parse_car,
                    meta={"price": price},
                )

    def parse_car(self, response):
        """Parse one car detail page into a plain dict item."""
        item = {}
        item["price_range"] = response.meta["price"]
        item["car_name"] = response.xpath("//h1/span/em/text()").extract_first()
        item["car_logo_img_src"] = response.xpath("//h1/img/@src").extract_first()
        item["guide_price"] = self._first_stripped(
            response, "//span[@class='guide-price']/text()")
        # Guard against a missing image: "https:" + None would raise TypeError.
        big_img = response.xpath("//div[@class='big-img']//img/@src").extract_first()
        item["car_big_img_src"] = "https:" + big_img if big_img else None
        # One table row per trim/style of this car.
        item["all_style"] = []
        for tr in response.xpath("//tr[@class='list-info']"):
            item["all_style"].append({
                "name": self._first_stripped(tr, ".//td[@class='first']/a/text()"),
                "price": self._first_stripped(tr, ".//td[@class='five']/text()"),
            })
        # Hot user comments: author, bare-car price, purchase date, text, photos.
        item["hot_comment"] = []
        for hot_comment in response.xpath("//div[@class='cm-content-moudle']"):
            comment = {}
            comment["user_name"] = self._first_stripped(
                hot_comment, ".//p[@class='cm-user-name']/text()")
            comment["nacked_car_pre"] = self._first_stripped(
                hot_comment, ".//span[@class='cm-car-price-value']/text()")
            comment["purchase_time"] = self._first_stripped(
                hot_comment, ".//span[@class='cm-car-buy-time-value']/text()")
            # Comment text is split across many nodes; join then drop all
            # whitespace (the page is full of layout spaces/newlines).
            comment["text"] = re.sub(
                r"\s", "",
                "".join(hot_comment.xpath(".//div[@class='cm-content']//text()").extract()))
            # Photos are lazy-loaded: the real URL lives in @data-original.
            comment["img_src"] = [
                "http:" + src
                for src in hot_comment.xpath(".//div[@class='img-item']/img/@data-original").extract()
            ]
            item["hot_comment"].append(comment)
        yield item

pipelines.py编码

import pymongo

class P11YichePipeline(object):
    """Item pipeline that persists scraped car items into MongoDB
    (database ``cars_info``, collection ``data``)."""

    def __init__(self):
        # NOTE(review): assumes a MongoDB instance on the default
        # localhost:27017 — confirm for deployment.
        self.mongo_client = pymongo.MongoClient()
        self.save_path = self.mongo_client["cars_info"]["data"]

    def process_item(self, item, spider):
        """Insert the item into MongoDB and return it unchanged so any
        later pipelines still receive it."""
        self.save_path.insert_one(item)
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes.

        Bug fix: Scrapy calls ``close_spider(spider)`` with the spider as
        an argument; the original signature took no ``spider`` parameter,
        so shutdown raised TypeError and the connection was never closed.
        """
        self.mongo_client.close()

settings.py设置

# Scrapy settings.
# Bug fix: the original lines ended with stray commas, turning LOG_LEVEL
# into the tuple ("ERROR",), DOWNLOAD_DELAY into (3,) and
# DEFAULT_REQUEST_HEADERS into a one-element tuple wrapping the dict —
# Scrapy would reject or ignore all three settings.

# Only log errors to keep the crawl output readable.
LOG_LEVEL = "ERROR"
# Delay (seconds) between requests so fast crawling does not get us banned.
DOWNLOAD_DELAY = 3
# Default request headers, including a desktop-browser User-Agent.
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
# Enable the MongoDB item pipeline.
ITEM_PIPELINES = {
   'p11_yiche.pipelines.P11YichePipeline': 300,
}

爬取结果如下:
在这里插入图片描述

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值