# Scrapy crawler for Yiche (bitauto.com): brands -> car series -> car models
# (on-sale and discontinued) and per-model vehicle parameters.
# Original source: https://download.csdn.net/download/wpaycn/11548531
# -*- coding: utf-8 -*-
import scrapy
import re
import json
import logging
from copy import deepcopy
from yicar.items import YicarItem
logger = logging.getLogger(__name__)
# json替换key
def replacea(matched):
return '\"' + matched.group('value') + '\":'
class YcSpider(scrapy.Spider):
name = 'yc'
allowed_domains = ['bitauto.com']
start_urls = [
'http://api.car.bitauto.com/CarInfo/getlefttreejson.ashx?tagtype=chexing&pagetype=masterbrand&objid=0']
def parse(self, response):
result = re.sub('(?P<value>\w+):', replacea,
response.text[response.text.find('{'):response.text.rfind('}') + 1].replace('https:', ''))
data = json.loads(result)
for char in data['char']:
try:
for brand in data['brand']['%s' % char]:
item = {}
item["params"] = {}
item['id'] = brand['id']
item['name'] = brand['name']
item['initial'] = char
url = 'http://car.bitauto.com/tree_chexing/mb_{}/'.format(item['id'])
item['url'] = url
yield scrapy.Request(
url,
callback=self.parse_serial,
meta={"item": deepcopy(item)}
)
return
except KeyError:
pass
def parse_serial(self, response):
item = response.meta.get("item")
brands_div = response.xpath("//div[@id='divCsLevel_0']")
car_cates = brands_div.xpath('./h5')
# 循环遍历得到子品牌
brand_item_id = 0
for i in car_cates:
brand_item_id = brand_item_id + 1
brand_cate = brands_div.xpath('./h5[' + str(brand_item_id) + ']/a/text()').extract_first()
# logger.warning(brand_cate)
item["brand_item"] = brand_cate
brand_item = brands_div.xpath("./div[" + str(brand_item_id) + "]")
brand_item_col = brand_item.xpath("./div[@class='col-xs-3']")
for j in brand_item_col:
brand_name = j.xpath("./div/ul/li[contains(@class,'name')]/a/text()").extract_first()
brand_price = j.xpath("./div/ul/li[@class='price']/a/text()").extract_first()
brand_url = j.xpath("./div/ul/li[contains(@class,'name')]/a/@href").extract_first()
brand_url = "http://car.bitauto.com" + str(brand_url)
# logger.warning(str(brand_name) + "---" + str(brand_price) + "---" + brand_url)
item["model"] = str(brand_name)
item["price"] = str(brand_price)
if str(brand_price) == "未上市":
pass
else:
logger.warning("##### " + item["model"] + " #######")
yield scrapy.Request(
brand_url,
callback=self.parse_vehicle,
meta={"item": deepcopy(item)}
)
return
# 获取在售车款
def parse_vehicle(self, response):
item = response.meta.get("item")
# 在售车款
tr_list = response.xpath("//tr[contains(@id,'car_filter_id')]/td[1]")
for tr in tr_list:
vehicle_name = tr.xpath("./a/text()").extract_first()
vehicle_url = tr.xpath("./a/@href").extract_first()
vehicle_url = "http://car.bitauto.com" + str(vehicle_url)
logger.warning(vehicle_name + " --- " + vehicle_url)
# logger.warning(item["model"] + "--" + str(vehicle_name) + "---" + str(vehicle_url))
item["vehicle"] = vehicle_name
item["issell"] = "在售"
yield scrapy.Request(
vehicle_url,
callback=self.parse_vehicle_params,
meta={"item": deepcopy(item)}
)
# 停售车款
old_vehicle_url_id = response.xpath("//input[@id='csHid']/@value").extract_first()
# 得到停售车辆年份
drop_a = response.xpath("//div[@class='drop-layer']/a/text()").extract()
for i in drop_a:
brand_url = "http://car.bitauto.com/AjaxNew/GetNoSaleSerailListByYear.ashx?csID=" + str(
str(old_vehicle_url_id)) + "&year=" + str(i)[0:4]
# logger.warning(brand_url)
yield scrapy.Request(
brand_url,
callback=self.parse_old_vehicle,
meta={"item": deepcopy(item)}
)
# 解析非在售车款
def parse_old_vehicle(self, response):
item_model = response.meta.get("item")
data = json.loads(response.body.decode())
for item in data:
for itemj in item["carList"]:
vehicle_name = itemj["Name"]
vehicle_id = itemj["CarID"]
vehicle_year_type = itemj["YearType"]
vehicle_spell = itemj["Spell"]
brand_url = "http://car.bitauto.com/" + str(vehicle_spell) + "/m" + str(vehicle_id)
item_model["vehicle"] = str(vehicle_year_type) + " " + str(vehicle_name)
item_model["issell"] = "停产"
logger.warning(item_model["vehicle"] + " *** " + brand_url)
yield scrapy.Request(
brand_url,
callback=self.parse_vehicle_params,
meta={"item": deepcopy(item_model)}
)
# 解析车辆参数
def parse_vehicle_params(self, response):
item_model = response.meta.get("item")
# 存放参数
params = []
item_model["params"] = params
cate = response.xpath("//div[@class='caption-1']")
cate_layout = response.xpath("//div[@class='special-layout-18 layout-1']")
cate_item_index = 0
for i in cate_layout:
param = {}
item_model["params"].append(param)
cate_ll = cate[cate_item_index]
big_cate = cate_ll.xpath("./h6/text()").extract_first()
# logger.warning(str(big_cate))
param["param_cate"] = str(big_cate)
# 获取所有的参数
pars_tr = i.xpath("./table/tbody/tr")
# 获取到所有的参数
index = 0
param["param_cate_value"] = []
# 每一行
for j in pars_tr:
pars_td = j.xpath("./td")
for k in pars_td:
if index % 2 == 0:
param_item = {}
p = k.xpath("./span/text()").extract_first()
param_item["key"] = p
else:
# 颜色
if param_item["key"] == "外观颜色:":
a_list = k.xpath("./div/ul/li/a/@title").extract()
param_item["value"] = a_list
# 可选配置 判断
elif len(k.xpath("./div")) != 0:
p_list = []
div_list = k.xpath("./div/div/div[@class='l']")
for xi in div_list:
p_obj = {}
cricle = xi.xpath("./i/text()").extract_first()
text = xi.xpath("./text()").extract_first()
p_obj["isSelect"] = cricle
p_obj["value"] = text
p_list.append(p_obj)
param_item["value"] = p_list
# 字
else:
p = k.xpath("./span/text()").extract_first()
param_item["value"] = p
param["param_cate_value"].append(param_item)
index = index + 1
cate_item_index = cate_item_index + 1
logger.warning(json.dumps(item_model, ensure_ascii=False))
# yield item_model