爬取汽车之家北京地区汽车详细数据

本文仅供交流学习，请勿用于其他用途，否则后果自负。
运行环境：Windows + PyCharm + Anaconda

import re
import csv
import requests
from lxml import etree
from user_agent import UA
# Request headers passed as `headers=` to the requests.get calls in this script.
head = {
    # User-agent string imported from the local `user_agent` module.
    "User-Agent": UA,
    # Header conventionally sent by browser AJAX (XMLHttpRequest) calls.
    "X-Requested-With": "XMLHttpRequest",
    # NOTE(review): "*/*" is not a URL; presumably a placeholder copied from
    # an Accept value -- confirm the site does not validate Referer.
    "Referer": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}
# Left-menu endpoint queried with brandId=0; the response body is parsed as
# HTML and three parallel lists are pulled out of its <ul>/<li> entries.
base_url = (
    'https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx'
    '?typeId=1%20&brandId=0%20&fctId=0%20&seriesId=0'
)
index_resp = requests.get(base_url, headers=head, timeout=30)
index_tree = etree.HTML(index_resp.text)

# Link text, link href and the <li> id attribute of each menu entry.
title_list = index_tree.xpath(r".//ul/li/h3/a/text()")
url_list = index_tree.xpath(r".//ul/li/h3/a/@href")
id_list = index_tree.xpath(r".//ul/li/@id")
# Captures everything after the first "b" in a menu <li> id attribute; the
# captured text is used as the brandId query parameter. Compiled once here
# instead of on every loop iteration.
_BRAND_ID_RE = re.compile(r'b(.*)')


def _joined_text(tree, xpath, sep=" "):
    """Return every string matched by *xpath* under *tree*, joined by *sep*."""
    return sep.join(tree.xpath(xpath))


def _absolute_img_url(src):
    """Prefix a protocol-relative ("//host/path") URL with "https:"."""
    return 'https:' + src if src.startswith('//') else src


# The csv module requires newline='' on the file object; without it every row
# ends in "\r\r\n" on Windows and the CSV shows a blank line between records.
# The file is opened once for the whole crawl rather than once per row.
with open('car.csv', 'a+', encoding="utf-8", newline='') as f:
    f_csv = csv.writer(f)

    # Only the <li> ids are needed to build the per-brand request. Iterating
    # id_list on its own avoids silently dropping brands when the title/href
    # lists are shorter than the id list.
    for k in id_list:
        brand_match = _BRAND_ID_RE.search(k)
        if brand_match is None:
            # This <li> id carries no brand id; skip it instead of raising
            # IndexError.
            continue

        # Same menu endpoint, now scoped to one brand.
        list_url = 'https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1&brandId={}&fctId=0&seriesId=0'.format(brand_match.group(1))
        n_r = requests.get(url=list_url, headers=head, timeout=30)
        n_data = etree.HTML(n_r.text)
        title = n_data.xpath(r".//dl/dd/a/text()")
        url = n_data.xpath(r'.//dl/dd/a/@href')

        for t, u in zip(title, url):
            u = 'https://car.autohome.com.cn' + u
            car_d = requests.get(url=u, headers=head, timeout=30)
            # NOTE(review): an earlier revision re-decoded the body as GBK;
            # confirm that the encoding requests picks for .text is correct.
            car_data = etree.HTML(car_d.text)

            # Rating
            car_score = _joined_text(
                car_data,
                r'.//a[@class="font-bold"]/div[@class="score-cont"]/text()'
                r'|.//a[@class="font-bold"]/div[@class="score-cont"]//span[@class="score-number"]/text()',
            )
            # Vehicle class
            car_level = _joined_text(car_data, r'.//ul[@class="lever-ul"]/li/span[@class="info-gray"]/text()')
            # Body structure, or driving range
            car_stru = _joined_text(car_data, r'.//ul[@class="lever-ul"]/li[2]/a/text()|.//ul[@class="lever-ul"]/li[2]/span/text()')
            # Engine
            car_engine = _joined_text(car_data, r'.//ul[@class="lever-ul"]/li[3]//a/text()|.//ul[@class="lever-ul"]/li[3]/span/text()')
            # Transmission, or charging time
            car_tran_case = _joined_text(car_data, r'.//ul[@class="lever-ul"]/li[4]//a/text()|.//ul[@class="lever-ul"]/li[4]/span/text()')
            # Colours
            car_colour = _joined_text(car_data, r'.//ul[@class="lever-ul"]/li[5]//div[@class="tip-content"]/text()')
            # Guide price
            car_price = _joined_text(car_data, r'.//div/span[@class="lever-price red"]/span[@class="font-arial"]/text()')
            # Images: prefix each URL separately and space-join them like the
            # other columns, so several matches do not fuse into one invalid
            # URL and an empty match yields "" rather than a bare "https:".
            car_img = " ".join(
                _absolute_img_url(src)
                for src in car_data.xpath(r'.//div[@class="list-cont-img"]//img/@src')
            )

            row = [t, car_score, car_level, car_stru, car_engine, car_tran_case, car_colour, car_price, car_img]
            f_csv.writerow(row)
            # Push each row to disk right away, as reopening per row used to.
            f.flush()

            print(*row)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值