Python爬虫实战记录（汽车之家实战）

万事洛必达

已于 2023-05-06 20:31:44 修改

阅读量1.5k

点赞数 1

文章标签： python 爬虫汽车

于 2023-05-06 20:27:04 首次发布

本文链接：https://blog.csdn.net/m0_59862058/article/details/130534922

版权

Python爬虫实战记录（汽车之家实战）

一、准备工作

安装所需包
```
pip install lxml
pip install selenium
```
准备Chrome驱动

下载对应的Chrome驱动：chromedriver.storage.googleapis.com/index.html

二、相关代码

start.py

import io
import json
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

import constans
from constans import isCarErrorDate
from load_car_brand import parse_brand
from load_car_series import parse_series

# Shared Selenium driver; stays None until start() assigns it.
browser = None
# First page requested by start(); the brand list is parsed from it.
url = "http://www.che168.com/changsha/list/#pvareaid=100945"

# Brand dicts shared by the functions in this file; rebound in start().
brands = []


def parse(page_html, series):
    """Extract the car listings from a listing page and store them on ``series``.

    Args:
        page_html: HTML source of the listing page.
        series: Dict describing one car series. Its ``'carList'`` key is set
            to a list of ``{"name", "price", "src"}`` dicts, one per complete
            listing found on the page.
    """
    # NOTE(review): 'pirce' and the trailing space in 'img-box ' presumably
    # mirror the site's own markup -- confirm before "correcting" them.
    name_xpath = "./a[@class='carinfo']/div[@class='cards-bottom']/h4[@class='card-name']/text()"
    price_xpath = ("./a[@class='carinfo']/div[@class='cards-bottom']/div[@class='cards-price-box']/span["
                   "@class='pirce']/em/text()")
    src_xpath = "./a[@class='carinfo']/div[@class='img-box ']/img/@src"

    selector = etree.HTML(page_html)
    lis = selector.xpath("/html/body/div[@class='content fn-clear card-wrap']/div["
                         "@id='goodStartSolrQuotePriceCore0']/ul[@class='viewlist_ul']/li")

    cars = []
    for node in lis:
        # Evaluate each XPath once and reuse the result for both the
        # completeness check and the extraction.
        name = node.xpath(name_xpath)
        price = node.xpath(price_xpath)
        src = node.xpath(src_xpath)

        if not (name and price and src):
            # Skip only this incomplete <li>. Stopping the loop here would
            # discard every valid listing that follows it.
            continue

        cars.append({"name": name[0], "price": price[0], "src": src[0]})

    series['carList'] = cars


def load_car_date():
    """Open every series page of every brand and parse its car listings.

    Reads the module-level ``brands`` and ``browser``; the parsed cars are
    attached to each series dict by ``parse``.
    """
    for brand in brands:
        series_list = brand['seriesList']
        if series_list is None:
            continue

        for series in series_list:
            # Load the listing page for this series.
            browser.get(constans.host + series['url'])

            # Work around lazy-loaded images: scroll down step by step,
            # pausing one second after each step.
            for step in range(1, 11):
                browser.execute_script(f'document.documentElement.scrollTop={step * 1000}')
                time.sleep(1)

            parse(browser.page_source, series)


# Write the collected data to a JSON file.
def write_to_file():
    """Serialize the module-level ``brands`` list to ``data.json`` as JSON.

    The text is also printed to stdout after an ``end`` marker line.
    """
    # json.dumps emits real JSON (double quotes, null, escaped characters).
    # str() on the list plus a quote swap does not, and str.replace returns
    # a new string rather than modifying in place.
    # ensure_ascii=False keeps non-ASCII names readable in the UTF-8 output.
    data_str = json.dumps(brands, ensure_ascii=False)

    print("end")
    print(data_str)

    with io.open("data.json", "w", encoding="utf-8") as f:
        f.write(data_str)


def start():
    """Launch Chrome and run the crawl: brands, series, cars, then file output.

    Rebinds the module-level ``browser`` and ``brands``.
    """
    global browser, brands

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_experimental_option("detach", True)

    driver_service = Service("chromedriver.exe")
    browser = webdriver.Chrome(service=driver_service, options=chrome_options)

    # The brand list is parsed from the entry page.
    browser.get(url)
    brands = parse_brand(browser, browser.page_source)

    # Each stage enriches the brand dicts produced by the previous one.
    parse_series(browser, brands)
    load_car_date()
    write_to_file()


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    start()

load_car_brand.py

from lxml import etree
from selenium import webdriver
from constans import isErrorDateByA

# Module-level list that parse_brand() appends to and returns.
brands = []


# Parse the brand information.
def parse_brand(browser: webdriver, page_html: str):
    """Collect the brand links found in ``page_html``.

    Each usable ``<a>`` node becomes a ``{'url', 'name'}`` dict appended to
    the module-level ``brands`` list, and its name is printed.

    Args:
        browser: Not used by this function.
        page_html: HTML source of the page holding the brand filter.

    Returns:
        The module-level ``brands`` list.
    """
    global brands

    brand_links_xpath = (
        "/html/body/div[@class='list-screening-wrap content']"
        "/div[@id='listfilterstart']"
        "/div[@class='condition-list condition-brand fn-clear js-screening-up'][1]"
        "/div[@class='screening-margin fn-clear']"
        "/div[@class='screening-base fn-clear']/a"
    )
    anchors = etree.HTML(page_html).xpath(brand_links_xpath)

    print("品牌:")
    for anchor in anchors:
        if not isErrorDateByA(anchor):
            brand_name = anchor.xpath('./text()')[0]
            brands.append({'url': anchor.xpath("./@href")[0], 'name': brand_name})
            print(brand_name)

    print("---品牌数据获取完毕---")

    return brands

load_car_series.py

import constans
from lxml import etree
from constans import isErrorDateByA


def parse_series(browser, brands):
    """Load each brand's page and attach its series list to the brand dict.

    Args:
        browser: Driver used to load the pages; must offer ``get`` and
            ``page_source``.
        brands: Iterable of brand dicts with a ``'url'`` key; each one gets a
            ``'seriesList'`` key holding the result of ``parse``.
    """
    print("车系:")
    for brand in brands:
        brand_page = constans.host + brand['url']
        browser.get(brand_page)
        brand['seriesList'] = parse(browser.page_source)

    print("---车系数据获取完毕---")


def parse(page_html):
    """Return the series links found in ``page_html``.

    Args:
        page_html: HTML source of a brand page.

    Returns:
        A list of ``{"name", "url"}`` dicts, one per usable ``<a>`` node.
        Each name is also printed as it is collected.
    """
    series_links_xpath = (
        "/html/body/div[@class='list-screening-wrap content']"
        "/div[@id='listfilterstart']"
        "/div[@class='condition-list condition-series fn-clear js-screening-up']"
        "/div[@class='screening-margin fn-clear']"
        "/div[@class='screening-base fn-clear']/a"
    )
    anchors = etree.HTML(page_html).xpath(series_links_xpath)

    found = []
    for anchor in anchors:
        # Only anchors with both an href and a text node describe a series.
        if not isErrorDateByA(anchor):
            series_name = anchor.xpath("./text()")[0]
            found.append({
                "name": series_name,
                "url": anchor.xpath("./@href")[0],
            })
            print(series_name)

    return found

constans.py

# Base URL that the crawler prepends to the hrefs scraped from the pages.
host = "https://www.che168.com"


# Validate a brand/series link node.
def isErrorDateByA(bror):
    """Tell whether a link node lacks an href or a text value.

    Args:
        bror: Node exposing ``xpath(expr)``.

    Returns:
        True when either ``./@href`` or ``./text()`` yields ``None`` or an
        empty result, otherwise False.
    """
    results = [bror.xpath(expr) for expr in ("./@href", './text()')]
    return any(found is None or len(found) == 0 for found in results)


def isCarErrorDate(node):
    """Tell whether a car listing node lacks its name, price or image source.

    Args:
        node: Node exposing ``xpath(expr)``.

    Returns:
        True when any of the three lookups yields ``None`` or an empty
        result, otherwise False.
    """
    card = "./a[@class='carinfo']"
    required_fields = (
        card + "/div[@class='cards-bottom']/h4[@class='card-name']/text()",
        card + "/div[@class='cards-bottom']/div[@class='cards-price-box']/span[@class='pirce']/em/text()",
        card + "/div[@class='img-box ']/img/@src",
    )

    results = [node.xpath(expr) for expr in required_fields]
    return any(found is None or len(found) == 0 for found in results)