Python爬虫实战记录(汽车之家实战)
一、准备工作
-
安装所需包
pip install lxml selenium
-
准备Chrome驱动
下载与本机Chrome浏览器版本对应的Chrome驱动(chromedriver):https://chromedriver.storage.googleapis.com/index.html
二、相关代码
-
start.py
"""Entry point of the che168 crawler.

Opens the listing page in Chrome, collects brands, then the series of every
brand, then the car listings of every series, and finally dumps the whole
structure to ``data.json``.
"""
import io
import json
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree

import constans
from constans import isCarErrorDate
from load_car_brand import parse_brand
from load_car_series import parse_series

# Shared Chrome driver instance; created in start().
browser = None
# Listing page used as the starting point for brand discovery.
url = "http://www.che168.com/changsha/list/#pvareaid=100945"
# Brand dicts; each gains a 'seriesList', and each series gains a 'carList'.
brands = []


def parse(page_html, series):
    """Extract the car listings from a series page.

    Args:
        page_html: HTML source of the series listing page.
        series: Series dict; the parsed cars are stored under
            ``series['carList']`` as dicts with ``name``, ``price`` and
            ``src`` keys.
    """
    cars = []
    selector = etree.HTML(page_html)
    lis = selector.xpath("/html/body/div[@class='content fn-clear card-wrap']/div["
                         "@id='goodStartSolrQuotePriceCore0']/ul[@class='viewlist_ul']/li")
    for node in lis:
        if isCarErrorDate(node):
            # NOTE(review): stops at the first incomplete item, whereas the
            # brand/series parsers skip invalid items with `continue`.
            # Confirm whether `break` is intended here.
            break
        cars.append({
            "name": node.xpath("./a[@class='carinfo']/div[@class='cards-bottom']/h4[@class='card-name']/text()")[0],
            "price": node.xpath("./a[@class='carinfo']/div[@class='cards-bottom']/div[@class='cards-price-box']/span["
                                "@class='pirce']/em/text()")[0],
            "src": node.xpath("./a[@class='carinfo']/div[@class='img-box ']/img/@src")[0]
        })
    series['carList'] = cars


def load_car_date():
    """Visit every series page of every brand and attach its car list."""
    global browser
    for brand in brands:
        # A brand without a series list (missing key, None or empty) is skipped
        # instead of raising KeyError.
        for series in brand.get('seriesList') or []:
            # Fetch the car data
            browser.get(constans.host + series['url'])
            # Work around lazily loaded images by scrolling down the page
            for i in range(10):
                browser.execute_script(f'document.documentElement.scrollTop={(i + 1) * 1000}')
                time.sleep(1)
            page_html = browser.page_source
            parse(page_html, series)


# Write the data as JSON
def write_to_file():
    """Serialize ``brands`` to ``data.json`` and echo the payload.

    The data is encoded with ``json.dumps`` so the file is valid JSON;
    ``ensure_ascii=False`` keeps non-ASCII text readable in the UTF-8 file.
    """
    data_str = json.dumps(brands, ensure_ascii=False)
    print("end")
    print(data_str)
    with io.open("data.json", "w", encoding="utf-8") as f:
        f.write(data_str)


def start():
    """Launch Chrome and run the brand -> series -> car pipeline."""
    global browser, brands
    options = webdriver.ChromeOptions()
    # "detach" is set so the browser window is not tied to the driver session.
    options.add_experimental_option("detach", True)
    options.add_argument("--start-maximized")
    path = Service("chromedriver.exe")
    browser = webdriver.Chrome(service=path, options=options)
    browser.get(url)
    page_html = browser.page_source
    brands = parse_brand(browser, page_html)
    parse_series(browser, brands)
    load_car_date()
    write_to_file()


if __name__ == '__main__':
    start()
-
load_car_brand.py
"""Brand-list parsing for the che168 crawler."""
from lxml import etree
from selenium import webdriver

from constans import isErrorDateByA

# Brands produced by the most recent parse_brand() call.
brands = []


# Parse brand information
def parse_brand(browser: webdriver, page_html: str):
    """Extract the brand links from the listing page HTML.

    Every call builds a fresh list, so repeated calls neither accumulate
    duplicate entries nor mutate a list returned earlier. The module-level
    ``brands`` is rebound to the latest result.

    Args:
        browser: Not used by this function; kept for interface compatibility.
        page_html: HTML source of the listing page.

    Returns:
        A list of dicts with ``url`` and ``name`` keys, one per brand link
        that has both an href and a text node.
    """
    global brands
    selector = etree.HTML(page_html)
    brand_nodes = selector.xpath("/html/body/div[@class='list-screening-wrap content']/div[@id='listfilterstart']/div["
                                 "@class='condition-list condition-brand fn-clear js-screening-up'][1]/div["
                                 "@class='screening-margin fn-clear']/div[@class='screening-base fn-clear']/a")
    print("品牌:")
    parsed = []
    for node in brand_nodes:
        # Skip anchors that lack an href or a text node.
        if isErrorDateByA(node):
            continue
        name = node.xpath('./text()')[0]
        parsed.append({'url': node.xpath("./@href")[0], 'name': name})
        print(name)
    brands = parsed
    print("---品牌数据获取完毕---")
    return brands
-
load_car_series.py
"""Series-list scraping for the che168 crawler."""
import constans
from lxml import etree
from constans import isErrorDateByA

# Anchors of the series filter on a brand listing page.
_SERIES_LINKS_XPATH = (
    "/html/body/div[@class='list-screening-wrap content']/div[@id='listfilterstart']/div["
    "@class='condition-list condition-series fn-clear js-screening-up']/div["
    "@class='screening-margin fn-clear']/div[@class='screening-base fn-clear']/a"
)


def parse_series(browser, brands):
    """Load each brand page and attach its series under ``brand['seriesList']``.

    Args:
        browser: Driver object providing ``get`` and ``page_source``.
        brands: Iterable of brand dicts, each with a ``url`` key.
    """
    print("车系:")
    for brand in brands:
        browser.get(constans.host + brand['url'])
        brand['seriesList'] = parse(browser.page_source)
    print("---车系数据获取完毕---")


def parse(page_html):
    """Return the series found in ``page_html`` as ``name``/``url`` dicts.

    Anchors rejected by ``isErrorDateByA`` are skipped. The name of each
    accepted series is printed as it is collected.
    """
    links = etree.HTML(page_html).xpath(_SERIES_LINKS_XPATH)
    found = []
    for link in links:
        # Series data
        if isErrorDateByA(link):
            continue
        entry = {
            "name": link.xpath("./text()")[0],
            "url": link.xpath("./@href")[0],
        }
        found.append(entry)
        print(link.xpath("./text()")[0])
    return found
-
constans.py
"""Shared constants and validation helpers for the che168 crawler."""

# Base URL prepended to the relative links scraped from the site.
host = "https://www.che168.com"


def _is_missing(result):
    """Return True when an xpath result is None or has length zero."""
    return result is None or len(result) == 0


# Brand data validation
def isErrorDateByA(bror):
    """Return True when the anchor node lacks an href or a text node.

    Args:
        bror: Object exposing ``xpath(expr)``; queried for ``./@href`` and
            ``./text()``.
    """
    href = bror.xpath("./@href")
    label = bror.xpath('./text()')
    return _is_missing(href) or _is_missing(label)


def isCarErrorDate(node):
    """Return True when a car list item lacks its name, price or image src.

    Args:
        node: Object exposing ``xpath(expr)``, queried relative to one list
            item.
    """
    required = (
        node.xpath("./a[@class='carinfo']/div[@class='cards-bottom']/h4[@class='card-name']/text()"),
        node.xpath("./a[@class='carinfo']/div[@class='cards-bottom']/div[@class='cards-price-box']/span["
                   "@class='pirce']/em/text()"),
        node.xpath("./a[@class='carinfo']/div[@class='img-box ']/img/@src"),
    )
    return any(_is_missing(value) for value in required)
-
完整代码地址:https://github.com/szhig/py_car_date.git