python爬取汽车之家_python爬虫实战 爬取汽车之家上车型价格

本文是关于使用Python爬虫从汽车之家网站抓取车型价格的实战教程。通过结合`pymysql`、`BeautifulSoup`和`selenium`等库,实现对在售和停售车型价格的抓取。文章详细介绍了如何处理数据库交互、网页元素定位以及异常处理,以确保爬取的准确性。
摘要由CSDN通过智能技术生成

python爬虫实战 爬取汽车之家上车型价格

发布时间:2018-08-28 21:50,浏览次数:448,标签:python

<>相关库

import codecs
import random
import time

import pymysql
import pymysql.cursors
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

<>从数据库中读取车型(车型已经存放在数据库中,这里读取车型的id,拼接到url上)

# Load the distinct (car_id, car_name) rows whose prices will be scraped.
# Each row is a dict because the connection uses DictCursor.
cars = []
# Host, user and password are redacted in this listing; substitute real
# values before running.
conn = pymysql.connect(
    host='*******',
    charset='utf8',
    user='*******',
    passwd='*****',
    db='mysql',
    cursorclass=pymysql.cursors.DictCursor,
)
try:
    # The cursor context manager closes the cursor even if a query fails.
    with conn.cursor() as cur:
        cur.execute("USE data_etl")
        cur.execute("select distinct(car_id),car_name from user_car_port")
        cars.extend(cur.fetchall())
    count = len(cars)
    print(count)
finally:
    conn.close()

<>由于汽车之家反爬比较复杂,我们直接调用浏览器接口

# Shared browser session; getCarPrice loads every car page through it.
# NOTE(review): passing the driver path positionally is the older Selenium
# calling convention; newer Selenium releases expect a Service object --
# confirm against the installed version.
driver = webdriver.Chrome('chromedriver.exe')


def getCarPriceOffSale(innerHtml):
    """Extract the price range from the price block of a discontinued car.

    Looks for ``<span class="price">`` containing ``<strong class="red">``
    whose text is either a single price or ``low-high``. The unit
    characters "万" and "元" are stripped before conversion to float.

    Args:
        innerHtml: HTML fragment to search (getCarPrice passes the inner
            HTML of the ``car_price`` element).

    Returns:
        A ``(button, top)`` tuple: the lower and upper price as floats.
        When only one price is present both values are equal. ``(0.0, 0.0)``
        is returned when an element is missing or the text cannot be parsed;
        a diagnostic message is printed in that case.
    """
    button = 0.0
    top = 0.0
    print("此车型已经停售!")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsObj = BeautifulSoup(innerHtml, "html.parser")

    # find() returns None when nothing matches, which makes the
    # "element missing" messages below reachable.
    spanPrice = bsObj.find("span", {"class": "price"})
    if spanPrice is None:
        print("价格span为空")
        return button, top

    strongPrice = spanPrice.find("strong", {"class": "red"})
    if strongPrice is None:
        print("价格strong为空")
        return button, top

    text = strongPrice.text
    if not text.strip():
        print("价格字段为空")
        return button, top

    try:
        prices = [p.replace("万", "").replace("元", "") for p in text.split("-")]
        low = float(prices[0])
        # Only an exact "low-high" pair supplies a separate upper bound.
        high = float(prices[1]) if len(prices) == 2 else low
    except ValueError:
        # Non-numeric price text: report it and keep the (0.0, 0.0) defaults
        # rather than returning a half-filled pair.
        print("程序出错!停售车型")
        return button, top
    return low, high

<>处理在售车型的价格 信息

def getCarPriceOnSale(innerHtml):
    """Extract the price range from the price block of a car still on sale.

    Looks for the first ``<dd>`` and, inside it, ``<a class="emphasis">``
    whose text is either a single price or ``low-high``. The unit
    characters "万" and "元" are stripped before conversion to float.

    Args:
        innerHtml: HTML fragment to search (getCarPrice passes the inner
            HTML of the ``information-price`` element).

    Returns:
        A ``(button, top)`` tuple: the lower and upper price as floats.
        When only one price is present both values are equal. ``(0.0, 0.0)``
        is returned when the price link is missing or its text cannot be
        parsed; a diagnostic message is printed in that case.
    """
    button = 0.0
    top = 0.0
    print("此车型在售")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bsObj = BeautifulSoup(innerHtml, "html.parser")

    # find() returns None instead of raising when there is no <dd>, so a
    # missing price block is reported as "price unavailable".
    ddprice = bsObj.find("dd")
    a = ddprice.find("a", {"class": "emphasis"}) if ddprice is not None else None
    if a is None:
        print("此车型暂时无法查询价格")
        return button, top

    try:
        prices = [p.replace("万", "").replace("元", "") for p in a.text.split("-")]
        low = float(prices[0])
        # Only an exact "low-high" pair supplies a separate upper bound.
        high = float(prices[1]) if len(prices) == 2 else low
    except ValueError:
        # Non-numeric price text: report it and keep the (0.0, 0.0) defaults
        # rather than returning a half-filled pair.
        print("程序出错!在售车型")
        return button, top
    return low, high

<>获取车型价格(先按在售页面解析,超时后再按停售页面解析)

def getCarPrice(carId):
    """Load a car's page in the shared browser and return its price range.

    The page is first treated as an on-sale car: wait up to 5 seconds for
    an ``information-summary`` element, then parse ``information-price``
    with getCarPriceOnSale. If that times out, the page is treated as a
    discontinued car: wait up to 5 seconds for ``car_price`` and parse it
    with getCarPriceOffSale.

    Args:
        carId: Car identifier; its string form is appended to ``url``.

    Returns:
        A ``(button, top)`` tuple of floats from the matching parser, or
        ``(0.0, 0.0)`` when neither layout is found.
    """
    button = 0.0
    top = 0.0
    try:
        try:
            # NOTE(review): `url` is a module-level base URL that is not
            # defined in this section -- it must be set before calling.
            driver.get(url + str(carId))
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "information-summary"))
            )
            # find_element(By...) replaces find_element_by_class_name, which
            # no longer exists in Selenium 4.
            ele = driver.find_element(
                By.CLASS_NAME, "information-price"
            ).get_attribute('innerHTML')
            button, top = getCarPriceOnSale(ele)
        except TimeoutException:
            # On-sale layout did not appear in time; try the off-sale layout.
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "car_price"))
            )
            ele = driver.find_element(
                By.CLASS_NAME, "car_price"
            ).get_attribute('innerHTML')
            button, top = getCarPriceOffSale(ele)
    except (TimeoutException, NoSuchElementException):
        # Neither layout could be read; report the car and return the
        # defaults so the crawl can continue with the next one.
        print("此车型有问题:" + str(carId))
    return button, top

<>遍历数据库所有车型的id

# Fetch the price range for every car loaded from the database and store
# it on the row dict under "button" (lower) and "top" (upper).
try:
    for car in cars:
        car_id = car["car_id"]
        # Random 1-5 second pause between page loads (presumably to
        # throttle requests to the site).
        time.sleep(random.randint(1, 5))
        button, top = getCarPrice(car_id)
        if button == 0.0 and top == 0.0:
            # getCarPrice returns (0.0, 0.0) when no price was obtained;
            # 9999 is stored as the placeholder for such cars.
            car["button"] = 9999
            car["top"] = 9999
        else:
            car["button"] = button
            car["top"] = top
finally:
    # Always shut the browser down, even if the crawl aborts midway.
    driver.quit()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值