实战目标:爬取商品信息,包括商品的价格、主图、详情图。
遇到的问题及解决方案:
- 1.首先去获取已经保存的cookie值-以免需要重复登录
- 2.抓取价格老是为空–首先反复刷新获取价格
- 3.每次定位元素加了重试机制
- 4.最后实在抓取不成功的,交给BaseExceptionHandler处理
- 5.BaseExceptionHandler针对不同类型的异常定义了不同的处理方案
- 6.获取详情图用到了execute_script(主图直接读取img的src属性)
import os
import pickle
import random
import time
from utils.yamltools import ReadYamlRender
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver import Keys, ActionChains
from testapi.ExceptSelenium import BaseExceptionHandler
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
class SeleniumDriver(BaseExceptionHandler):
    """Scrape a JD product page: price, title, gallery images, detail images, model.

    The browser is reached through ``self.driver``, which is provided by the
    ``BaseExceptionHandler`` base class.  Element lookups go through
    ``locate_element`` so that every lookup gets the same refresh-and-retry
    treatment.
    """

    # File in which ``login`` stores the session cookies and from which
    # ``browser_type`` restores one of them.
    COOKIE_FILE = '../data/cookie.pkl'
    # Name of the stored cookie that is re-injected into the browser.
    COOKIE_NAME = 'aaa'
    # Product page opened when the caller does not pass ``item_url``.
    DEFAULT_ITEM_URL = "https://item.jd.com/100000541163.html"
    # Locator of the price text on the product page.
    PRICE_XPATH = "//*[@class='p-price']/span[2]"
    # Upper bound on "price text is empty" refreshes so the scraper cannot
    # spin forever on a page that never renders a price.
    MAX_EMPTY_PRICE_REFRESHES = 10
    # Number of re-login attempts made while the price element is missing.
    MAX_LOGIN_ATTEMPTS = 3
    # Reads the computed ``background-image`` of an element and unwraps the
    # CSS ``url(...)`` notation, including the optional quotes around the URL.
    # The regex part is a raw string so ``\(`` reaches JavaScript unchanged.
    BACKGROUND_IMAGE_JS = (
        'var style = window.getComputedStyle(arguments[0], null);'
        'var background_image = style.getPropertyValue("background-image");'
        r'''return background_image.replace(/^url\(["']?(.*?)["']?\)$/, "$1");'''
    )

    def __init__(self):
        """Create the YAML reader, then run the base-class initialisation."""
        self.ReadYamlRender = ReadYamlRender()
        super().__init__()

    @BaseExceptionHandler.handle_exceptions
    def browser_type(self, url, item_url=None):
        """Open ``url``, restore the saved cookie, then scrape the product page.

        Args:
            url: Page opened first (the login page in the script entry point).
            item_url: Product page to scrape; defaults to ``DEFAULT_ITEM_URL``.

        The scraped values are printed.  Exceptions are routed through
        ``BaseExceptionHandler.handle_exceptions`` by the decorator above.
        """
        if item_url is None:
            item_url = self.DEFAULT_ITEM_URL

        self.driver.get(url)
        self.driver.maximize_window()
        time.sleep(5)

        saved_cookie = self._load_saved_cookie()
        print("获取的cookie=======", saved_cookie)
        # Without a saved cookie we simply continue; the price loop below
        # falls back to an interactive login when the page demands one.
        if saved_cookie is not None:
            self.driver.add_cookie(saved_cookie)

        self.driver.get(item_url)
        time.sleep(random.randint(1, 10))
        self.driver.switch_to.default_content()

        # Ctrl+F5: force a reload of the product page.
        actions = ActionChains(self.driver)
        actions.key_down(Keys.CONTROL)
        actions.send_keys(Keys.F5)
        actions.key_up(Keys.CONTROL)
        actions.perform()

        price = self._wait_for_price()
        title = self.locate_element("xpath", "//*[@class='sku-name']").text
        img_url = self._absolute_url(
            self.locate_element("xpath", "//*[@id='spec-img']").get_attribute('data-origin'))
        img_urls = [
            element.get_attribute('src')
            for element in self.driver.find_elements("xpath", "//*[@id='spec-list']/ul/li/img")
        ]
        # Detail images are CSS backgrounds, so they are read from the
        # computed style instead of from an attribute.
        detail_img_urls = [
            self.driver.execute_script(self.BACKGROUND_IMAGE_JS, element)
            for element in self.driver.find_elements(
                "xpath", "//*[@id='J-detail-content']/div[4]/div")
        ]
        print("详情图====", detail_img_urls)
        print("主图====", img_urls)
        print(f"title:{title}\nimg_url:{img_url}\nprice:{price}")

        # Switch to the second tab of the detail area before reading the model.
        self.locate_element("xpath", "//*[@id='detail']/div[1]/ul/li[2]").click()
        models = self.locate_element("xpath", "//*[@id='detail']/div[2]/div[2]/div[1]/p[3]").text
        print("型号====", models)

    def _load_saved_cookie(self):
        """Return the stored cookie named ``COOKIE_NAME``, or None if unavailable."""
        try:
            with open(self.COOKIE_FILE, 'rb') as f:
                # NOTE: pickle can execute arbitrary code while loading; this is
                # acceptable only because the file is written by ``login``.
                data = pickle.load(f)
        except (FileNotFoundError, EOFError, pickle.UnpicklingError) as err:
            print(f"No usable cookie file at {self.COOKIE_FILE}: {err}")
            return None
        return next((item for item in data if item.get('name') == self.COOKIE_NAME), None)

    def _wait_for_price(self):
        """Return the price text, refreshing or re-logging in until it appears.

        An empty price triggers a page refresh (at most
        ``MAX_EMPTY_PRICE_REFRESHES`` times).  A missing price element triggers
        ``login`` (at most ``MAX_LOGIN_ATTEMPTS`` times), followed by one final
        lookup whose ``NoSuchElementException`` is allowed to propagate.
        """
        login_attempts = 0
        refreshes = 0
        while True:
            try:
                price = self.locate_element("xpath", self.PRICE_XPATH).text
            except NoSuchElementException:
                print("价格元素未找到,正在重新加载页面...")
                self.login(self.ReadYamlRender.test_get_value("jd_name"),
                           self.ReadYamlRender.test_get_value("jd_pwd"))
                login_attempts += 1
                if login_attempts >= self.MAX_LOGIN_ATTEMPTS:
                    break
                continue

            if price:
                return price
            if refreshes >= self.MAX_EMPTY_PRICE_REFRESHES:
                print(f"Price still empty after {refreshes} refreshes; continuing without it")
                return price
            refreshes += 1
            minu = random.randint(1, 10)
            print(f"价格为空,正在重新加载页面...,下一次刷新在{minu}秒后")
            self.driver.refresh()
            time.sleep(minu)

        # Last chance after the final login attempt; raises if still missing.
        return self.locate_element("xpath", self.PRICE_XPATH).text

    @staticmethod
    def _absolute_url(value):
        """Prefix a scheme-less URL with ``https:``; pass None and full URLs through."""
        if value is None:
            return None
        if value.startswith(("http://", "https://")):
            return value
        return "https:" + value

    def locate_element(self, method, element_locator, retries=3):
        """Find one element, refreshing the page between failed attempts.

        Args:
            method: Locator strategy passed to ``driver.find_element``.
            element_locator: Locator value for that strategy.
            retries: Total number of lookup attempts (at least one is made).

        Returns:
            The element returned by ``driver.find_element``.

        Raises:
            NoSuchElementException: when every attempt failed.
        """
        attempts = max(1, retries)
        for attempt in range(1, attempts + 1):
            try:
                return self.driver.find_element(method, element_locator)
            except NoSuchElementException as err:
                print(f"Unable to locate element: {element_locator}")
                if attempt >= attempts:
                    # Keep the driver's original error attached as the cause.
                    raise NoSuchElementException(
                        f"Unable to locate element: {element_locator}") from err
                minu = random.randint(1, 10)
                print(f"{element_locator}为空,正在重新加载页面...,下一次刷新在{minu}秒后")
                self.driver.refresh()
                time.sleep(minu)

    def login(self, name, passwd):
        """Submit the login form and persist the resulting cookies.

        Args:
            name: Account name typed into the ``loginname`` field.
            passwd: Password typed into the ``nloginpwd`` field.

        After submitting, waits a fixed 10 seconds, then pickles
        ``driver.get_cookies()`` into ``COOKIE_FILE`` and prints them.
        """
        username_field = self.locate_element("id", 'loginname')
        password_field = self.locate_element("id", 'nloginpwd')
        # login() may run several times in a row; clear the fields so the
        # credentials are not appended to what an earlier attempt typed.
        username_field.clear()
        password_field.clear()
        username_field.send_keys(name)
        password_field.send_keys(passwd)
        password_field.send_keys(Keys.RETURN)
        time.sleep(10)

        cookies = self.driver.get_cookies()
        cookie_dir = os.path.dirname(self.COOKIE_FILE)
        if cookie_dir:
            # Make sure the target directory exists before the first save.
            os.makedirs(cookie_dir, exist_ok=True)
        with open(self.COOKIE_FILE, 'wb') as f:
            pickle.dump(cookies, f)
        # NOTE(review): this prints session cookies to stdout — consider
        # removing it outside of debugging.
        print("全部的cookie=======", cookies)
if __name__ == '__main__':
    # Script entry point: start at the JD login page and scrape the product.
    selenium_driver = SeleniumDriver()
    try:
        selenium_driver.browser_type(
            url="https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fwww.jd.com%2F")
    finally:
        # Always end the browser session so no Chrome process is left behind,
        # even when scraping stops early.
        selenium_driver.driver.quit()
import functools
import random
import time
from functools import wraps
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
class BaseExceptionHandler:
    """Own the shared Chrome driver and dispatch exceptions to handlers.

    ``handle_exceptions`` is a decorator: it runs the wrapped callable and,
    when an exception escapes, passes it to the handler registered in
    ``exception_handlers`` (falling back to ``handle_default_exception``).
    The exception is not re-raised, so the wrapped call then returns None.
    """

    # Maps exception classes to the callables that handle them; replaced by
    # ``__init__`` with the built-in registrations.
    exception_handlers = {}
    # Shared WebDriver instance; None until the first instance is constructed.
    driver = None

    def __init__(self):
        """Start Chrome and register the built-in exception handlers."""
        options = webdriver.ChromeOptions()
        options.add_argument('--disable-blink-features=AutomationControlled')
        # Stored on the class so the classmethod handlers can reach the driver.
        BaseExceptionHandler.driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()), options=options)
        BaseExceptionHandler.exception_handlers = {
            NoSuchElementException: self.NoSuchElement_error
        }

    @classmethod
    def handle_exceptions(cls, func):
        """Decorate ``func`` so its exceptions go to the registered handlers.

        Returns:
            A wrapper that returns ``func``'s result, or None when an
            exception was raised and handed to a handler.
        """
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print("遇到了什么类型的错误========", type(e))
                handler = cls._find_handler(e)
                if handler is not None:
                    print("==============")
                else:
                    handler = cls.handle_default_exception
                handler(e)
        return wrapper

    @classmethod
    def _find_handler(cls, exc):
        """Return the handler for ``exc``'s class or nearest registered base, else None."""
        # Walk the MRO so a subclass of a registered exception is handled by
        # its parent's handler, mirroring how ``except`` clauses match.
        for klass in type(exc).__mro__:
            handler = cls.exception_handlers.get(klass)
            if handler is not None:
                return handler
        return None

    @classmethod
    def handle_default_exception(cls, e):
        """Fallback handler: report the error, wait 1-10 s, refresh the page."""
        print("Error occurred: ", str(e))
        minu = random.randint(1, 10)
        # Announce the delay before sleeping so the message is accurate.
        print(f"走到了基处理类中...,下一次刷新在{minu}秒后")
        time.sleep(minu)
        # The driver only exists after an instance has been constructed.
        if BaseExceptionHandler.driver is not None:
            BaseExceptionHandler.driver.refresh()

    @classmethod
    def NoSuchElement_error(cls, e):
        """Handler for ``NoSuchElementException``: report the exception."""
        print("NoSuchElementException occurred:", e)