selenium实战之Python+Selenium爬取商品数据

实战目标:爬取商品信息,包括商品的价格、主图、详情图。

遇到的问题及解决方案:

  • 1.先加载已经保存的cookie值,以免每次运行都需要重新登录
  • 2.抓取到的价格经常为空——通过反复刷新页面,直到获取到价格
  • 3.每次定位元素加了重试机制
  • 4.最后实在抓取不成功的,BaseExceptionHandler处理
  • 5.BaseExceptionHandler处理不同的异常定义了不同的异常处理方案
  • 6.获取详情图(元素的background-image)用到了execute_script
# -*- coding:UTF-8 -*-
import os
import pickle
import random
import time
from utils.yamltools import ReadYamlRender
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver import Keys, ActionChains

from testapi.ExceptSelenium import BaseExceptionHandler
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager


class SeleniumDriver(BaseExceptionHandler):
    """Selenium scraper for a JD product page: price, title, images, model.

    The browser is not created in this class; ``self.driver`` is the Chrome
    driver that ``BaseExceptionHandler.__init__`` stores as a class attribute.
    """

    # Pickle file used to persist browser cookies between runs.
    COOKIE_FILE = '../data/cookie.pkl'
    # Name of the single saved cookie that is re-injected before scraping.
    COOKIE_NAME = 'aaa'
    # Locator of the price text on the product page.
    PRICE_XPATH = "//*[@class='p-price']/span[2]"
    # How many times login is attempted when the price element is missing.
    MAX_LOGIN_ATTEMPTS = 3
    # Upper bound on page refreshes while the price element exists but is empty.
    MAX_EMPTY_PRICE_REFRESHES = 20
    # Returns the computed background-image of arguments[0] with the
    # surrounding url(...) wrapper (and optional quotes) stripped. A raw
    # string keeps the regex backslashes intact for the JavaScript engine.
    _BACKGROUND_IMAGE_JS = r"""
        var style = window.getComputedStyle(arguments[0], null);
        var background_image = style.getPropertyValue("background-image");
        return background_image.replace(/^url\(["']?(.*?)["']?\)$/, "$1");
    """

    def __init__(self):
        # Project helper used by _wait_for_price to read the login credentials.
        self.ReadYamlRender = ReadYamlRender()
        super().__init__()

    @BaseExceptionHandler.handle_exceptions
    def browser_type(self, url, product_url="https://item.jd.com/100000541163.html"):
        """Open ``url``, restore the saved cookie, then scrape ``product_url``.

        Prints the price, title, main image URL, thumbnail URLs, detail image
        URLs and the model text. Exceptions raised here are intercepted by the
        ``handle_exceptions`` decorator, so the call returns ``None`` either way.

        Args:
            url: Page opened first so that a cookie can be attached to its domain.
            product_url: Product page to scrape; defaults to the original
                hard-coded item.
        """
        self.driver.get(url)
        self.driver.maximize_window()
        time.sleep(5)
        self._load_saved_cookie()

        self.driver.get(product_url)
        time.sleep(random.randint(1, 10))
        self.driver.switch_to.default_content()
        # Send Ctrl+F5 to the page (hard reload) before reading the price.
        actions = ActionChains(self.driver)
        actions.key_down(Keys.CONTROL)
        actions.send_keys(Keys.F5)
        actions.key_up(Keys.CONTROL)
        actions.perform()

        price = self._wait_for_price()
        title = self.locate_element("xpath", "//*[@class='sku-name']").text
        # data-origin is protocol-relative; the attribute may be absent, in
        # which case get_attribute returns None and no URL can be built.
        data_origin = self.locate_element("xpath", "//*[@id='spec-img']").get_attribute('data-origin')
        img_url = ("https:" + data_origin) if data_origin else None
        img_urls = [
            element.get_attribute('src')
            for element in self.driver.find_elements("xpath", "//*[@id='spec-list']/ul/li/img")
        ]
        # Detail images are CSS backgrounds, so they are read via JavaScript.
        detail_img_urls = [
            self.driver.execute_script(self._BACKGROUND_IMAGE_JS, element)
            for element in self.driver.find_elements("xpath", "//*[@id='J-detail-content']/div[4]/div")
        ]
        print("详情图====", detail_img_urls)
        print("主图====", img_urls)
        print(f"title:{title}\nimg_url:{img_url}\nprice:{price}")
        # Switch to the second detail tab before reading the model text.
        self.locate_element("xpath", "//*[@id='detail']/div[1]/ul/li[2]").click()
        models = self.locate_element("xpath", "//*[@id='detail']/div[2]/div[2]/div[1]/p[3]").text
        print("型号====", models)

    def _load_saved_cookie(self):
        """Add the cookie named ``COOKIE_NAME`` from ``COOKIE_FILE``, if present."""
        if not os.path.exists(self.COOKIE_FILE):
            # First run (or cookie file removed): continue without a cookie
            # instead of aborting the whole scrape with FileNotFoundError.
            print("未找到已保存的cookie文件,跳过cookie加载:", self.COOKIE_FILE)
            return
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # acceptable because the file is written by login() of this program;
        # never point COOKIE_FILE at data from an untrusted source.
        with open(self.COOKIE_FILE, 'rb') as f:
            data = pickle.load(f)
        result = next((item for item in data if item['name'] == self.COOKIE_NAME), None)
        print("获取的cookie=======", result)
        if result is not None:
            # add_cookie(None) would raise, so only inject a cookie that exists.
            self.driver.add_cookie(result)

    def _wait_for_price(self):
        """Return the price text, refreshing or logging in until it appears.

        Returns:
            The price text. It may be empty when the refresh limit is reached.

        Raises:
            NoSuchElementException: If the price element is still missing
                after ``MAX_LOGIN_ATTEMPTS`` login attempts.
        """
        login_attempts = 0
        empty_refreshes = 0
        while True:
            try:
                price = self.locate_element("xpath", self.PRICE_XPATH).text
            except NoSuchElementException:
                print("价格元素未找到,正在重新加载页面...")
                self.login(self.ReadYamlRender.test_get_value("jd_name"),
                           self.ReadYamlRender.test_get_value("jd_pwd"))
                login_attempts += 1
                if login_attempts >= self.MAX_LOGIN_ATTEMPTS:
                    # Final lookup; if it still fails the exception propagates
                    # to the handle_exceptions decorator on browser_type.
                    return self.locate_element("xpath", self.PRICE_XPATH).text
                continue
            if price:
                return price
            empty_refreshes += 1
            if empty_refreshes >= self.MAX_EMPTY_PRICE_REFRESHES:
                # Bounded so a permanently empty price cannot loop forever.
                print(f"价格连续{empty_refreshes}次为空,停止刷新")
                return price
            minu = random.randint(1, 10)
            print(f"价格为空,正在重新加载页面...,下一次刷新在{minu}秒后")
            self.driver.refresh()
            time.sleep(minu)

    def locate_element(self, method, element_locator, retry_count=3):
        """Find one element, refreshing the page between failed attempts.

        Args:
            method: Locator strategy passed to ``driver.find_element``
                (e.g. ``"xpath"`` or ``"id"``).
            element_locator: Locator value for that strategy.
            retry_count: Maximum number of lookups before giving up.

        Returns:
            The element returned by ``driver.find_element``.

        Raises:
            NoSuchElementException: If every attempt fails.
        """
        for attempt in range(1, retry_count + 1):
            try:
                return self.driver.find_element(method, element_locator)
            except NoSuchElementException:
                print(f"Unable to locate element: {element_locator}")
                if attempt >= retry_count:
                    break
                minu = random.randint(1, 10)
                print(f"{element_locator}为空,正在重新加载页面...,下一次刷新在{minu}秒后")
                self.driver.refresh()
                time.sleep(minu)
        raise NoSuchElementException(f"Unable to locate element: {element_locator}")

    def login(self, name, passwd):
        """Submit the login form on the current page and save the cookies.

        The current page must contain elements with ids ``loginname`` and
        ``nloginpwd``; otherwise ``locate_element`` raises
        ``NoSuchElementException``.

        Args:
            name: Account name typed into the ``loginname`` field.
            passwd: Password typed into the ``nloginpwd`` field.
        """
        username_field = self.locate_element("id", 'loginname')
        password_field = self.locate_element("id", 'nloginpwd')
        username_field.send_keys(name)
        password_field.send_keys(passwd)
        password_field.send_keys(Keys.RETURN)  # Press Enter to submit the form.
        time.sleep(10)
        cookies = self.driver.get_cookies()
        # Create the target directory first so the write cannot fail on a
        # fresh checkout where ../data does not exist yet.
        os.makedirs(os.path.dirname(self.COOKIE_FILE) or '.', exist_ok=True)
        with open(self.COOKIE_FILE, 'wb') as f:
            pickle.dump(cookies, f)
        print("全部的cookie=======", cookies)


if __name__ == '__main__':
    # Script entry point: start at the JD login page, then scrape the product.
    selenium_driver = SeleniumDriver()
    try:
        selenium_driver.browser_type(url="https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fwww.jd.com%2F")
    finally:
        # Always end the session so Chrome and chromedriver do not stay
        # running after the script finishes or is interrupted.
        selenium_driver.driver.quit()

import functools
import random
import time
from functools import wraps

from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager


class BaseExceptionHandler:
    """Owns the shared Chrome driver and dispatches exceptions to handlers.

    Methods decorated with ``handle_exceptions`` never propagate an
    ``Exception``: it is passed to the handler registered for its type (or
    for the nearest base class), falling back to ``handle_default_exception``.
    """

    # Maps exception classes to the callable that handles them.
    # Filled in by __init__; empty until the first instance is created.
    exception_handlers = {}

    # Shared driver, kept on the class so classmethod handlers can reach it.
    driver = None

    def __init__(self):
        """Start Chrome and register the built-in exception handlers."""
        # The module-level name is still assigned for backward compatibility;
        # code in this class only relies on the class attribute below.
        global driver
        options = webdriver.ChromeOptions()
        options.add_argument('--disable-blink-features=AutomationControlled')
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
        BaseExceptionHandler.driver = driver
        BaseExceptionHandler.exception_handlers = {
            NoSuchElementException: self.NoSuchElement_error
        }

    @classmethod
    def _resolve_handler(cls, exc):
        """Return the handler for ``exc``'s most specific registered type, or None."""
        # Walking the MRO lets a handler registered for a base class also
        # cover its subclasses, while exact matches still take priority.
        for exc_type in type(exc).__mro__:
            handler = cls.exception_handlers.get(exc_type)
            if handler is not None:
                return handler
        return None

    @classmethod
    def handle_exceptions(cls, func):
        """Decorator: run ``func`` and route any ``Exception`` to a handler.

        Args:
            func: The callable to protect.

        Returns:
            A wrapper that returns ``func``'s result, or ``None`` when an
            exception was raised and handled.
        """
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print("遇到了什么类型的错误========", type(e))
                handler = cls._resolve_handler(e)
                if handler is not None:
                    print("==============")
                else:
                    handler = cls.handle_default_exception
                handler(e)
                return None

        return wrapper

    @classmethod
    def handle_default_exception(cls, e):
        """Fallback handler: report the error, wait 1-10 s, refresh the page.

        Args:
            e: The exception that was raised.
        """
        print("Error occurred: ", str(e))
        minu = random.randint(1, 10)
        # Announce the wait before sleeping so the message matches what happens.
        print(f"走到了基处理类中...,下一次刷新在{minu}秒后")
        time.sleep(minu)
        # The driver is None until an instance has been created; refreshing
        # then would raise AttributeError from inside the error handler.
        if BaseExceptionHandler.driver is not None:
            BaseExceptionHandler.driver.refresh()

    @classmethod
    def NoSuchElement_error(cls, e):
        """Handler for ``NoSuchElementException``: only reports the event.

        Args:
            e: The exception that was raised (unused).
        """
        print("NoSuchElementException occurred:")


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值