Selenium web scraping

These two scripts show how to scrape web pages with Python's Selenium library driving Chrome. Each one first configures Chrome options to evade automation detection, then visits an e-commerce site (Vipshop in case 1, Suning in case 2), scrolls the results page to trigger lazy loading, and walks through multiple pages of product data such as price, name, and discount. The scraped items are stored in a MongoDB database.
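
Before diving into the full classes, here is a minimal standalone sketch of the anti-detection Chrome options that both cases configure; it only isolates the setup shown below and assumes chromedriver is available on PATH (Selenium 4 can also manage the driver automatically):

from selenium import webdriver

options = webdriver.ChromeOptions()
# hide the Blink automation fingerprint that sites use to spot webdriver sessions
options.add_argument('--disable-blink-features=AutomationControlled')
# remove the "Chrome is being controlled by automated test software" bar
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
# skip image downloads to speed up page loads
options.add_experimental_option('prefs', {"profile.managed_default_content_settings.images": 2})

browser = webdriver.Chrome(options=options)
browser.get('https://www.vip.com')
# ... locate elements and scrape, then clean up
browser.quit()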

Case 1

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import pymongo


class WeiPinHui:
    def __init__(self):
        self.db = pymongo.MongoClient(host='localhost', port=27017)  # local MongoDB client
        self.collection = self.db.python.weipinhui  # database 'python', collection 'weipinhui'
        self.url = 'https://www.vip.com'

        options = webdriver.ChromeOptions()

        options.add_argument('--disable-blink-features=AutomationControlled')  # evade detection so the site cannot tell the browser is driven by a webdriver

        # hide the "Chrome is being controlled by automated test software" banner
        options.add_experimental_option('useAutomationExtension', False)  # drop the developer-mode warning
        options.add_experimental_option('excludeSwitches', ['enable-automation'])  # remove the yellow info bar

        options.add_experimental_option('detach', True)  # keep the browser open after the script exits; repeated debugging runs can then pile up windows and hang or even freeze the machine, so this is not recommended. If you do use it, wrap the rest of the code in exception handling that calls browser.quit()

        # skip image loading to speed Selenium up
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option('prefs', prefs)

        self.browser = webdriver.Chrome(options=options)
        self.browser.maximize_window()  # maximize the browser window


    def into_page(self):
        try:
            self.browser.get(self.url)
            wait = WebDriverWait(self.browser, 7)
            # print(self.browser.page_source)
            # search_input = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'c-search-input  J-search-input')))  # By.CLASS_NAME cannot handle compound class names (note the spaces); use another locator instead
            search_input = wait.until(EC.presence_of_element_located((By.XPATH, '//input[@class="c-search-input  J-search-input"]')))
            button = wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@class="c-search-button  J-search-button  J_fake_a"]')))
            search_input.send_keys('口红')  # search keyword: 'lipstick'
            time.sleep(3)  # give the page time to finish loading after the keyword is entered
            button.click()
            # time.sleep(3)
        except Exception as e:
            print('Error:', repr(e))
            self.browser.quit()


    def drop_down(self):
        for i in range(1, 13):
            # j = i / 10
            # js = f'document.documentElement.scrollTop = document.documentElement.scrollHeight * {j}'
            js = f'document.documentElement.scrollTop = {i * 1000}'  # scroll down in 1000px steps to trigger lazy loading
            self.browser.execute_script(js)
            time.sleep(random.randint(500, 800) / 1000)  # random 0.5-0.8s pause between scrolls


    def parse_data(self):
        self.drop_down()
        # print(self.browser.page_source)
        divs = self.browser.find_elements(By.XPATH, '//div[@class="c-goods-item  J-goods-item c-goods-item--auto-width"]')
        for i in divs:
            item = {}
            sale_price = i.find_element(By.XPATH, './/div[@class="c-goods-item__sale-price J-goods-item__sale-price"]').text
            try:
                discount = i.find_element(By.XPATH, './/div[@class="c-goods-item__discount  J-goods-item__discount"]').text
            except:
                discount = 'N/A'
            try:
                good_name = i.find_element(By.XPATH, './/div[@class="c-goods-item__name  c-goods-item__name--two-line"]').text
            except:
                good_name = 'N/A'
            item['sale_price'] = sale_price
            item['discount'] = discount
            item['good_name'] = good_name
            print(item)
            self.save_data(item)
        self.next_page()


    def next_page(self):
        try:
            button = self.browser.find_element(By.XPATH, '//*[@id="J_nextPage_link"]')
            if button:
                button.click()
                self.parse_data()
        except Exception as e:
            print('Error:', repr(e))
            self.browser.close()


    def save_data(self, item):
        self.collection.insert_one(item)
        print('Inserted successfully')


    def main(self):
        self.into_page()
        self.parse_data()
        self.browser.quit()
        self.db.close()


if __name__ == '__main__':
    wph = WeiPinHui()
    wph.main()
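
The comment on the detach option above suggests wrapping the run in exception handling so that browser.quit() always fires. A minimal sketch of that pattern, reusing the class above (the try/finally wrapper itself is an assumption, not part of the original code):

if __name__ == '__main__':
    wph = WeiPinHui()
    try:
        wph.into_page()
        wph.parse_data()
    finally:
        # release the driver and the MongoDB connection even if a locator
        # times out or the page structure changes mid-run
        wph.browser.quit()
        wph.db.close()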

Case 2

import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pymongo


class SuNingYiGou:
    def __init__(self):
        self.url = 'https://search.suning.com/%E7%94%B5%E8%84%91/'  # search results for '电脑' (computers)
        self.db = pymongo.MongoClient(host='localhost', port=27017)  # local MongoDB client
        self.connection = self.db.python.snyg  # database 'python', collection 'snyg'
        options = webdriver.ChromeOptions()
        options.add_argument('--disable-blink-features=AutomationControlled')  # evade automation detection
        options.add_experimental_option('useAutomationExtension', False)  # drop the developer-mode warning
        options.add_experimental_option('excludeSwitches', ['enable-automation'])  # remove the yellow info bar
        prefs = {"profile.managed_default_content_settings.images": 2}  # skip image loading to speed Selenium up
        options.add_experimental_option('prefs', prefs)
        self.browser = webdriver.Chrome(options=options)
        self.browser.maximize_window()


    def into_page(self):
        self.browser.get(self.url)
        wait = WebDriverWait(self.browser, 5)
        try:
           wait.until(EC.element_to_be_clickable((By.ID, 'nextPage')))
        except Exception as e:
            print('Error:', e)
            self.browser.quit()
        else:
            self.parse_data()


    def drop_down(self):
        for i in range(1, 13):
            js = f'document.documentElement.scrollTop = {i * 1000}'
            self.browser.execute_script(js)
            time.sleep(random.randint(1, 2) / 5)


    def parse_data(self):
        self.drop_down()
        # print(self.browser.page_source)
        li_list = self.browser.find_elements(By.XPATH, '//ul[@class="general clearfix"]/li')
        for li in li_list:
            item = {}
            item['price'] = li.find_element(By.XPATH, './/span[@class="def-price"]').text
            item['title'] = li.find_element(By.XPATH, './/div[@class="title-selling-point"]/a').text
            try:
                item['evaluatenum'] = li.find_element(By.XPATH, './/div[@class="evaluate-old clearfix"]/div/a/i').text
            except:
                item['evaluatenum'] = 'N/A'
            try:
                item['discount'] = ' '.join([i.text for i in li.find_elements(By.XPATH, './/div[@class="sales-label"]/span')])
            except:
                item['discount'] = 'N/A'
            # print(item)
            self.save_data(item)
        self.next_page()


    def save_data(self, item):
        print(item)
        self.connection.insert_one(item)
        print('Inserted successfully')


    def next_page(self):
        try:
            button = self.browser.find_element(By.ID, 'nextPage')
            button.click()
            self.parse_data()
        except Exception as e:
            print('No next page:', repr(e))


    def main(self):
        self.into_page()
        self.browser.quit()
        self.db.close()


if __name__ == '__main__':
    snyg = SuNingYiGou()
    snyg.main()
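
Both drop_down methods scroll a fixed 12 steps of 1000 px, which can under- or over-shoot depending on how many lazily loaded items the results page holds. An alternative (a sketch, not part of the original code) is to keep scrolling until document.documentElement.scrollHeight stops growing:

import time

def drop_down_until_stable(browser, pause=0.6, max_rounds=30):
    """Scroll to the bottom repeatedly until the page height stops changing."""
    last_height = browser.execute_script('return document.documentElement.scrollHeight')
    for _ in range(max_rounds):
        browser.execute_script('window.scrollTo(0, document.documentElement.scrollHeight)')
        time.sleep(pause)  # give lazily loaded goods time to render
        new_height = browser.execute_script('return document.documentElement.scrollHeight')
        if new_height == last_height:
            break  # no new content appeared; stop scrolling
        last_height = new_height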
