# 案例一 (Case 1): Selenium scraper for vip.com search results
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import pymongo
class WeiPinHui:
    """Scrape vip.com search results with Selenium and store them in MongoDB.

    The scraper opens ``https://www.vip.com`` in Chrome, submits a search,
    walks the result pages through the "next page" link and inserts one
    document per product tile into the ``weipinhui`` collection of the
    ``python`` database on the local MongoDB server.

    ``main()`` owns teardown: the browser and the MongoDB client are always
    released there, so the individual steps never close them on their own.
    """

    # Placeholder stored when a field is absent from a product tile.
    MISSING = '空'

    # Multi-class elements are matched on the full ``class`` attribute via
    # XPath; By.CLASS_NAME cannot take a value that contains spaces.
    _SEARCH_INPUT = '//input[@class="c-search-input J-search-input"]'
    _SEARCH_BUTTON = '//a[@class="c-search-button J-search-button J_fake_a"]'
    _GOODS_ITEM = '//div[@class="c-goods-item J-goods-item c-goods-item--auto-width"]'
    _SALE_PRICE = './/div[@class="c-goods-item__sale-price J-goods-item__sale-price"]'
    _DISCOUNT = './/div[@class="c-goods-item__discount J-goods-item__discount"]'
    _GOOD_NAME = './/div[@class="c-goods-item__name c-goods-item__name--two-line"]'
    _NEXT_PAGE = '//*[@id="J_nextPage_link"]'

    def __init__(self):
        """Connect to MongoDB and start a Chrome session set up for scraping.

        Raises:
            Exception: whatever ``webdriver.Chrome`` raises when the browser
                cannot be started; the MongoDB client is closed first.
        """
        # NOTE: despite its name, ``self.db`` holds the MongoClient itself.
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.db.python.weipinhui
        self.url = 'https://www.vip.com'

        options = webdriver.ChromeOptions()
        # Hide the automation flag so the site is less likely to notice that
        # the browser is driven by a webdriver.
        options.add_argument('--disable-blink-features=AutomationControlled')
        # Hide the "Chrome is being controlled by automated test software"
        # banner: suppress the developer warning and the yellow info bar.
        options.add_experimental_option('useAutomationExtension', False)
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Keep the browser alive when the driver object goes away. This is
        # only safe because main() always calls browser.quit() in a finally
        # block; without that, repeated debug runs pile up Chrome processes.
        options.add_experimental_option('detach', True)
        # Do not load images; this speeds Selenium up.
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option('prefs', prefs)

        try:
            self.browser = webdriver.Chrome(options=options)
        except Exception:
            # Do not leak the MongoDB client when the browser cannot start.
            self.db.close()
            raise
        self.browser.maximize_window()

    def into_page(self, keyword='口红'):
        """Open the home page and submit a search for ``keyword``.

        Args:
            keyword: Text typed into the search box (defaults to the
                original hard-coded query).

        Returns:
            True when the search was submitted, False when any step failed.
            The failure is printed; the browser is left open for ``main()``
            to release.
        """
        try:
            self.browser.get(self.url)
            wait = WebDriverWait(self.browser, 7)
            search_box = wait.until(
                EC.presence_of_element_located((By.XPATH, self._SEARCH_INPUT)))
            search_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, self._SEARCH_BUTTON)))
            search_box.send_keys(keyword)
            # Give the page time to finish loading after the keyword is typed.
            time.sleep(3)
            search_button.click()
        except Exception as e:
            # Start-up boundary: report instead of silently swallowing, and
            # let the caller decide what to do with the failed session.
            print('报错信息:', repr(e))
            return False
        return True

    def drop_down(self):
        """Scroll down in twelve 1000px steps with a short random pause each."""
        for i in range(1, 13):
            js = f'document.documentElement.scrollTop = {i * 1000}'
            self.browser.execute_script(js)
            # 0.5-0.8 s between steps.
            time.sleep(random.randint(500, 800) / 1000)

    def parse_data(self, max_pages=None):
        """Scrape the current result page and every following page.

        Pages are walked in a loop rather than through mutual recursion with
        ``next_page()``, so the crawl depth is not tied to the interpreter's
        recursion limit and errors surface where they happen.

        Args:
            max_pages: Optional upper bound on the number of pages to scrape.
                ``None`` (the default) keeps going until there is no usable
                "next page" link.
        """
        if max_pages is not None and max_pages < 1:
            return
        pages_done = 0
        while True:
            self._parse_current_page()
            pages_done += 1
            if max_pages is not None and pages_done >= max_pages:
                return
            if not self._go_to_next_page():
                return

    def next_page(self):
        """Move to the next result page and scrape from there onwards.

        Kept for callers that used it directly; ``parse_data()`` itself no
        longer calls back into this method.
        """
        if self._go_to_next_page():
            self.parse_data()

    def save_data(self, item):
        """Insert one scraped ``item`` (a dict) into the MongoDB collection."""
        self.collection.insert_one(item)
        print('插入成功')

    def main(self):
        """Run the whole crawl and always release the browser and MongoDB."""
        try:
            if self.into_page():
                self.parse_data()
        finally:
            try:
                self.browser.quit()
            finally:
                self.db.close()

    def _parse_current_page(self):
        """Scroll the current page, then store one document per product tile."""
        self.drop_down()
        tiles = self.browser.find_elements(By.XPATH, self._GOODS_ITEM)
        for tile in tiles:
            item = {
                'sale_price': self._text_or_missing(tile, self._SALE_PRICE),
                'discount': self._text_or_missing(tile, self._DISCOUNT),
                'good_name': self._text_or_missing(tile, self._GOOD_NAME),
            }
            print(item)
            self.save_data(item)

    def _text_or_missing(self, element, xpath):
        """Return the text of the first match under ``element`` or MISSING.

        ``find_elements`` returns an empty list when nothing matches, so no
        exception handling is needed to detect an absent field.
        """
        matches = element.find_elements(By.XPATH, xpath)
        return matches[0].text if matches else self.MISSING

    def _go_to_next_page(self):
        """Click the "next page" link; return True when the click succeeded."""
        # Imported here so the block does not depend on a new file-level import.
        from selenium.common.exceptions import WebDriverException

        links = self.browser.find_elements(By.XPATH, self._NEXT_PAGE)
        if not links:
            print('无下一页')
            return False
        try:
            links[0].click()
        except WebDriverException as e:
            print('报错信息:', repr(e))
            return False
        return True
if __name__ == '__main__':
    # Script entry point: build the vip.com scraper and run the full crawl.
    WeiPinHui().main()
# 案例二 (Case 2): Selenium scraper for search.suning.com search results
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pymongo
class SuNingYiGou:
    """Scrape search.suning.com result pages with Selenium into MongoDB.

    The scraper opens a fixed search URL (the path is the URL-encoded keyword
    "电脑"), walks the result pages through the ``nextPage`` element and
    inserts one document per list entry into the ``snyg`` collection of the
    ``python`` database on the local MongoDB server.

    ``main()`` owns teardown: the browser and the MongoDB client are always
    released there, so the individual steps never quit the browser themselves.
    """

    # Placeholder stored when a field is absent from a result entry.
    MISSING = '空'

    _RESULT_ITEMS = '//ul[@class="general clearfix"]/li'
    _PRICE = './/span[@class="def-price"]'
    _TITLE = './/div[@class="title-selling-point"]/a'
    _EVALUATE_NUM = './/div[@class="evaluate-old clearfix"]/div/a/i'
    _SALES_LABELS = './/div[@class="sales-label"]/span'
    _NEXT_PAGE_ID = 'nextPage'

    def __init__(self):
        """Connect to MongoDB and start a Chrome session set up for scraping.

        Raises:
            Exception: whatever ``webdriver.Chrome`` raises when the browser
                cannot be started; the MongoDB client is closed first.
        """
        self.url = 'https://search.suning.com/%E7%94%B5%E8%84%91/'
        # NOTE: ``self.db`` is the MongoClient and ``self.connection`` is the
        # target collection; the names are kept for existing callers.
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.connection = self.db.python.snyg

        options = webdriver.ChromeOptions()
        options.add_argument('--disable-blink-features=AutomationControlled')
        # Suppress the developer warning and the yellow automation info bar.
        options.add_experimental_option('useAutomationExtension', False)
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Do not load images.
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option('prefs', prefs)

        try:
            self.browser = webdriver.Chrome(options=options)
        except Exception:
            # Do not leak the MongoDB client when the browser cannot start.
            self.db.close()
            raise
        self.browser.maximize_window()

    def into_page(self):
        """Open the search URL and, once it is ready, scrape all pages.

        The page counts as ready when the ``nextPage`` element becomes
        clickable within five seconds.

        Returns:
            True when the page loaded and scraping ran, False when loading
            failed. The failure is printed; the browser is left open for
            ``main()`` to release.
        """
        try:
            self.browser.get(self.url)
            wait = WebDriverWait(self.browser, 5)
            wait.until(EC.element_to_be_clickable((By.ID, self._NEXT_PAGE_ID)))
        except Exception as e:
            # Start-up boundary: report and skip scraping.
            print('报错:', e)
            return False
        self.parse_data()
        return True

    def drop_down(self):
        """Scroll down in twelve 1000px steps with a short random pause each."""
        for i in range(1, 13):
            js = f'document.documentElement.scrollTop = {i * 1000}'
            self.browser.execute_script(js)
            # Either 0.2 s or 0.4 s between steps.
            time.sleep(random.randint(1, 2) / 5)

    def parse_data(self, max_pages=None):
        """Scrape the current result page and every following page.

        Pages are walked in a loop rather than through mutual recursion with
        ``next_page()``, so the crawl depth is not tied to the interpreter's
        recursion limit and errors surface where they happen.

        Args:
            max_pages: Optional upper bound on the number of pages to scrape.
                ``None`` (the default) keeps going until there is no usable
                ``nextPage`` element.
        """
        if max_pages is not None and max_pages < 1:
            return
        pages_done = 0
        while True:
            self._parse_current_page()
            pages_done += 1
            if max_pages is not None and pages_done >= max_pages:
                return
            if not self._go_to_next_page():
                return

    def save_data(self, item):
        """Print one scraped ``item`` (a dict) and insert it into MongoDB."""
        print(item)
        self.connection.insert_one(item)
        print('插入成功')

    def next_page(self):
        """Move to the next result page and scrape from there onwards.

        Kept for callers that used it directly; ``parse_data()`` itself no
        longer calls back into this method.
        """
        if self._go_to_next_page():
            self.parse_data()

    def main(self):
        """Run the whole crawl and always release the browser and MongoDB."""
        try:
            self.into_page()
        finally:
            try:
                self.browser.quit()
            finally:
                self.db.close()

    def _parse_current_page(self):
        """Scroll the current page, then store one document per list entry."""
        self.drop_down()
        entries = self.browser.find_elements(By.XPATH, self._RESULT_ITEMS)
        for entry in entries:
            labels = [
                label.text
                for label in entry.find_elements(By.XPATH, self._SALES_LABELS)
            ]
            item = {
                'price': self._text_or_missing(entry, self._PRICE),
                'title': self._text_or_missing(entry, self._TITLE),
                'evaluatenum': self._text_or_missing(entry, self._EVALUATE_NUM),
                # find_elements never raises for "no match", so an empty list
                # is what signals a missing discount.
                'discount': ' '.join(labels) if labels else self.MISSING,
            }
            self.save_data(item)

    def _text_or_missing(self, element, xpath):
        """Return the text of the first match under ``element`` or MISSING.

        ``find_elements`` returns an empty list when nothing matches, so no
        exception handling is needed to detect an absent field.
        """
        matches = element.find_elements(By.XPATH, xpath)
        return matches[0].text if matches else self.MISSING

    def _go_to_next_page(self):
        """Click the ``nextPage`` element; return True when the click succeeded."""
        # Imported here so the block does not depend on a new file-level import.
        from selenium.common.exceptions import WebDriverException

        buttons = self.browser.find_elements(By.ID, self._NEXT_PAGE_ID)
        if not buttons:
            print('无下一页')
            return False
        try:
            buttons[0].click()
        except WebDriverException as e:
            print('无下一页', repr(e))
            return False
        return True
if __name__ == '__main__':
    # Script entry point: build the Suning scraper and run the full crawl.
    SuNingYiGou().main()