爬虫selenium

升级pip
python -m pip install --upgrade pip
安装
pip install selenium  

selenium 使用

WebDriver对象是整个浏览器

bro=webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://www.taobao.com/')
定位到输入框
search_input=bro.find_element_by_id('q')
search_input.send_keys('Iphone')

定位到按钮
btn=bro.find_element_by_css_selector('.btn-search')
btn.click()
xpath定位
bro.find_element(By.XPATH,'')
'//input[@auto="off"]'
'//input[@auto="off" and/or @标签="属性"]'

文本定位
'//span[text()="文字内容"]'
'//*[contains(text(),"部分文本")]'

父级路径
'//div[text()="免费资源"]/..'
同级资源
'//div[text()="免费资源"]/../div[2]'
多选
//option|//a

css不能选择父节点
用/..表示
兄弟节点用::
[]/following-sibling::*所有的
[]/following-sibling::div
往上选择兄弟节点
[]/preceding-sibling::div



WebElement对象是元素范围内部
elem=bro.find_element()#进行二次查找

xpath('.//p')前面要加.没有则是全局网页

#隐式等待,每隔半秒查找一次,最大等待时长
bro.implicitly_wait(10)


#回退,前进
bro.back()
bro.forward()
#刷新页面
bro.refresh()     

css查找
find_elements_by_css_selector('#serch')
#+内容
class. id# 统一选择用[]
.+内容
滚轮
父子关系
父元素 > 子元素
find_elements_by_css_selector('#serch #inn>span')

div[name='sknet'] div [name='sknet']有空格上级元素

两个类型选择加,不能加()
#a > .plant,#a > .anim
按次序选择,选择第二子节点
span:nth-child(2)
倒数第2个选择
span:nth-last-child(2)
按类型选择
span:nth-of-type(1)
兄弟节点
h3 ~ span
#t1 h3~span
选择相邻的兄弟节点用+
h3 + span


from selenium.webdriver.common.action_chains import ActionChains

ac=ActionChains(bro)
#鼠标去移动到元素上
ac.move_to_element(
	bro.find_element_by_css_selector()
).perform()
5秒后定住界面(在浏览器控制台执行)
setTimeout(function(){debugger},5000)

alert 通知对话框

confirm 确认取消

prompt 操作对话框
对话框内容文字
bro.switch_to.alert.text
点击确认
bro.switch_to.alert.accept()
点击取消
bro.switch_to.alert.dismiss()
输入内容
bro.switch_to.alert.send_keys('12313123')


常用操作
get()
clear()清空操作
send_keys()追加输入

断言

Chrome浏览器类似,设置其options:
download.default_directory:设置下载路径
profile.default_content_settings.popups:设置为 0 禁止弹出窗口
	options = webdriver.ChromeOptions()
    prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': 'd:\\'}
    options.add_experimental_option('prefs', prefs)
    driver = webdriver.Chrome(chrome_options=options)
    driver.maximize_window()
    查找下载的文件
import os
a = os.listdir("d:\\")
for i in a:
   print(i)

ActionChains(dr).move_by_offset(200, 100).click().perform() # 鼠标左键点击, 200为x坐标, 100为y坐标
ActionChains(dr).move_by_offset(200, 100).context_click().perform() # 鼠标右键点击

案例:
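"""
Traverse the local asset folder
Refresh the website
Traverse the assets listed on the website
Compare them with the assets already in the folder

"""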
import os
from pickle import TRUE
import random
import keyboard
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from concurrent.futures import ThreadPoolExecutor
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json
import requests
import win32gui
import pyautogui
from concurrent.futures import (ThreadPoolExecutor, as_completed)

# Local folder (under the current working directory) that holds the
# downloaded assets.
AssetsPath = os.path.join(os.getcwd(), 'substance', '3d Assets')

# Assets that are already downloaded; only the keys are used.
Assets = {}

# Download address: listing page filtered to free assets.
url = 'https://substance3d.adobe.com/assets/allassets?free=true'

# Make sure the local "3d Assets" folder exists before it is scanned.
def creatPath():
    """Create the folder ``AssetsPath`` when it does not exist yet.

    Prints '文件夹已存在' instead when the folder is already present.
    """
    if os.path.exists(AssetsPath):
        print('文件夹已存在')
        return
    # exist_ok covers the folder appearing between the check and the call.
    os.makedirs(AssetsPath, exist_ok=True)


creatPath()

# Traverse the asset folder and its contents.
def traversal():
    """Walk ``AssetsPath`` and register every file found in ``Assets``.

    Each sub-directory path and each normalized file path is printed. A file
    is registered under its name up to the first '.', with underscores
    removed; the stored value is always ''.
    """
    for root, dirs, files in os.walk(AssetsPath):
        for dir_name in dirs:
            print(os.path.join(root, dir_name))
        for file in files:
            _f = file.split('.')[0]
            # NOTE(review): the source read `f.replace('', '')` (undefined
            # name, empty pattern) - the underscores look eaten by a markdown
            # export. '_' is assumed as the stripped character; confirm.
            _f = _f.replace('_', '')
            print(os.path.join(root, _f))
            Assets[_f] = ''


traversal()
print('已下载', len(Assets))

# Request the 'performance' log from the driver. Work on a copy so the
# shared DesiredCapabilities.CHROME dict is not modified.
caps = DesiredCapabilities.CHROME.copy()
caps['goog:loggingPrefs'] = {'performance': 'ALL'}

# Launch the browser.
options = webdriver.ChromeOptions()
prefs = {
    'download.prompt_for_download': True,
    'profile.default_content_settings.popups': 1,
    # NOTE(review): Chrome expects a directory path string for this pref;
    # the value 1 is kept from the original - confirm the intent.
    'download.default_directory': 1,
    'enableNetwork': True,
}

options.add_experimental_option('prefs', prefs)
# NOTE(review): executable_path / chrome_options / desired_capabilities are
# Selenium 3 style keyword arguments - confirm against the installed version.
bro = webdriver.Chrome(executable_path='./chromedriver',
                       chrome_options=options, desired_capabilities=caps)
bro.set_page_load_timeout(1000)
bro.set_script_timeout(1000)

bro.get(url)

# Block until the backtick key is pressed (original note: "wait for the
# verification hotkey" - presumably pressed after any manual verification
# in the browser; confirm).
keyboard.wait('`')

# Request the page again.
bro.get(url)

# Collect the category ("module") elements and their addresses.
# Original note: wait for loading - no explicit wait is performed here.
Elements = []

# XPath note kept from the original text: /following-sibling::div

# Parent element of the div whose text is "免费资源".
Elements.append(bro.find_element(By.XPATH, '//div[text()="免费资源"]/..'))
Elements.extend(bro.find_elements(
    By.XPATH, '//div[@class="cat-group"]/div/a'))  # category links

print("模块个数~~", len(Elements))

def loadAssets():
    """Read name, count and link from every element in ``Elements``.

    Returns:
        A list with one ``[name, count, href]`` entry per element. ``name``
        and ``count`` are the ``textContent`` of the first matches for
        ``.//div[1]`` and ``.//div[2]`` below the element; ``href`` is the
        element's ``href`` attribute.
    """
    _MMs = []
    for element in Elements:
        name_node = element.find_element(By.XPATH, './/div[1]')  # module name
        Mname = name_node.get_attribute("textContent")

        count_node = element.find_element(By.XPATH, './/div[2]')  # item count
        Mnum = count_node.get_attribute("textContent")

        href = element.get_attribute('href')  # module link
        _MMs.append([Mname, Mnum, href])

    return _MMs


MMs = loadAssets()

# Traverse the browser log.
def traversal_log(_log):
    """Return the first ``.sbsar`` URL found in a performance log.

    Args:
        _log: iterable of log entries (dicts). An entry's 'message' value is
            a JSON string; the URL is read from ``message -> params -> url``
            of the decoded object.

    Returns:
        The first URL whose text after the last '.' is 'sbsar', or None when
        no entry carries such a URL.

    Entries without a message or without the expected keys are skipped.
    A message that is not valid JSON has its error printed and is skipped.
    """
    for entry in _log:
        raw_message = entry.get('message')
        if not raw_message:
            continue
        try:
            message_dict = json.loads(raw_message)
        except (TypeError, ValueError) as e:
            print(str(e))
            continue
        try:
            file_download_url = message_dict['message']['params']['url']
        except (KeyError, TypeError):
            # Decoded message does not have the message/params/url shape.
            continue
        if not isinstance(file_download_url, str):
            continue
        if file_download_url.rsplit('.', 1)[-1] == 'sbsar':  # or 'fbx'
            print('file_download_url', file_download_url)
            return file_download_url
    return None

# Download.
def task(_url):
    """Download ``_url`` and save the response body under ``AssetsPath``.

    The file name is the text after the last '%' in the URL with its first
    two characters dropped (for a URL ending in '...filename%3Dname.sbsar'
    that yields 'name.sbsar').

    When the request fails or returns an HTTP error status, the error is
    printed and nothing is written.
    """
    print('begin')
    try:
        r = requests.get(_url)
        # Do not store an error page as if it were the asset.
        r.raise_for_status()
        print('连接成功')
    except Exception as e:
        # Best-effort worker: report and give up. The original fell through
        # here and then failed on the unbound name `r`.
        print(e)
        return
    _name = _url.split('%')[-1]
    _name = _name[2:]
    print('...', AssetsPath, _name)
    path = os.path.join(AssetsPath, _name)
    print('path', path)
    with open(path, "wb") as f:
        f.write(r.content)
    print('down over')

def _capture_download_url(ac, button, asset_name):
    """Click one download button and return the URL found in the log.

    The whole sequence is retried every 5 seconds until it completes without
    raising (this includes the case where no '另存为' window is found yet).
    May return None when the performance log holds no matching URL.
    """
    while True:
        try:
            print(asset_name, '不存在')
            k = button.find_element(By.XPATH, './/span')
            ac.move_to_element(k).perform()
            sleep(random.random() + 1)
            bro.execute_script('window.scrollBy(0,100)')
            ac.move_to_element(k).perform()
            ac.click(k).perform()
            # Pause before reading the performance log (newest entry first).
            sleep(random.random() + 1)
            browser_log = bro.get_log('performance')
            browser_log.reverse()

            downurl = traversal_log(browser_log)
            sleep(random.random() + 1)
            # Focus the '另存为' (Save As) window and send Esc to it.
            hw = win32gui.FindWindow(None, '另存为')
            win32gui.SetForegroundWindow(hw)
            # press() sends key-down and key-up; the original keyDown()
            # never released the key.
            pyautogui.press('esc')
            return downurl
        except Exception as e:
            print('等待响应', str(e))
            sleep(5)


def run(v):
    """Open one module page and collect download URLs of missing assets.

    Args:
        v: ``[name, count, href]`` entry; ``v[1]`` is converted with
            ``int()`` and used as the number of download buttons to inspect.

    Returns:
        List of download URLs captured for assets whose normalized name
        (spaces removed, lower-cased) is not a key of ``Assets``.

    NOTE(review): the outer loop only ends once ``int(v[1])`` distinct
    buttons have been seen; if the page shows fewer, it keeps scrolling.
    """
    checked_buttons = []
    downurls = []
    ac = ActionChains(bro)

    print('模块:', v[0])
    bro.implicitly_wait(30)
    # Request the module address.
    print(v[2])
    bro.get(v[2])

    # Keep scrolling until as many buttons were inspected as the module
    # reports.
    while len(checked_buttons) < int(v[1]):
        bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(3)
        # Re-read every download button currently on the page.
        webAssButtons = bro.find_elements(
            By.XPATH, '//span[text()="下载"]/..')
        for button in webAssButtons:
            if button in checked_buttons:
                print('已检查')
                print('num', len(checked_buttons))
                continue

            checked_buttons.append(button)
            sleep(0.05)
            # Asset name shown next to the button.
            webAssName = button.find_element(
                By.XPATH,
                './/../../../div[2]/div/div').get_attribute('textContent')
            webAssName = webAssName.replace(' ', '').lower()

            if webAssName in Assets:
                print(webAssName, '已存在')
            else:
                downurl = _capture_download_url(ac, button, webAssName)
                if downurl is not None:
                    downurls.append(downurl)
                else:
                    # The original appended None, which later broke task().
                    print(webAssName, '未捕获到下载地址')
            print('num', len(checked_buttons))

    print('downurls', len(downurls))
    return downurls

executor = ThreadPoolExecutor(max_workers=1)
pool = ThreadPoolExecutor(max_workers=10)

# Disabled variant that processes every module (was a bare string literal):
# print(MMs)
# #task_list = [executor.submit(run, v) for v in _MM]
# for result in executor.map(run,MMs):
#     print ('result',result)
#
#     for i in result:
#         #j='https://s3.us-west-2.amazonaws.com/adobe-3di-substance-source/7ba51dbd586c6192bebe243dbe86dea1797b5d75?AWSAccessKeyId=AKIAUTBGVDZPARQRGSO7&Expires=1645781824&Signature=Kvuh2jeQck6ww2soEDdPOQA8%2BAQ%3D&response-content-disposition=attachment%3Bfilename%3Dstylized_balloon_paper.sbsar'
#         pool.submit(task,i)

# for i in MMs:
# NOTE(review): only the module at index 10 is processed; this raises
# IndexError when fewer than 11 modules were found.
i = MMs[10]
print('i', i)
# Start once ']' is pressed.
keyboard.wait(']')
p = pool.submit(run, i)
result = p.result()
# Submit one download task per captured URL.
for down_url in result:
    pool.submit(task, down_url)

# Block until '[' is pressed.
keyboard.wait('[')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值