python通过url下载文件不可读_python-selenium实现的简易下载器,及常见错误的解决

简易下载器的实现

支持代理、失败重试、确保包含指定ID元素(可根据需求自定义修改)

# coding: utf-8

from Utils import logging

from bs4 import BeautifulSoup as bs

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.common.proxy import ProxyType

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.support.ui import WebDriverWait

class HtmlDownloader:
    """Small Selenium/PhantomJS page downloader with proxy and retry support.

    A page counts as downloaded only once an element with a caller-supplied
    ID is present in the DOM. The result is returned either as the raw page
    source or as a BeautifulSoup tree.
    """

    def __init__(self):
        """Create the PhantomJS driver used for all downloads."""
        self.driver = webdriver.PhantomJS()

    def _restartSession(self, proxy):
        """Start a new WebDriver session whose capabilities carry ``proxy``."""
        # Work on a copy: DesiredCapabilities.PHANTOMJS is a shared class-level
        # dict, and writing the proxy into it would leak the setting into
        # every other driver created from it.
        capabilities = dict(webdriver.DesiredCapabilities.PHANTOMJS)
        proxy.add_to_capabilities(capabilities)
        self.driver.start_session(capabilities)

    def setProxy(self, proxyStr):
        """Open a new session that sends HTTP traffic through ``proxyStr``.

        ``proxyStr`` is assigned to the proxy's ``http_proxy`` field as is.
        """
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.MANUAL
        proxy.http_proxy = proxyStr
        self._restartSession(proxy)

    def rmProxy(self):
        """Open a new session with the proxy type set to ``ProxyType.DIRECT``."""
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.DIRECT
        # The original called ``browser.start_session``; ``browser`` is not
        # defined anywhere, so the driver owned by this instance is used.
        self._restartSession(proxy)

    def download(self, returnType, url, ensureId, proxyStr=None):
        """Load ``url`` and return the page once element ``ensureId`` exists.

        Args:
            returnType: ``"html"`` for the raw page source, ``"bs"`` for a
                BeautifulSoup tree parsed with ``lxml``.
            url: Address to load.
            ensureId: ID of an element that must be present (waited for up to
                30 seconds) before the page is considered downloaded.
            proxyStr: Optional HTTP proxy; a falsy value selects a direct
                connection.

        Returns:
            The page in the requested form, or ``None`` when the wait or the
            conversion fails (the error is logged). Errors raised while
            starting the session or by ``driver.get`` itself are not caught
            here and propagate to the caller.
        """
        if proxyStr:
            self.setProxy(proxyStr)
        else:
            self.rmProxy()

        self.driver.get(url)

        # special for xxx.com
        # your code here

        # ensure for some element
        try:
            WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located((By.ID, ensureId)))

            pageSource = self.driver.page_source
            if returnType == "html":
                downloadResult = pageSource
            elif returnType == "bs":
                downloadResult = bs(pageSource, 'lxml')
            else:
                # Previously this fell through to an UnboundLocalError; fail
                # with a message that names the actual problem instead.
                raise ValueError("unsupported returnType: %r" % (returnType,))

            logging("i", "download %s bytes" % len(pageSource))
            return downloadResult
        except Exception as e:
            logging("e", str(e))
            return None
        finally:
            # Each call starts a fresh session through setProxy/rmProxy, so
            # the window opened for this call is closed on every exit path.
            self.driver.close()

    def safeDownload(self, returnType, url, ensureId, proxyStr=None,
                     maxAttempts=5):
        """Call :meth:`download` repeatedly until it returns a truthy result.

        Args:
            returnType: Forwarded to :meth:`download`.
            url: Forwarded to :meth:`download`.
            ensureId: Forwarded to :meth:`download`.
            proxyStr: Forwarded to :meth:`download`.
            maxAttempts: Number of failed attempts after which to give up
                (default 5).

        Returns:
            The first truthy download result, or the last falsy result once
            ``maxAttempts`` attempts have failed.
        """
        failTimes = 0
        while True:
            downloadResult = self.download(returnType, url, ensureId, proxyStr)
            if downloadResult:
                return downloadResult

            failTimes += 1
            if failTimes >= maxAttempts:
                logging("w", "failed %s times, will abort" % failTimes)
                return downloadResult
            logging("w", "failed %s times, will retry" % failTimes)

元素不可见导致不能操作的错误

# ElementNotVisibleException: Message: {"errorMessage":"Element is not currently visible and may not be manipulated"

# Screenshot: available via screen

首先尝试设定窗口大小

# First attempt: give the browser window an explicit 1024x768 size.
self.driver.set_window_size(1024, 768)

不行的话再尝试滚动页面,如滚动到底部:

# If resizing is not enough, scroll the page, here all the way to the bottom.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值