怎么用 Python 爬取百度文库 PPT —— Python 使用 Selenium 爬取百度文库 PPT 并生成 PDF

import re
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class downloader:
    """Save the slide images of a Baidu Wenku document as numbered PNG files.

    An instance drives a Chrome browser through Selenium. Calling the
    instance with a document URL opens the page, walks through its page
    lists, and writes every slide image it finds to ``0.png``, ``1.png``,
    ... in the current working directory.
    """

    # JavaScript used to bring an element into the viewport before it is
    # clicked or inspected.
    _SCROLL_JS = "arguments[0].scrollIntoViewIfNeeded(true);"

    def __init__(self):
        """Start Chrome and initialise the wait helper, counter and regex."""
        self.browser = webdriver.Chrome()
        # Explicit waits give up after 3 seconds.
        self.wait = WebDriverWait(self.browser, 3)
        # Index of the next image file to write.
        self.i = 0
        # Captures the address inside url("...") of an inline CSS style.
        self.pattern = re.compile(r'.*?url\("(.*?)"\)', re.S)

    def __call__(self, url):
        """Download every slide image reachable from *url*.

        Args:
            url: Address of the document page to open.

        The browser is always closed when this method returns or raises.
        """
        try:
            self.download(url)
            while True:
                for link in self.parse_link():
                    self.save(link)
                try:
                    next_button = self.browser.find_element(
                        By.ID, 'next-pageList-1')
                except NoSuchElementException:
                    # No "next page list" control left: assume the last
                    # batch of slides has been handled.
                    # NOTE(review): if the site keeps this element on the
                    # final page the loop will not end here - confirm.
                    break
                self._scroll_to(next_button)
                next_button.click()
        finally:
            self.browser.quit()

    def _scroll_to(self, element):
        """Scroll the page so that *element* is visible."""
        self.browser.execute_script(self._SCROLL_JS, element)

    def download(self, url):
        """Open *url* and click the element that expands the reader.

        Args:
            url: Address of the document page to open.

        Raises:
            TimeoutException: If the element is not present within the
                wait timeout.
        """
        self.browser.get(url)
        # Presumably the "continue reading" control that reveals the rest
        # of the document - verify against the live page.
        submit = self.wait.until(EC.presence_of_element_located((
            By.XPATH,
            '//*[@id="html-reader-go-more"]/div[2]/div[1]/span/span[1]')))
        self._scroll_to(submit)
        submit.click()

    def parse_link(self):
        """Yield the image URL of each slide in the current reader container.

        Yields:
            The first ``url("...")`` address found in the inline style of
            each ``reader-pic-item`` element. Entries without such an
            element, or without a URL in their style, are skipped.
        """
        self.elem = self.wait.until(EC.presence_of_element_located(
            (By.ID, 'reader-container-inner-1')))
        for page in self.elem.find_elements(By.CLASS_NAME, 'bd'):
            self._scroll_to(page)
            # Pause after scrolling before reading the style attribute
            # (presumably to let the image load lazily).
            time.sleep(0.6)
            try:
                picture = page.find_element(By.CLASS_NAME, 'reader-pic-item')
            except NoSuchElementException:
                continue
            style = picture.get_attribute('style') or ''
            matches = self.pattern.findall(style)
            if not matches:
                # Style carries no url("..."); nothing to download.
                continue
            yield matches[0]

    def save(self, link):
        """Fetch *link* and write the response body to ``<counter>.png``.

        Args:
            link: URL of the image to download.

        Raises:
            requests.RequestException: If the request fails or the server
                answers with an error status.
        """
        response = requests.get(link, timeout=30)
        # Do not write an HTTP error page to disk as if it were an image.
        response.raise_for_status()
        with open('{}.png'.format(self.i), 'wb') as f:
            f.write(response.content)
        self.i += 1


if __name__ == '__main__':
    D = downloader()
    D('https://wenku.baidu.com/view/d86fe3436c175f0e7dd13731')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值