基于python中selenium模块完成百度文库pdf文档下载

最新推荐文章于 2022-07-08 14:07:07 发布

Theseus丶

最新推荐文章于 2022-07-08 14:07:07 发布

阅读量424

点赞数 1

文章标签：爬虫 python

本文链接：https://blog.csdn.net/weixin_59505571/article/details/119829385

版权

本文介绍了一个Python爬虫项目，用于绕过百度文库的会员限制下载PDF文档。通过结合requests、selenium和aiohttp库，实现了模拟登录、页面滚动加载以及将文档各页以图片形式下载的过程。代码中包括了登录、页面滑动、提取图片URL以及异步下载图片并保存为jfif格式的步骤。

摘要由CSDN通过智能技术生成

作为一个爬虫新手，近日有百度文库文档的下载需求，但百度文库默认需要会员才可下载文档，于是乎我想到写一个爬虫来获取百度文库中pdf类型的文档，下面是具体的思考过程及代码：

首先考虑用requests模块直接抓取网页源代码，可百度文库的网页源代码中并不直接包含文档内容，于是改用浏览器的“检查”（开发者工具）功能来查找与原文档有关的信息。

可以看到，文档的每一页在百度文库网页中是以图片形式加载出来的，于是可以利用selenium进行处理。

import re
import requests
import time
from selenium.webdriver import Chrome
import asyncio
import aiohttp
import aiofiles

# Shared Chrome WebDriver instance used by every helper below. It is created
# at module level, so the browser is started as soon as this line runs.
web = Chrome()
# Module-level list that get_url() appends its download coroutines to.
tasks = []
def sign_in(username, password):
    """Open the target page and log in to Baidu with an account and password.

    Drives the shared module-level ``web`` browser and reads the page address
    from the module-level ``url`` variable, which the ``__main__`` section
    assigns before calling this function.

    Elements are located with ``find_element('xpath', ...)`` instead of the
    ``find_element_by_xpath`` shortcut: the shortcut was removed in
    Selenium 4.3, while the generic form works on Selenium 3 and 4 alike.

    Args:
        username: Account name typed into the login form.
        password: Password typed into the login form.
    """
    web.get(url)
    time.sleep(4)  # fixed pause so the page can finish rendering

    # Click the login button on the document page.
    login_entry = web.find_element(
        'xpath', '//*[@id="app"]/div[1]/div/div/div[4]/div[4]/div/div[1]')
    login_entry.click()
    time.sleep(3)

    # Switch the dialog to username/password login.
    account_login = web.find_element(
        'xpath', '//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
    account_login.click()
    time.sleep(2)

    # Fill in the credentials and submit the form.
    web.find_element(
        'xpath', '//*[@id="TANGRAM__PSP_11__userName"]').send_keys(f'{username}')
    web.find_element(
        'xpath', '//*[@id="TANGRAM__PSP_11__password"]').send_keys(f'{password}')
    web.find_element('xpath', '//*[@id="TANGRAM__PSP_11__submit"]').click()
    time.sleep(4)  # wait for the login request to complete
def click_continue():
    """Scroll down to the "continue reading" button and click it.

    Operates on the shared module-level ``web`` browser. The button is located
    with ``find_element('xpath', ...)`` because the ``find_element_by_xpath``
    shortcut was removed in Selenium 4.3; the generic form works on
    Selenium 3 and 4 alike.
    """
    # Scroll the page down by a fixed 2050 pixels, to where the button sits.
    web.execute_script('window.scrollBy(0,2050)')
    time.sleep(3)

    continue_button = web.find_element(
        'xpath', '//*[@id="app"]/div[2]/div[1]/div[2]/div[2]/div[1]/div[3]')
    continue_button.click()
    time.sleep(3)  # give the page time to react to the click
def move_to_top():
    """Scroll the shared ``web`` browser window back to the top of the page."""
    web.execute_script("document.documentElement.scrollTop=0")
    # Short fixed pause after resetting the scroll position.
    time.sleep(1)

def scroll_to_bottom(web, *, step=100, pause=0.5, settle=1.5):
    """Scroll a page from top to bottom in small steps so lazy content loads.

    The page only loads content while it is being scrolled, so this walks the
    scroll position down in ``step``-pixel increments. After reaching the
    height that was known at the start of a pass, the height is measured
    again; if the page has grown, scrolling continues from the previous
    bottom until the height stops increasing.

    Args:
        web: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        step: Number of pixels to advance per scroll call. Must be positive.
        pause: Seconds to sleep after each scroll call.
        settle: Seconds to sleep at the end of each pass before the page
            height is measured again.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError('step must be a positive number of pixels')

    # Plain expression: the previous ``return action=...`` form assigned to an
    # undeclared variable and thereby created a global on the page.
    height_script = 'return document.body.scrollHeight'

    position = 0  # scroll offset already covered
    total_height = web.execute_script(height_script)
    while position < total_height:
        for offset in range(position, total_height, step):
            web.execute_script('window.scrollTo(0, {})'.format(offset))
            time.sleep(pause)
        position = total_height
        time.sleep(settle)
        # Re-measure: content loaded during the pass may have extended the page.
        total_height = web.execute_script(height_script)
async def get_url():
    """Extract every page-image URL from the loaded page and download them.

    Scans ``web.page_source`` for image wrappers, numbers the matches from 1
    in document order, and downloads them concurrently through ``download``;
    each image is saved under its page number.

    Differences from the earlier implementation:
      * ``asyncio.gather`` replaces ``asyncio.wait``. Passing bare coroutines
        to ``asyncio.wait`` is rejected from Python 3.11 onwards, and it
        raises ``ValueError`` when there is nothing to wait for.
      * The coroutines live in a local list, so calling this function twice
        never re-awaits coroutines left over from an earlier call.
      * HTML entities in the ``src`` attribute are decoded with
        ``html.unescape`` rather than by deleting the text ``amp;``.
    """
    import html  # local import: only needed for decoding the src attribute

    # Capture the src attribute that follows each page-image wrapper.
    image_pattern = re.compile(
        r'ppt-image-wrap ppt-4-3".*?src="(?P<url>.*?)"', re.S)
    downloads = [
        download(str(page_number), html.unescape(match.group('url')))
        for page_number, match in enumerate(
            image_pattern.finditer(web.page_source), start=1)
    ]
    if not downloads:
        # No images matched; nothing to download.
        return

    # return_exceptions=True keeps the run best-effort: one failed page does
    # not cancel the others, and failures are reported instead of being lost.
    outcomes = await asyncio.gather(*downloads, return_exceptions=True)
    for page_number, outcome in enumerate(outcomes, start=1):
        if isinstance(outcome, Exception):
            print(f'第{page_number}页下载失败：{outcome!r}')
async def download(name,url):
    """Fetch ``url`` with a GET request and write the body to ``<name>.jfif``.

    Args:
        name: File name stem; the file is written as ``<name>.jfif`` relative
            to the current working directory.
        url: Address of the image to fetch.
    """
    target_path = f'{name}.jfif'
    async with aiohttp.request('GET', url) as response:
        payload = await response.read()
        # Write the raw bytes in binary mode.
        async with aiofiles.open(target_path, mode='wb') as image_file:
            await image_file.write(payload)



if __name__ == '__main__':
    # `url` must remain a module-level name: sign_in() reads it as a global.
    url = input('输入想要下载的百度文库地址：')
    username = input('请输入你的百度账号：')  # Baidu account name
    password = input('请输入你的密码：')  # account password
    try:
        sign_in(username, password)  # open the page and log in
        click_continue()  # expand the full document
        scroll_to_bottom(web)  # scroll through so every page image loads
        asyncio.run(get_url())  # collect the image URLs and download them
    finally:
        # Always close the browser and its driver process, even if a step
        # above fails; previously the browser was never shut down.
        web.quit()

效果如下：