Playwright 下载 PDF

FOAF-lambda
已于 2023-07-05 10:16:09 修改
阅读量829
点赞数
文章标签： pdf python
于 2023-03-07 18:34:33 首次发布
本文链接：https://blog.csdn.net/lwdfzr/article/details/129388472
版权
该代码实现了一个 Python 脚本，利用 Playwright 库进行网页交互，包括模拟点击、元素判断和文件下载。主要功能是登录特定网站，查找并点击 PDF 下载链接，然后监控下载状态，最终将文件保存到本地。脚本还通过调用 Windows API 来模拟确认“另存为”对话框的操作。
摘要由CSDN通过智能技术生成
import time

import requests
from playwright.sync_api import Playwright, sync_playwright, expect
from urllib.parse import unquote
import shutil


def get_md5(content):
    """Return the hexadecimal MD5 digest of the text ``content``.

    ``content`` must be a ``str``; it is encoded with ``str.encode()``
    (UTF-8 by default) before being hashed.
    """
    from hashlib import md5

    digest = md5(content.encode())
    return digest.hexdigest()


def save_file():
    """Confirm the native "Save As" dialog by posting an Enter key press.

    Windows-only: uses pywin32 (``win32gui`` / ``win32con``). The dialog is
    looked up by window class "#32770" and title "另存为"; the key-down and
    key-up messages are posted to the first child window of class "Button".
    """
    import win32gui
    import win32con

    dialog = win32gui.FindWindow("#32770", "另存为")
    button = win32gui.FindWindowEx(dialog, None, "Button", None)
    # A full key press is a key-down followed by a key-up.
    for message in (win32con.WM_KEYDOWN, win32con.WM_KEYUP):
        win32gui.PostMessage(button, message, win32con.VK_RETURN, 0)

def js_element_click(driver, js_path, type=1):
    """Click the first element matching a CSS selector via injected JavaScript.

    Args:
        driver: Object exposing ``execute_script(script)``.
        js_path: CSS selector handed to ``document.querySelector``.
        type: ``1`` embeds the selector in a double-quoted JS string (any
            double quotes in the selector become single quotes); any other
            value embeds it in a single-quoted JS string (single quotes in
            the selector become double quotes).

    Nothing is clicked when no element matches.
    """
    if type == 1:
        # Selector goes inside "...", so it must not contain double quotes.
        selector = js_path.replace('"', "'")
        script = '''var temp=document.querySelector("%s");if(temp){temp.click()}''' % selector
    else:
        # Selector goes inside '...', so it must not contain single quotes.
        selector = js_path.replace("'", '"')
        script = '''var temp=document.querySelector('%s');if(temp){temp.click()}''' % selector
    driver.execute_script(script)

def str_to_num(strings):
    """Extract the first number found in ``strings`` and strip its commas.

    Args:
        strings: Any value; it is converted with ``str()`` before searching.

    Returns:
        * The first numeric token as a ``str`` with thousands separators
          removed (e.g. ``"1,234.5元"`` -> ``"1234.5"``).
        * The integer ``0`` when no number is found and ``strings`` is falsy
          (``""``, ``None``, ``[]`` ...).
        * ``str(strings)`` unchanged when no number is found otherwise.

    The mixed ``int``/``str`` return type is kept for backward compatibility.
    """
    import re

    # Raw string: a plain literal containing "\d" / "\." is an invalid escape
    # sequence and triggers a warning on current Python versions.
    match = re.search(r'(-?\d*,?\d*,?\d*,?\d*,?\d+\.?\d*)', str(strings))
    if match:
        return match.group(1).replace(',', '')
    if not strings:
        return 0
    return str(strings)

def judge_element_exist(page, js_path, type=1):
    """Report whether the page contains an element matching a CSS selector.

    Args:
        page: Object exposing ``evaluate(script)``.
        js_path: CSS selector handed to ``document.querySelector``.
        type: ``1`` embeds the selector in a double-quoted JS string (double
            quotes in the selector become single quotes); any other value
            embeds it in a single-quoted JS string, which allows attribute
            matches such as ``'div.gh-menu > a[href*="/mys/home"]'``.

    Returns:
        Whatever ``page.evaluate`` returns for the check script; the value is
        also printed.
    """
    if type == 1:
        # Selector goes inside "...", so it must not contain double quotes.
        selector = js_path.replace('"', "'")
        script = '''var temp = document.querySelector("%s");if(temp){(function(){return true})()}else{(function(){return false})()}''' % selector
    else:
        # Selector goes inside '...', so it must not contain single quotes.
        selector = js_path.replace("'", '"')
        script = '''var temp = document.querySelector('%s');if(temp){(function(){return true})()}else{(function(){return false})()}''' % selector
    found = page.evaluate(script)
    print('result:', found)
    return found


def get_pdf_down_url(url_tiao):
    """Resolve a medlive full-text redirect link into a PDF download URL.

    The redirect link is requested without following redirects; the target
    is read from the ``Location`` response header and rewritten by replacing
    ``doi`` with ``doi/pdf`` and appending ``?download=true``.

    Args:
        url_tiao: Redirect link of the form
            ``https://guide.medlive.cn/guideline/full_text_link_redirect.php?l=...&t=...``.

    Returns:
        The rewritten PDF download URL.

    Raises:
        KeyError: The response carried no ``Location`` header.
        requests.RequestException: The HTTP request failed or timed out.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'guide.medlive.cn',
        'Referer': 'https://guide.medlive.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36',
    }
    # requests never times out by default; bound the wait so an unresponsive
    # server cannot hang the whole script.
    res = requests.get(url_tiao, headers=headers, allow_redirects=False, timeout=30)
    print(res.status_code)
    location_url = res.headers.get('Location')
    if location_url is None:
        # Same exception type as a plain header lookup, with a usable message.
        raise KeyError(f'Location header missing (HTTP {res.status_code}) for {url_tiao}')
    print('location_url:', location_url)
    # NOTE(review): every occurrence of "doi" is replaced, including one in a
    # host name such as doi.org - confirm against the real redirect targets.
    pdf_url = str(location_url).replace('doi', 'doi/pdf') + '?download=true'
    print('pdf_url-------', pdf_url)
    return pdf_url


def get_pdf_down_status(page, pdf_url, pdf_save_path):
    """Check the browser's download list and copy a finished PDF to a target path.

    A script is evaluated in ``page`` that walks the ``downloads-manager``
    shadow DOM and returns, for each download item, its icon ``src``, link
    ``href``, description text and tag text. (Presumably ``page`` must be
    showing ``chrome://downloads`` - verify against the caller.)

    Args:
        page: Object exposing ``evaluate(script)``.
        pdf_url: URL that was downloaded; only its first 50 characters are
            compared with each item's link.
        pdf_save_path: Destination passed to ``shutil.copy``.

    Returns:
        ``True`` if at least one matching, successfully finished download was
        copied to ``pdf_save_path``; ``False`` otherwise.
    """
    collect_js = '''
            (function(){
                var temps = document.querySelector("body > downloads-manager").shadowRoot.querySelectorAll("downloads-item[id*='frb']");
                var dics = [];
                for(var temp=0;temp<temps.length;temp++){
                    var down_path = temps[temp].shadowRoot.querySelector("#file-icon").getAttribute('src');
                    var file_link = temps[temp].shadowRoot.querySelector("#file-link").getAttribute('href');
                    var cancel = temps[temp].shadowRoot.querySelector("#description").textContent;
                    var fail = temps[temp].shadowRoot.querySelector("#tag");
                    if(fail){fail=fail.textContent}else{fail=false}
                    dics.push({"down_path":down_path,"file_link":file_link, "cancel":cancel, "fail":fail})
                }
                return dics;
            })()
        '''
    copied = False
    for entry in page.evaluate(collect_js):
        # Drop everything from the first "&" before comparing the URL prefix.
        link = entry['file_link'].partition('&')[0]
        if link[:50] != pdf_url[:50]:
            continue
        # The icon URL carries the local file path in its "path=" parameter.
        icon_src = unquote(entry['down_path'])
        local_path = icon_src.rpartition('path=')[2].partition('&')[0]
        description = entry['cancel']
        failure = entry['fail']
        print('down_path:', local_path)
        print('file_link:', link)
        print('cancel:', entry['cancel'])
        print('fail:', entry['fail'])
        # A non-blank description means the download has not completed; a
        # truthy tag means it failed or the file was removed.
        leftover = str(description).replace(' ', '').replace("\n", '')
        if not (leftover or failure):
            print('下载成功')
            # Copy the file from the browser's download location to the target.
            shutil.copy(local_path, pdf_save_path)
            copied = True
    return copied


def username_login(page):
    """Log in to medlive through the username/password form.

    Opens the login page (whose ``service`` parameter points back to
    guideline 3896), switches to the "电脑登录" tab, clicks and fills the
    username and password inputs, submits the form, then waits 3 seconds.

    Both inputs are filled with empty strings here; real credentials have to
    be supplied before this can log in.
    """
    login_url = "http://www.medlive.cn/auth/login?service=https%3A%2F%2Fguide.medlive.cn%2Fguideline%2F3896"
    page.goto(login_url)
    page.get_by_text("电脑登录").click()
    for selector in ("#username", "#showPassword"):
        page.locator(selector).click()
        page.locator(selector).fill("")
    page.get_by_role("button", name="登录").click()
    # Fixed pause after submitting the login form.
    time.sleep(3)


def _download_pdf_via_redirect(page):
    """Download method 2: follow the full-text redirect and save via Chrome's own downloader.

    NOTE: ``run`` does not call this helper. In the original script this
    logic sat behind an unconditional ``return`` and never executed; it is
    preserved here so it can be re-enabled deliberately. The interactive
    ``input('stop')`` debugging pause that preceded the navigation has been
    dropped.
    """
    data_l = page.evaluate('(function(){return document.querySelector("div.one_info_L>span.icon-card1.full_text_link").getAttribute("data-l")})()')
    data_t = page.evaluate('(function(){return document.querySelector("div.one_info_L>span.icon-card1.full_text_link").getAttribute("data-t")})()')
    print('data_l:', data_l)
    print('data_t:', data_t)
    # The redirect URL needs both the data-l and data-t attributes.
    url_tiao = f'https://guide.medlive.cn/guideline/full_text_link_redirect.php?l={data_l}&t={data_t}'
    print('url_tiao---------', url_tiao)
    pdf_url = get_pdf_down_url(url_tiao)
    print('pdf_url:', pdf_url)
    try:
        page.goto(pdf_url, timeout=100000, wait_until="domcontentloaded")
    except Exception as exc:
        # Best effort: navigation errors are reported instead of silently
        # swallowed, and the flow continues as before.
        print('goto pdf_url failed:', exc)

    # Confirm the native "Save As" dialog.
    save_file()
    time.sleep(5)
    # Open the browser's downloads page so the download state can be read.
    try:
        page.goto('chrome://downloads', timeout=100000, wait_until="domcontentloaded")
    except Exception as exc:
        print('goto chrome://downloads failed:', exc)
    time.sleep(10)

    # Poll up to five times; copy the finished file into the current directory.
    file_name = 'test.pdf'
    pdf_save_path = f'./{file_name}'
    for _ in range(5):
        time.sleep(5)
        if get_pdf_down_status(page, pdf_url, pdf_save_path):
            break
        page.reload()


def _download_guideline_pdf(context, url):
    """Open ``url`` in a new page, make sure a user is logged in and download its PDF."""
    page = context.new_page()
    page.goto(url)

    # Cookie-injection login: clicking the download button requires a session.
    if judge_element_exist(page, 'div.pdf_list div[class*="pdf_btn"]>a', 2):
        context.add_cookies([{
            'name': 'sess',
            'value': '',
            'domain': 'guide.medlive.cn',
            'path': '/',
            'expires': -1,
            'httpOnly': False,
            'secure': False,
            'sameSite': 'Lax',
        }])
        time.sleep(1)
        page.goto(url)
        time.sleep(3)

    # Check whether the cookie is still valid; fall back to the login form.
    page.reload()
    icon_login = judge_element_exist(page, "li.icon.login>a", 1)
    icon_user = judge_element_exist(page, "li#get_icon_user_width>a", 1)
    if icon_login or not icon_user:
        username_login(page)

    cookies = context.cookies()
    print("cookies", cookies)

    if not judge_element_exist(page, 'div.pdf_list div[class*="pdf_btn"]>a', 2):
        # Download method 2 (_download_pdf_via_redirect) was disabled in the
        # original script and stays disabled: nothing is downloaded here.
        return

    # Download method 1: click the on-page download button (requires login).
    onclick = page.evaluate('(function(){return document.querySelector("#_article_viewer_1 > div > div.pdf_btn > a").getAttribute("onclick")})()')
    down_text = page.evaluate('(function(){return document.querySelector("#_article_viewer_1 > div > div.pdf_btn > a").textContent})()')
    print('onclick:', onclick)
    print('down_text:', down_text)
    # getAttribute returns null when the attribute is absent, which arrives
    # here as None; treat that as an empty string instead of crashing on "in".
    if 'download(' in (onclick or '') or '下载' in (down_text or ''):
        page.get_by_role("link", name="下载").click()
    else:
        print(f'{url} 不存在PDF下载链接')
        return
    time.sleep(0.5)
    # Tick the agreement checkbox and press "next" in the pop-up mask.
    page.evaluate('document.querySelector("div.tipMask-checkBtn").click()')
    page.evaluate('document.querySelector("div.tipMask-btm.clearfix > div.tipMask-btnNext").click()')

    # The click opens a popup and starts the download; wait for both.
    with page.expect_download() as download_info:
        with page.expect_popup():
            page.get_by_role("link", name="下载").click()
    download = download_info.value
    page.wait_for_timeout(3000)
    # The saved file is named after the MD5 of the guideline URL.
    pdf_oss_name = get_md5(f'{url}') + '.pdf'
    print('url地址-----------', url)
    print('pdf_oss_name-----------', pdf_oss_name)
    download.save_as(f'./{pdf_oss_name}')


def run(playwright: Playwright) -> None:
    """Launch local Chrome, log in to medlive and download one guideline PDF.

    The PDF of ``https://guide.medlive.cn/guideline/3896`` is saved in the
    current directory as ``<md5 of the URL>.pdf``. When the page has no
    download button, nothing is downloaded. The browser context and browser
    are always closed before returning, including on errors.

    Args:
        playwright: The object yielded by ``sync_playwright()``.
    """
    # Raw string: the backslashes in the Windows path are literal characters.
    executable_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    browser = playwright.chromium.launch(executable_path=executable_path, headless=False)
    try:
        context = browser.new_context()
        try:
            # Other guideline pages used while testing:
            #   https://guide.medlive.cn/guideline/26790
            #   https://guide.medlive.cn/guideline/26559
            #   https://guide.medlive.cn/guideline/27086
            url = 'https://guide.medlive.cn/guideline/3896'
            _download_guideline_pdf(context, url)
        finally:
            context.close()
    finally:
        browser.close()


# Script entry point: start Playwright's sync API and run the download flow.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also launches the browser.
with sync_playwright() as playwright:
    run(playwright)