selenium爬取道客巴巴文档文件

最新推荐文章于 2024-07-30 11:11:19 发布

才不是小emo的小杨

最新推荐文章于 2024-07-30 11:11:19 发布

阅读量474

点赞数

文章标签： selenium python 测试工具

本文链接：https://blog.csdn.net/xiaoyang01234/article/details/132545173

版权

import re
import img2pdf
from selenium import webdriver
import os
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from lxml import etree


class Daoke_88(object):
    """Download a doc88.com document page by page and merge the pages into a PDF.

    The document is rendered in the browser as one ``<canvas id="page_N">`` per
    page.  Each canvas is exported as ``page_NN.png`` through a browser download
    into ``<cwd>/百度图片``, and the PNGs are then merged into ``testpdf1.pdf``
    in the same folder.

    Attributes:
        url: Address of the document to scrape.
        download_dir: Absolute path of the folder the browser downloads into.
        chromeOptions: Unused options object, kept for backward compatibility.
        options: Chrome options actually passed to the driver.
        driver: The running Chrome WebDriver with ``url`` already loaded.
    """

    # Folder (inside the current working directory) that receives the page images.
    DOWNLOAD_DIR_NAME = '百度图片'
    # File name of the merged PDF written into the download folder.
    PDF_NAME = 'testpdf1.pdf'
    # Only files named exactly like the exported pages are merged; this skips
    # the generated PDF itself and any unrelated files.
    _PAGE_FILE_RE = re.compile(r'page_(\d+)\.png')

    # arguments[0] = page number, arguments[1] = fallback scroll offset in px.
    _SCROLL_JS = """
        const canvas = document.getElementById('page_' + arguments[0]);
        if (canvas !== null) {
            canvas.scrollIntoView();
        } else {
            window.scrollTo(0, arguments[1]);
        }
    """

    # arguments[0] = page number. Returns true when a download was started,
    # false when the page canvas does not exist.
    _DOWNLOAD_JS = """
        const pageNumber = arguments[0];
        const pageCanvas = document.getElementById('page_' + pageNumber);
        if (pageCanvas === null) return false;
        const pageNo = pageNumber >= 10 ? '' + pageNumber : '0' + pageNumber;
        pageCanvas.toBlob(blob => {
            if (blob === null) return;
            const anchor = document.createElement('a');
            anchor.download = 'page_' + pageNo + '.png';
            anchor.href = URL.createObjectURL(blob);
            anchor.click();
            URL.revokeObjectURL(anchor.href);
        });
        return true;
    """

    def __init__(self):
        """Configure the download folder, start Chrome and open the document."""
        self.url = "https://www.doc88.com/p-3965042236143.html"  # document to scrape
        self.download_dir = os.path.join(os.getcwd(), self.DOWNLOAD_DIR_NAME)
        os.makedirs(self.download_dir, exist_ok=True)
        self.chromeOptions = webdriver.ChromeOptions()  # unused; kept for compatibility
        self.options = Options()
        # Preferences must be set BEFORE the browser is started, and the options
        # object must be handed to the driver, otherwise they have no effect.
        self.options.add_experimental_option("prefs", {
            "download.default_directory": self.download_dir,
            # Allow several downloads in a row without the manual confirmation
            # prompt Chrome otherwise shows.
            "profile.default_content_setting_values.automatic_downloads": 1,
        })
        self.driver = webdriver.Chrome(options=self.options)  # launch Chrome
        self.driver.get(self.url)

    @staticmethod
    def _parse_page_count(text):
        """Return the integer formed by all digits in *text*.

        Raises:
            ValueError: if *text* contains no digit.
        """
        digits = "".join(re.findall(r"\d", text))
        if not digits:
            raise ValueError("no page count found in {!r}".format(text))
        return int(digits)

    @classmethod
    def _sorted_page_files(cls, directory):
        """Return full paths of the ``page_<n>.png`` files in *directory*, ordered by n."""
        pages = {}
        for name in os.listdir(directory):
            match = cls._PAGE_FILE_RE.fullmatch(name)
            if match:
                pages[int(match.group(1))] = os.path.join(directory, name)
        return [pages[number] for number in sorted(pages)]

    def _wait_for_downloads(self, expected, timeout=30, poll=0.5):
        """Poll the download folder until *expected* page images are present.

        Returns:
            True if the images showed up within *timeout* seconds, else False.
        """
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if len(self._sorted_page_files(self.download_dir)) >= expected:
                return True
            time.sleep(poll)
        return False

    def get_page_data(self):
        """Read the page count, expand the document and download every page.

        Raises:
            IndexError: if the page-count element is not found in the page.
            ValueError: if the page-count text contains no number.
        """
        text = self.driver.page_source  # current DOM serialized as HTML
        html = etree.HTML(text)  # parse into an element tree that supports XPath
        nodes = html.xpath('//*[@id="continue_page"]/div/p[1]/text()')
        if not nodes:
            raise IndexError("page-count element '#continue_page' not found")
        page_num = self._parse_page_count(nodes[0])
        # Wait up to 5 seconds for the "continue reading" button to become visible.
        element = WebDriverWait(self.driver, 5).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//div[@id='continueButton']")))
        element.click()  # expand the rest of the document
        self.load_image_data(page_num)

    def load_image_data(self, page_num, scroll_step=3000, pause=1):
        """Scroll to each page and trigger a PNG download of its canvas.

        Args:
            page_num: Number of pages to download (pages 1..page_num).
            scroll_step: Fallback vertical offset per page, in pixels, used
                when the page canvas is not in the DOM yet.
            pause: Seconds to wait after scrolling so the page can load.
        """
        started = 0
        for page in range(1, page_num + 1):
            # Bring the page into view so the site loads it.
            self.driver.execute_script(
                self._SCROLL_JS, page, (page - 1) * scroll_step)
            time.sleep(pause)
            if self.driver.execute_script(self._DOWNLOAD_JS, page):
                started += 1
            else:
                print("page {} is not available, skipped".format(page))
        # Downloads are asynchronous; give them time to land on disk before
        # the caller goes on to merge the images.
        if not self._wait_for_downloads(started):
            print("timed out waiting for {} page image(s)".format(started))

    def save_image_data(self):
        """Merge the downloaded page images into a single PDF, in page order.

        Raises:
            FileNotFoundError: if the download folder contains no page images.
        """
        files = self._sorted_page_files(self.download_dir)
        print(files)
        if not files:
            raise FileNotFoundError(
                "no page_<n>.png files found in {}".format(self.download_dir))
        pdf_path = os.path.join(self.download_dir, self.PDF_NAME)
        with open(pdf_path, mode='wb') as f:
            f.write(img2pdf.convert(files))

    def main(self):
        """Make sure the download folder exists, then download all pages."""
        os.makedirs(self.download_dir, exist_ok=True)
        self.get_page_data()


if __name__ == '__main__':
    func = Daoke_88()
    try:
        func.main()
        func.save_image_data()
    finally:
        # Always close the browser, even when scraping or merging fails;
        # otherwise the Chrome/chromedriver processes are left running.
        func.driver.quit()
    # NOTE: the download/output paths may need adjusting for your environment.