import re
import img2pdf
from selenium import webdriver
import os
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from lxml import etree
class Daoke_88(object):
    """Scrape a Doc88 document page by page with Selenium and merge it into a PDF.

    Workflow: ``Daoke_88()`` opens Chrome on ``self.url``; ``main()`` expands
    the document and triggers one PNG download per rendered page canvas;
    ``save_image_data()`` merges the downloaded PNGs into a single PDF.
    """

    # Folder (under the current working directory) used both as Chrome's
    # download target and as the input/output folder of the PDF merge.
    DOWNLOAD_DIR_NAME = '百度图片'
    PDF_NAME = 'testpdf1.pdf'

    # File names produced by the injected download() JavaScript below.
    _PAGE_FILE_RE = re.compile(r'^page_(\d+)\.png$')

    # Browser-side helper: saves the canvases with ids page_<from>..page_<to>
    # as page_NN.png and stops at the first id that does not exist.
    _DOWNLOAD_JS = """function download(from, to) {
    for (i = from; i <= to; i++) {
        const pageCanvas = document.getElementById('page_' + i);
        if (pageCanvas === null) break;
        pageNo_ = i >= 10 ? ''+i:'0'+i;
        const pageNo = pageNo_;
        pageCanvas.toBlob(
            blob => {
                const anchor = document.createElement('a');
                anchor.download = 'page_' + pageNo + '.png';
                anchor.href = URL.createObjectURL(blob);
                anchor.click();
                URL.revokeObjectURL(anchor.href);
            }
        );
    }
};
"""

    def __init__(self):
        """Prepare the download folder, start Chrome and open the document."""
        self.url = "https://www.doc88.com/p-3965042236143.html"  # document to scrape

        # One absolute folder shared by the browser and save_image_data().
        self.download_dir = os.path.join(os.getcwd(), self.DOWNLOAD_DIR_NAME)
        os.makedirs(self.download_dir, exist_ok=True)

        # The preferences must be set BEFORE the driver is created and the
        # options object must be handed to it, otherwise they have no effect.
        self.options = Options()
        self.chromeOptions = self.options  # legacy attribute name, kept for compatibility
        self.options.add_experimental_option(
            "prefs", {"download.default_directory": self.download_dir})

        self.driver = webdriver.Chrome(options=self.options)
        self.driver.get(self.url)

    @staticmethod
    def _parse_page_count(text):
        """Return the integer formed by all digits in *text*.

        :raises ValueError: if *text* contains no digit.
        """
        digits = re.findall(r"\d", text)
        if not digits:
            raise ValueError("no page count found in %r" % (text,))
        return int("".join(digits))

    @classmethod
    def _sorted_page_files(cls, names):
        """Return the ``page_<N>.png`` entries of *names*, ordered by N.

        Anything else (for example a PDF written by an earlier run) is ignored.
        """
        numbered = []
        for name in names:
            match = cls._PAGE_FILE_RE.match(name)
            if match:
                numbered.append((int(match.group(1)), name))
        return [name for _, name in sorted(numbered)]

    def get_page_data(self):
        """Read the page count from the "continue" banner, expand the document
        and start downloading the pages.

        :raises IndexError: if the page-count label is not present in the page.
        :raises ValueError: if the label contains no digits.
        """
        # Wait (up to 5 s) for the "continue reading" button first, so the
        # page source parsed below is taken after the banner has rendered.
        button = WebDriverWait(self.driver, 5).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//div[@id='continueButton']")))

        html = etree.HTML(self.driver.page_source)
        labels = html.xpath('//*[@id="continue_page"]/div/p[1]/text()')
        if not labels:
            raise IndexError("page-count label not found under #continue_page")
        page_num = self._parse_page_count(labels[0])

        button.click()
        self.load_image_data(page_num)

    def load_image_data(self, page_num, scroll_step=3000, pause=1):
        """Scroll through the document and download one page image per step.

        :param page_num: page count read from the page; scrolling passes are
            repeated until more than this many download requests were issued.
        :param scroll_step: vertical distance in pixels between two stops.
        :param pause: seconds to wait after each scroll so content can load.
        :raises ValueError: if *scroll_step* is not positive.
        :raises RuntimeError: if the page reports no scrollable height.
        """
        if scroll_step <= 0:
            raise ValueError("scroll_step must be a positive number of pixels")

        scroll_height_js = "return action=document.body.scrollHeight"
        page = 0
        while page <= page_num:
            total_height = self.driver.execute_script(scroll_height_js)
            if not total_height or total_height <= 0:
                # An empty range below would never advance `page` and the
                # outer loop would spin forever.
                raise RuntimeError("page reports no scrollable height")

            for offset in range(0, total_height, scroll_step):
                page += 1
                self.driver.execute_script('window.scrollTo(0, {})'.format(offset))
                # Pause after every move so the newly visible part can load.
                time.sleep(pause)
                # Chrome may ask once for permission to download multiple
                # files; that prompt has to be confirmed by hand.
                # Requests for ids that do not exist are no-ops: the script
                # breaks on a missing canvas.
                self.driver.execute_script(
                    self._DOWNLOAD_JS + f"download({page}, {page})")

    def save_image_data(self):
        """Merge the downloaded page images into a single PDF.

        The PDF is written to ``<download folder>/testpdf1.pdf``.

        :raises FileNotFoundError: if the folder is missing or holds no
            ``page_<N>.png`` file.
        """
        page_files = self._sorted_page_files(os.listdir(self.download_dir))
        if not page_files:
            raise FileNotFoundError(
                "no page_<N>.png images found in %s" % self.download_dir)

        # Numeric ordering keeps the PDF pages in document order.
        image_paths = [os.path.join(self.download_dir, name) for name in page_files]
        print(image_paths)

        with open(os.path.join(self.download_dir, self.PDF_NAME), mode='wb') as f:
            f.write(img2pdf.convert(image_paths))

    def main(self):
        """Make sure the download folder exists, then scrape the document."""
        os.makedirs(self.download_dir, exist_ok=True)
        self.get_page_data()
if __name__ == '__main__':
    # Script entry point: scrape the document, then merge the images to a PDF.
    scraper = Daoke_88()
    try:
        scraper.main()
        scraper.save_image_data()
    finally:
        # Always end the browser session, even when scraping or merging fails,
        # so no Chrome/chromedriver process is left behind.
        scraper.driver.quit()
# NOTE: the file paths used in this script may need adjusting for your environment before use.
# Source article: scraping Doc88 (道客巴巴) document files with Selenium
# Latest recommended article published 2024-07-30 11:11:19