import os
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
# Browser-like User-Agent header; it is sent with the image downloads that
# are issued through requests further down.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
}
# Folder that receives the downloaded images.
path = r"E:\honyuping\c"
# Address of the preview page that is opened in the browser.
url = 'https://xxxxxxxxx.com/xxxx/xxx/01126.shtm'

# Start Chrome and open the page, pause four seconds (presumably to let the
# page finish its initial load), then maximize the window.
driver = webdriver.Chrome()
driver.get(url)
time.sleep(4)
driver.maximize_window()
list1 = []  # unique image URLs, in the order they were first seen
seen_urls = set()  # the same URLs, kept for O(1) duplicate checks

# Scroll the preview down so the page images get loaded.  The connection is
# slow, so the script scrolls ahead of what is needed; once an image link has
# loaded it stays in the page and is not hidden again, which is why every
# round can simply re-scan all items.  There is no end-of-document check:
# the loop always runs its 100 rounds.
try:
    for _ in range(100):
        try:
            for d in range(15):
                print(d)
                driver.execute_script("window.scrollBy(0,1000)")
                # NOTE(review): the source lost its indentation; the pause is
                # assumed to follow every scroll step so each viewport gets
                # time to load -- confirm against the intended timing.
                time.sleep(10)
            # find_elements(By...) replaces the find_elements_by_* helpers,
            # which no longer exist in Selenium 4.3+.
            pictureurl = driver.find_elements(By.CSS_SELECTOR, "[class='webpreview-item']")
            for p in pictureurl:
                urlp = p.find_element(By.TAG_NAME, "img").get_attribute('src')
                print(urlp)
                # get_attribute() returns None when the <img> has no src;
                # such entries are skipped instead of being queued for download.
                if urlp and urlp not in seen_urls:
                    seen_urls.add(urlp)
                    list1.append(urlp)
        except WebDriverException as exc:
            # Best effort: a failed round is reported and the next round
            # re-scans the page.  Ctrl-C is no longer swallowed.
            print(f"scroll round failed, continuing with the next round: {exc}")
finally:
    # The browser is not used after this point; close it even on errors.
    driver.quit()
print(len(list1))
print(list1)
# Download every collected image into the target folder.
# The folder is created first so a missing directory does not abort the run.
os.makedirs(path, exist_ok=True)
for s in list1:
    try:
        r = requests.get(s, headers=headers, timeout=5)
        # Do not store an HTTP error page under a .png name.
        r.raise_for_status()
    except requests.RequestException as exc:
        # One failed image is reported and skipped instead of ending the run.
        print(f"download failed, skipping {s}: {exc}")
        continue
    # Name the image after the current time in milliseconds; the names then
    # sort in download order.
    with open(f'{path}/{int(time.time() * 1000)}.png', 'wb') as f:
        # Write the image bytes into the folder.
        f.write(r.content)
    # Pause one second between downloads.
    time.sleep(1)
# Convert the downloaded images into a PDF
import glob
import fitz
import os
def c_pdf():
    """Merge the files in ``E:/honyuping/c`` into a single ``c.pdf``.

    Every file in the folder is opened with PyMuPDF in file-name order,
    converted to PDF and appended to one output document.  The result is
    saved as ``c.pdf`` in the current working directory, replacing an
    existing file of that name.  When the folder yields no pages, nothing is
    written and an existing ``c.pdf`` is left untouched.
    """
    doc = fitz.open()
    try:
        # Read the images sorted by file name.
        for img in sorted(glob.glob("E:/honyuping/c/*")):
            print(img)
            # Build a PDF from the image (a single page per image) and
            # release the image document as soon as the bytes exist.
            with fitz.open(img) as imgdoc:
                pdfbytes = imgdoc.convert_to_pdf()
            with fitz.open("pdf", pdfbytes) as imgpdf:
                doc.insert_pdf(imgpdf)
        if len(doc) == 0:
            # Saving a document without pages fails; bail out before the
            # previous c.pdf gets deleted.
            print("no images found, c.pdf was not written")
            return
        if os.path.exists("c.pdf"):
            os.remove("c.pdf")
        doc.save("c.pdf")
    finally:
        doc.close()
# Build the PDF only when this file is executed as a script.
if __name__ == "__main__":
    c_pdf()