下载原创力付费文档—滑动式
一、项目需求:
从目标网址下载付费文档,并保存为PDF形式
目标网址:https://max.book118.com/index.php?g=Home&m=NewView&a=index&aid=8103000063001125&v=20200819
二、思路
- 1.利用selenium驱动浏览器逐页跳转,触发页面图片的异步加载,获取图片url
- 2.爬取图片
- 3.将图片写进word文档
- 4.将word文档转化成PDF
三、技术点
- 1.python + selenium自动化
- 2.python + requests(下载图片)
- 3.python + python-docx(导入名为docx,用于生成word文档)
- 4.python + pywin32(调用本机Word将文档导出为PDF)
四、环境
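python3.6 + selenium + requests + python-docx + pywin32(另需本机已安装Chrome浏览器及对应版本的chromedriver;pywin32通过COM调用Microsoft Word,因此仅适用于已安装Word的Windows环境)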
安装(推荐使用清华源):
pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install requests -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install python-docx -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install pywin32 -i https://pypi.tuna.tsinghua.edu.cn/simple/
五、代码
import time
from selenium import webdriver
from selenium.webdriver.common import keys
import requests
from docx import Document
from docx.shared import Inches
from win32com.client import gencache
from win32com.client import constants, gencache
def createword(url='https://max.book118.com/index.php?g=Home&m=NewView&a=index&aid=8103000063001125&v=20200819',
               page_count=44, output_path="学术英语.docx", load_wait=10):
    """Download every preview page image of a document and collect them in a .docx file.

    A Chrome window is driven through the preview page by page: the page
    number is typed into the viewer's page-number input, the script waits for
    the page image to load, reads the image URL from the page and downloads
    it with ``requests``. Each image is written to ``<page>.png`` in the
    current directory and appended to the Word document.

    Args:
        url: Address of the preview page to open.
        page_count: Number of pages to fetch; pages 1..page_count are visited.
        output_path: Path of the .docx file that is written at the end.
        load_wait: Seconds to sleep after jumping to a page so that its
            image has time to load.

    A page whose image cannot be downloaded or inserted is reported on stdout
    and skipped. Failures of the browser automation itself (for example an
    element that cannot be found) propagate to the caller, but the browser is
    always shut down first.
    """
    # Imported locally so this function is self-contained. find_element(By...)
    # is available on Selenium 3 and 4, whereas the find_element_by_* helpers
    # used previously were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    doc = Document()

    # Browser-like request headers copied from a real session.
    # NOTE(review): the hard-coded "Host" and "Cookie" values are tied to one
    # server/session and "br" encoding needs brotli support in requests -
    # confirm they are still appropriate for the image URLs being fetched.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "CLIENT_SYS_UN_ID=3rvgCl9XYO1u41DVBzG/Ag==; s_v=cdh%3D%3E27a30245%7C%7C%7Cvid%3D%3E1599561968279953439%7C%7C%7Cfsts%3D%3E1599561968%7C%7C%7Cdsfs%3D%3E0%7C%7C%7Cnps%3D%3E1; s_s=cdh%3D%3E27a30245%7C%7C%7Clast_req%3D%3E1599561968%7C%7C%7Csid%3D%3E1599561968685697441%7C%7C%7Cdsps%3D%3E0; __cfduid=dcce463c0931f0014f9ed1b030e9c47981599561968",
        "Host": "view-cache.book118.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
    }

    # The context manager closes the HTTP session's connections on exit.
    with requests.Session() as session:
        session.headers = headers

        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(10)
            driver.get(url)

            # The page-number box must be emptied before typing the next
            # number: one backspace per digit, and never fewer than the two
            # that the two-digit page numbers of the default run require.
            erase = [keys.Keys.BACK_SPACE] * max(2, len(str(page_count)))

            for i in range(1, page_count + 1):
                page_box = driver.find_element(By.XPATH, '//*[@id="newView"]/div[1]/div/input')
                page_box.send_keys(*erase, str(i), keys.Keys.ENTER)

                # Give the page image time to load before reading its URL.
                time.sleep(load_wait)

                item = driver.find_element(By.XPATH, '//div[@class="webpreview-item"][%s]' % i)
                src = item.find_element(By.CSS_SELECTOR, 'img').get_attribute('src')

                image_name = '%s.png' % i
                # Best effort per page: a failed download or insert is
                # reported and the remaining pages are still processed.
                try:
                    # A timeout keeps one stalled download from hanging the
                    # whole run; raise_for_status stops an HTTP error body
                    # from being saved as if it were an image.
                    res = session.get(src, timeout=30)
                    res.raise_for_status()
                    with open(image_name, 'wb') as f:
                        f.write(res.content)
                    # Fixed 6 x 8 inch picture so each image fills one page;
                    # the image's own aspect ratio is not preserved.
                    doc.add_picture(image_name, width=Inches(6), height=Inches(8))
                    print('%s.png 写入成功' % i)
                except Exception as e:
                    print('%s.png 写入失败,原因是:%s' % (i, str(e)))

            doc.save(output_path)
        finally:
            # Always shut down Chrome/chromedriver, even when a page fails.
            driver.quit()
def createpdf(wordPath, pdfPath):
    """Convert a Word document to PDF through Word's COM automation interface.

    Args:
        wordPath: Path of the source Word file; may be relative to the
            current working directory.
        pdfPath: Path of the PDF file to create; may be relative to the
            current working directory.

    Any error raised while opening or exporting the document propagates to
    the caller after the Word application has been shut down.
    """
    import os

    # Word runs as a separate process and does not resolve relative paths
    # against this script's working directory, so hand it absolute paths.
    word_abs = os.path.abspath(wordPath)
    pdf_abs = os.path.abspath(pdfPath)

    word = gencache.EnsureDispatch('Word.Application')
    try:
        doc = word.Documents.Open(word_abs, ReadOnly=1)
        doc.ExportAsFixedFormat(
            pdf_abs,
            constants.wdExportFormatPDF,
            Item=constants.wdExportDocumentWithMarkup,
            CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
        )
    finally:
        # Quit even when the export fails so no Word process is left
        # running; the document was opened read-only, so nothing is saved.
        word.Quit(constants.wdDoNotSaveChanges)
# Script entry: first scrape the page images into the Word document,
# then export that document to PDF.
createword()
createpdf(wordPath="学术英语.docx", pdfPath="学术英语.pdf")
^_^有帮助就点个赞吧~~~