book118可预览文档下载

最新推荐文章于 2024-08-13 10:56:42 发布

qq_39972887

最新推荐文章于 2024-08-13 10:56:42 发布

阅读量7.8k

点赞数

文章标签： python http

本文链接：https://blog.csdn.net/qq_39972887/article/details/107849705

版权

book118可预览文档下载

book118可预览文档下载

最早从csdn上搜到的book118可预览文档下载方法是js代码，很简短的一段，然后在开发者工具里使用。按步骤一回车即可下载。

/**
 * Fetch a resource as a binary blob and pass it to downloadByA for saving.
 *
 * Failures are reported on the console so that a missing page can be
 * noticed and retried instead of disappearing silently.
 *
 * @param {string} url - Address of the resource to fetch.
 * @param {string} fileName - File name suggested for the saved copy.
 */
function download(url, fileName) {
    let xhr = new XMLHttpRequest();
    xhr.open('GET', url, true); // true = asynchronous request
    xhr.responseType = 'blob';
    xhr.onload = () => {
        if (xhr.status === 200) {
            downloadByA(xhr.response, fileName);
        } else {
            // Any other status used to be dropped without a trace.
            console.warn('download failed (HTTP ' + xhr.status + '): ' + fileName);
        }
    };
    // Network-level failures never reach onload, so report them here.
    xhr.onerror = () => {
        console.warn('download failed (network error): ' + fileName);
    };
    xhr.send();
}
/**
 * Save data to disk by clicking a temporary <a download> element.
 *
 * @param {*} data - Content wrapped into a Blob (here: the XHR blob response).
 * @param {string} fileName - File name suggested to the browser.
 */
function downloadByA(data, fileName) {
    let urlObject = window.URL || window.webkitURL || window;
    let export_blob = new Blob([data]);
    let objectUrl = urlObject.createObjectURL(export_blob);
    let a = document.createElement("a");
    a.href = objectUrl;
    a.download = fileName;
    a.style.display = "none";
    // Click the anchor while it is attached to the document; a detached
    // anchor is not reliably honoured by every browser.
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    // Release the blob URL afterwards; without this every saved page stays
    // in memory until the tab is closed. The delay gives the browser time
    // to start the download before the URL becomes invalid.
    setTimeout(() => urlObject.revokeObjectURL(objectUrl), 40000);
}
// Adjust the selector below to fit the page being saved; the two helper
// functions above can be used unchanged.
// Every matched image is saved under its zero-based position, e.g. "0.jpg".
document.querySelectorAll(".pageBox img").forEach((img, index) => {
    download(img.src, index + ".jpg");
});

原文链接在这里文章链接

但是使用以后发现有问题，文件页数一多就出现缺页情况。
再次搜索发现了python实现的代码。
Py3 下载book118预览图片并合并成docx文件文章链接
就拷贝测试了一下，发现作者的源码有问题，然后进行了部分更正，和格式调整。
中间出了点 list 和函数间调用时的 bug。
然后增加了用 pymupdf 模块生成 pdf 文件的步骤，毕竟 pdf 方便得多，反正下载的也是不可编辑的 png 图片。

#!/usr/bin/env python
# coding: utf-8

# In[1]:


# -*- coding: utf-8 -*-


# In[2]:


import requests


# In[3]:


import json


# In[4]:


import re


# In[5]:


import time


# In[6]:


import os


# In[7]:


import sys


# In[8]:


import lxml


# In[9]:


from bs4 import BeautifulSoup


# In[10]:


from docx import Document


# In[11]:


from docx.shared import Cm


# In[12]:


import glob


# In[13]:


import fitz


# In[14]:


# Browser-like User-Agent header passed to every requests.get() call below.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
    


# In[15]:


# Shared state: the functions below append to this list and read it back by
# position. When run through main() the layout is
#   [0] save directory, [1] preview URL, [2] title, [3] book info dict,
#   [4] page-number -> image-URL dict, [5] image folder.
mylist=[]


# In[16]:


#定义保存地址


# In[17]:


def path1():
    """Ask for the download directory, create it if needed, and record it.

    An answer shorter than two characters selects the built-in default
    directory. The chosen directory is appended to the module-level
    ``mylist``.
    """
    save_dir = input('please input path:')
    if len(save_dir) < 2:
        save_dir = 'C:\\tmp'
    # exist_ok replaces the separate exists() test and its check-then-create
    # race; an already existing directory is simply reused.
    os.makedirs(save_dir, exist_ok=True)

    mylist.append(save_dir)
    


# In[18]:


#len(path1)


# In[19]:


# Resolve the real preview address (turl) of a document page.
def turl():
    """Prompt for a document URL and record its preview URL and title.

    Appends two items to the module-level ``mylist``: the preview URL built
    from the document id found in the entered URL, then the page title cut
    at its first ``.``.

    Raises:
        ValueError: if the URL contains no ``<digits>.s`` document id.
    """
    url = input('please input url:')
    if len(url) < 10:
        # Too short to be a real address: fall back to the sample document.
        url = 'https://max.book118.com/html/2017/0323/96553980.shtm'

    # Raw string avoids the invalid "\d" escape; the group captures the id
    # itself, so no trailing ".s" has to be sliced off afterwards.
    id_match = re.search(r'(\d{1,100})\.s', url)
    if id_match is None:
        raise ValueError('no document id found in url: {}'.format(url))
    aid = id_match.group(1)

    rep = requests.get(url, headers=headers)
    soup = BeautifulSoup(rep.content, 'lxml', from_encoding='utf-8')
    # A page without a usable <title> falls back to the document id instead
    # of failing with AttributeError/IndexError.
    if soup.title is not None and soup.title.contents:
        title = str(soup.title.contents[0])
    else:
        title = aid
    # partition() keeps the whole title when it has no '.'; slicing with
    # find() == -1 used to drop the last character in that case.
    title = title.partition('.')[0] or aid

    preview_url = 'https://max.book118.com/index.php?g=Home&m=NewView&a=index&aid={}'.format(aid)
    mylist.append(preview_url)
    mylist.append(title)
    

    


# In[20]:


def bookinfo():
    """Collect the preview metadata of the document and append it to ``mylist``.

    Calls ``turl()`` first, then downloads the preview page whose address is
    stored at ``mylist[1]`` and reads the page counts, document id and view
    token from the inline script that mentions ``PREVIEW_PAGE``. The
    resulting dict is printed and appended to ``mylist``.

    Raises:
        RuntimeError: if the site answers with a verification page or no
            matching script is found, so no metadata could be extracted.
    """
    turl()
    # Fetch the preview page; its inline script carries the metadata.
    rep2 = requests.get(mylist[1], headers=headers)
    book = None
    if '验证' in rep2.text:
        print('need verify')
        print(rep2.text)
    else:
        bs = BeautifulSoup(rep2.content, 'lxml', from_encoding='utf-8')
        # Same pattern for every script, so compile it once outside the loop.
        p1 = re.compile(".+?'(.+?)'")
        for i in bs.find_all('script'):
            js = i.string
            if js is not None and 'PREVIEW_PAGE' in js:
                js_line = js.splitlines(True)
                # The quoted values are read from the 2nd and 8th line of the
                # script; presumably this mirrors the site's current script
                # layout and has to be adjusted if that changes.
                page_fields = p1.findall(js_line[1])
                token_fields = p1.findall(js_line[7])
                book = {
                    'pageAll': page_fields[0],
                    'pagePre': page_fields[1],
                    'aid': token_fields[0],
                    'viewToken': token_fields[1],
                    'title': mylist[2]
                    }
                print("ok")
            else:
                print("")

    # Previously ``book`` was simply unbound here and the function died with
    # an UnboundLocalError; fail with an explicit message instead.
    if book is None:
        raise RuntimeError('could not read preview info from {}'.format(mylist[1]))

    print('book:', book)
    mylist.append(book)
     


# In[21]:


def prepage_url(max_repeat=30):
    """Collect the preview image URLs of the document and append them to ``mylist``.

    Uses the book info stored at ``mylist[3]`` to query the preview API in
    steps of six pages, starting at page 1, until the last previewable page
    (``pagePre``) has been requested. The merged page-number -> URL dict is
    printed and appended to ``mylist``.

    Args:
        max_repeat: number of consecutive empty answers for the same page
            after which collection stops with what has been gathered so far.
    """
    book = mylist[3]
    last_page = int(book['pagePre'])
    url3 = 'https://openapi.book118.com/getPreview.html'
    retry_delay = 3  # seconds slept after every request, successful or not
    page_num = 1
    repeat = 0
    url3_dict = {}

    # "<=" so that the final page is requested as well when it starts a new
    # batch (e.g. 7 preview pages: the batch starting at page 7).
    while page_num <= last_page:
        playload = {
            'project_id': 1,
            'aid': book['aid'],
            'view_token': book['viewToken'],
            'page': page_num
            }
        rep3 = requests.get(url3, params=playload, headers=headers)
        # The slice strips a fixed-length wrapper around the JSON payload
        # (presumably a JSONP callback: 12 leading and 2 trailing characters).
        rep3_dict = json.loads(rep3.text[12:-2])
        data = rep3_dict.get('data') or {}
        # .get(): a missing page key counts as "not ready yet" instead of
        # raising KeyError.
        if data.get(str(page_num)):
            url3_dict.update(data)
            page_num = page_num + 6
            time.sleep(retry_delay)
            repeat = 0
            continue

        if repeat >= max_repeat:
            # Without this bound a page that never becomes available would
            # keep the loop running forever.
            print('\n{} : get nothing after {} tries, give up.'.format(page_num, repeat))
            break
        # The messages report the delay that is really slept.
        if repeat > 6:
            message = " : Repeat too much.\n !get nothing, sleep {} second.".format(retry_delay)
        else:
            message = " : !get nothing, sleep {} second.".format(retry_delay)
        sys.stdout.write('\r{0}'.format(str(page_num) + message))
        sys.stdout.flush()
        time.sleep(retry_delay)
        repeat = repeat + 1

    print(url3_dict)
    mylist.append(url3_dict)
    


# In[22]:


def download():
    """Download every preview image and assemble the images into a .docx file.

    Images are written as ``PageNNN.png`` into ``<mylist[0]>/<mylist[2]>``
    (created if missing). Each saved image is added to a document that is
    saved as ``<mylist[2]>.docx`` in the current working directory. The image
    folder is appended to ``mylist``. A page that fails is reported and
    skipped so the remaining pages are still processed.
    """
    # Target folder for the images.
    path = mylist[0] + '/{}'.format(mylist[2])
    os.makedirs(path, exist_ok=True)

    # Document with 21 x 29.7 cm pages and zero margins; pictures are added
    # at the full page width.
    myDocx = Document()
    for section in myDocx.sections:
        section.page_width = Cm(21)
        section.page_height = Cm(29.7)
        section.left_margin = section.right_margin = section.top_margin = section.bottom_margin = Cm(0)

    for item, raw_url in mylist[4].items():
        try:
            num = 'Page{:0>3}'.format(item)
            # Keep the address from "view" onwards and prefix the scheme;
            # adjust this to how complete the collected URLs are.
            url_item = raw_url[raw_url.index('view'):]
            url = 'http://' + url_item
            print('url:', url, ';')
            rep = requests.get(url, headers=headers)
            # Do not store an HTTP error body as if it were an image.
            rep.raise_for_status()
            img_filename = path + '/{}.png'.format(num)
            with open(img_filename, 'wb') as img:
                img.write(rep.content)
            print('Saved locally img_filename:', img_filename)
            myDocx.add_picture(img_filename, width=Cm(21))
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C working; the cause
            # is shown so a failed page can be diagnosed.
            print('{} download wrong: {}'.format(item, exc))

    myDocx.save('{}.docx'.format(mylist[2]))
    mylist.append(path)
    


# In[23]:


def pic2pdf():
    """Merge the files in the image folder into one PDF in the working directory.

    Every file in ``mylist[5]`` is opened with PyMuPDF in sorted name order,
    converted to a one-document PDF and appended to the output. The output
    is named after the image folder (its part after ``mylist[0]``) plus
    ``.pdf``; an existing file of that name is replaced.
    """
    doc = fitz.open()
    path = mylist[5] + "/*"
    # Folder name relative to the save directory, i.e. without "<mylist[0]>/".
    a1 = len(mylist[0]) + 1
    name = mylist[5][a1:] + ".pdf"
    print(name)

    # PyMuPDF renamed insertPDF/convertToPDF to snake_case and current
    # releases no longer provide the old names; use whichever exists.
    insert_pdf = getattr(doc, 'insert_pdf', None) or doc.insertPDF
    for img in sorted(glob.glob(path)):
        imgdoc = fitz.open(img)
        convert_to_pdf = getattr(imgdoc, 'convert_to_pdf', None) or imgdoc.convertToPDF
        pdfbytes = convert_to_pdf()
        imgdoc.close()
        imgpdf = fitz.open("pdf", pdfbytes)
        insert_pdf(imgpdf)
        # The pages have been copied into doc, so the per-image document can
        # be released right away.
        imgpdf.close()

    if os.path.exists(name):
        os.remove(name)
    doc.save(name)
    doc.close()


# In[24]:


def main():
    """Run the interactive workflow: ask for inputs, download, build docx/pdf.

    The steps communicate through the module-level ``mylist``, so they have
    to run in exactly this order.
    """
    path1()
    bookinfo()
    prepage_url()
    download()
    print('all done')
    number = input('是否需要输出pdf:（是打1，否打其他）')
    # The prompt promises that any other answer means "no"; a bare int()
    # raised ValueError on non-numeric or empty input.
    try:
        wants_pdf = int(number) == 1
    except ValueError:
        wants_pdf = False
    if wants_pdf:
        pic2pdf()
        print("pdf's done")
    else:
        print('88')
        
    
    


# In[25]:


# Start the interactive workflow only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    
    main()

用到的模块包括
requests
json
re
time
os
sys
lxml
bs4
python-docx
pymupdf

为了好看，把代码分成了七个函数（path1、turl、bookinfo、prepage_url、download、pic2pdf、main）。其实不做成函数也一样用，还更简单些，不会出莫名其妙的麻烦。

图片存放地址是path1（默认定义的是c盘tmp目录）下，会根据下载的文档名称生成新的子目录。
生成的 docx 和 pdf 文件会保存在运行程序时的当前工作目录下。
ps：1. Windows 下单个反斜杠 "\" 写在路径字符串里会被当作转义符而出错，需要写成 "\\"（如 'C:\\tmp'）。
2. '{}'.format() 是一种字符串格式化的方法。
3.跨函数调用的时候，return值会缺少某些属性
4.python在识别路径的时候对\和/都很友好。

下一步加一个界面。
用pyinstaller 打包的时候跑到另一个xp的机器上测试，结果发现了一个错误
pil/_imaging.cp38-win32.pyd文件找不到。
因为其他电脑都装了python，暂时还没去处理这个问题。回头再继续debug。

测试了一个195页的文件，没有出现错误。