Making an EPUB with Python

Prerequisites

  1. The EbookLib library
    1. If the version on PyPI is too old, clone the repository from GitHub and run python setup.py install
  2. The zhconv library, used here for Traditional-to-Simplified Chinese conversion
  3. A novel site that allows its data to be scraped: novel-backup
  4. A little bit of time

Building the book

Step 1: Analyze the site

Starting from the site you are scraping, fetch the list of links to all chapters:
https://novels.novel-backup.cf/index/1558018541.json

Then examine what comes back
(some entries omitted)

[
  {
    "name": "41.成套的葡萄酒杯",
    "id": 7460
  },
  {
    "name": "42.烤肉午餐",
    "id": 7550
  }
]

The id field here is the yy part of the chapter-content URL xx/yy.json used below.
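
As a small illustration, this is how an index entry's id turns into a chapter URL (a minimal sketch; the same pattern appears in getContent below):

    # Build a chapter URL from an index entry (values from the sample above)
    chapter = {"name": "41.成套的葡萄酒杯", "id": 7460}
    url = "https://novels.novel-backup.cf/novels/{}.json".format(chapter["id"])
    print(url)  # https://novels.novel-backup.cf/novels/7460.json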

Next, fetch a chapter's content and take a look at it
https://novels.novel-backup.cf/novels/93065.json
(some fields omitted)

{
  "code_id": 1558018541,
  "title": "第1卷插圖",
  "create_date": "2020-10-07 20:51:33",
  "content": "<p><img src=\"https://live.staticflickr.com/65535/50431755246_afecb655fc_o.png[/img][/url][url=https://flic.kr/p/2jQtVPu]魔導具師ダリヤはうつむかない 1-0[/url] by [url=https://www.flickr.com/photos/55799173@N00/]jameslam518[/url], on Flickr\"  class=\"fr-fic fr-dib\"></p><p><br></p>",
  "author": "職業量地官",
  "views": 2896
}

For our purposes, the useful fields are title, content, and author.

Step 2: Scrape and clean the data

EbookLib lays chapters out in the order they are added (add_item and the spine), so the scraped chapters need to be sorted first.
Start by creating a new .py file and defining a class Espider:

    def getJson(self,url):
        # fetch the URL and decode the JSON body
        html:requests.Response= requests.get(url)
        return html.json()
    def getDictList(self,url):
        # the chapter index is a JSON array of {name, id} dicts
        js:typing.List[dict]=self.getJson(url)
        return js
    def getFilter(self,li_list):
        # pull a sortable chapter number out of each name; numberless
        # chapters are placed after the current maximum (maxx)
        maxx=0
        id_dicts=[]
        for li in li_list:
            idict=li
            idict['name']=convert(idict['name'],'zh-hans')
            # leading numbers such as "41." (the unescaped '.' also matches
            # full-width separators, so names like "第3话" land here too)
            ll=re.findall(r'([1-9]\d*.\d*|0\.\d*[1-9]\d*)',idict['name'])
            if(len(ll)>0):
                s:str=ll[0]
                num=int(s[:-1])  # drop the trailing separator character
                idict['num']=num
                maxx=max(maxx,num)
            else:
                ll=re.findall(r'第([1-9]\d*)话',idict['name'])
                if(len(ll)>0):
                    s:str=ll[0]
                    num=int(s)
                    idict['num']=num
                    maxx=max(num,maxx)
                else:
                    # no number at all: append after everything seen so far
                    maxx+=1
                    idict['num']=maxx
            id_dicts.append(idict)
        id_dicts.sort(key=lambda it:it['num'])
        tmp_list:typing.List[dict]=[]
        for i in range(len(id_dicts)):
            id_dicts[i]['i']=str(i)  # final position, stored as a string
            tmp_list.append(id_dicts[i])
        return tmp_list

First we fetch the data and convert it into Python objects (getJson, getDictList).
The long-winded getFilter boils down to this: take the number at the front of each chapter name in the list (the 174 in "174. 疲勞與真心話", the 3 in "第3話 商業公會"), and for any chapter whose name carries no number, assign num = maxx, the running maximum bumped by one.
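
As a sanity check, here is a minimal run of getFilter on hand-made entries (the names are invented; this assumes the finished spider.py from the full listing below is importable):

    from spider import Espider

    es = Espider()
    sample = [
        {"name": "第3话 商业公会", "id": 2},
        {"name": "41.成套的葡萄酒杯", "id": 1},
        {"name": "插图", "id": 3},  # no number: gets maxx + 1
    ]
    for it in es.getFilter(sample):
        print(it["i"], it["num"], it["name"])
    # 0 3 第3话 商业公会
    # 1 41 41.成套的葡萄酒杯
    # 2 42 插图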

Fetching the chapter content

Images need special handling: save each one locally first, then add it to the EPUB file.

    def getDict(self,url):
        js:dict=self.getJson(url)
        return js
    def saveImg(self,title,src):
        # download an image into Images/<title>/ and remember it for later
        path='Images/{}'.format(title)
        os.makedirs(path,exist_ok=True)
        # the src attribute is polluted with flickr BBCode (see the sample
        # JSON above), so cut the real file name out of it
        s=re.findall(r'65535/(.*?)\[/img\]',src)
        if(len(s)==0):
            s=re.findall(r'65535/(.*?\.png)',src)[0]
        else:
            s=s[0]
        res:requests.Response=requests.get(src,stream=True)
        res.raise_for_status()
        with open("{}/{}".format(path,s),"wb") as f:
            f.write(res.content)
        self.img_list.append({
            'src':"{}/{}".format(path,s),
            'uid':s.split('.')[0]
        })
        return "{}/{}".format(path,s)
    
    def contentCheck(self,title,content:str):
        # download every <img> and repoint its src at the local copy
        soup=BeautifulSoup(content,'lxml')
        for img in soup.findAll('img'):
            s=self.saveImg(title,img['src'])
            img['src']=s
        return str(soup.body)
    
    def getContent(self,id):
        # fetch one chapter, convert it to Simplified Chinese, fix the images
        url_s='https://novels.novel-backup.cf/novels/'
        url_e='.json'
        print(url_s+id+url_e)
        js=self.getDict(url_s+id+url_e)
        js['author']=convert(js['author'],'zh-hans')
        js['title']=convert(js['title'],'zh-hans')
        js['content']=convert(js['content'],'zh-hans')
        return '<p>搬运:'+js['author']+'</p>'+self.contentCheck(js['title'],js['content'])

getDict fetches the data; getContent then pulls the fields out of the JSON, runs the content through contentCheck, and the result is stored in a list later on.
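
A quick way to try this out, using an id from the index sample above (assuming the Espider class is available; running it creates an Images/ folder next to the script):

    es = Espider()
    html = es.getContent('7460')  # "41.成套的葡萄酒杯" per the index sample
    print(html[:200])             # first 200 characters of the cleaned chapter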

Step 3: Save everything into the EPUB

Create a new ebook.py file and import ebooklib and Espider (plus the standard-library helpers threading, typing, and os):


toc = []
spine = ['nav']
book = epub.EpubBook()

chp_list = []


def init(title, author):
    # set metadata
    book.set_identifier('id123456')
    book.set_title(title)
    book.set_language('zh')  # 'zh' is the language code for Chinese, not 'cn'
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous',
                    role='ill', uid='coauthor')

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # define CSS style
    style = 'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;color:#263238;line-height:1.6;font-size:13px;border-radius:3px;margin-top:0;margin-bottom:1em;overflow:auto}b,strong{font-weight:bolder}#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    nav_css = epub.EpubItem(
        uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

    # add CSS file
    book.add_item(nav_css)


def saveChapter():
    c1=getChapter(
        '前言', '<p>使用python ebooklib整合,数据来源https://novel-backup.cf/,仅供参考请勿商用</p>', '000')
    book.add_item(c1)
    toc.append(epub.Link(c1.file_name,c1.title,c1.title))
    spine.append(c1)
    for it in chp_list:
        # For each chapter add chapter to the book, TOC and spine
        book.add_item(it['chapter'])
        toc.append(epub.Link(it['chapter'].file_name,
                   it['chapter'].title, it['chapter'].title))
        spine.append(it['chapter'])


def saveImage(img_list: typing.List[dict]):
    for info in img_list:
        # read each downloaded file and register it in the book
        with open(info['src'], 'rb') as f:
            image_content = f.read()
        img = epub.EpubImage(uid=info['uid'], file_name=info['src'],
                             media_type='image/png', content=image_content)
        book.add_item(img)


def saveEpub(file_name):
    # define Table Of Contents
    book.toc = tuple(toc)
    # basic spine
    book.spine = spine
    # write to the file (make sure the output directory exists first)
    os.makedirs('epub', exist_ok=True)
    epub.write_epub('epub/'+file_name, book, {})


def getChapter(title, content, id):
    c1 = epub.EpubHtml(title=title,
                       file_name='chap_'+id+'.xhtml', lang='zh')
    c1.content = '<h1>'+title+'</h1>'+content
    return c1


def poChapter(it, llen):
    i = int(it['i'])+1
    c = getChapter(it['name'], es.getContent(
        str(it['id'])), str(i).zfill(llen))
    chp_list.append({
        'chapter': c,
        'id': i
    })


if __name__ == '__main__':
    init('魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~', '自动叉积·整合')
    es = Espider()
    li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    li_list = es.getDictList(li_url)
    id_dicts = es.getFilter(li_list)
    llen = len(str(len(id_dicts)))
    # poChapter(id_dicts[0],llen)

    # create the threads, 4 per batch (guard the last batch, which may be short)
    index = [i for i in range(0, len(id_dicts), 4)]
    threads = []
    for i in index:
        for j in range(0, 4):
            if i+j >= len(id_dicts):
                break
            threads.append(threading.Thread(
                target=poChapter, args=(id_dicts[i+j], llen)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('Main thread has ended!')
    chp_list.sort(key=lambda it: it['id'])
    saveChapter()
    saveImage(es.img_list)
    saveEpub('《魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~》.epub')

init follows the example given in the official EbookLib documentation. str(i).zfill(llen) zero-pads the chapter number so that file names sort correctly, e.g. 'chap_002.xhtml'.
threading is brought in so the scraping runs in multiple threads, which speeds things up considerably.
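
For example, with llen = 3:

    >>> str(2).zfill(3)
    '002'
    >>> 'chap_' + str(2).zfill(3) + '.xhtml'
    'chap_002.xhtml'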

Full code

# spider.py
import requests
from bs4 import BeautifulSoup
import typing
import re
import os
from zhconv import convert

class Espider:
# https://novels.novel-backup.cf/index/1558018541.json

# https://novels.novel-backup.cf/novels/7460.json
    def __init__(self):
        # keep the image list per instance (a class-level list would be
        # shared by every Espider instance)
        self.img_list=[]
    def getJson(self,url):
        html:requests.Response= requests.get(url)
        # soup = BeautifulSoup(html.json())
        return html.json()

    def getDict(self,url):
        js:dict=self.getJson(url)
        # print(js)
        return js

    def getDictList(self,url):
        js:typing.List[dict]=self.getJson(url)
        # print(js)
        return js

    def saveImg(self,title,src):
        path='Images/{}'.format(title)
        os.makedirs(path,exist_ok=True)
        # print(src)
        s=re.findall(r'65535/(.*?)\[/img\]',src)
        # print(s)
        if(len(s)==0):
            s=re.findall(r'65535/(.*?\.png)',src)[0]
        else:
            s=s[0]
        # print(s)
        res:requests.Response=requests.get(src,stream=True)
        res.raise_for_status()
        with open("{}/{}".format(path,s),"wb") as f:
            f.write(res.content)
        self.img_list.append({
            'src':"{}/{}".format(path,s),
            'uid':s.split('.')[0]
        })
        return "{}/{}".format(path,s)
    
    def contentCheck(self,title,content:str):
        soup=BeautifulSoup(content,'lxml')
        # print(soup)
        for img in soup.findAll('img'):
            s=self.saveImg(title,img['src'])
            img['src']=s
        # ''.join(str(it) for it in soup.find_all('p'))
        return str(soup.body)
    
    def getContent(self,id):
        url_s='https://novels.novel-backup.cf/novels/'
        url_e='.json'
        print(url_s+id+url_e)
        js=self.getDict(url_s+id+url_e)
        js['author']=convert(js['author'],'zh-hans')
        js['title']=convert(js['title'],'zh-hans')
        js['content']=convert(js['content'],'zh-hans')
        # print(js['author'],js['title'],js['content'])
        return '<p>搬运:'+js['author']+'</p>'+self.contentCheck(js['title'],js['content'])

    def getFilter(self,li_list):
        maxx=0
        id_dicts=[]
        for li in li_list:
            idict=li
            idict['name']=convert(idict['name'],'zh-hans')
            ll=re.findall(r'([1-9]\d*.\d*|0\.\d*[1-9]\d*)',idict['name'])
            if(len(ll)>0):
                s:str=ll[0]
                num=int(s[:-1])
                idict['num']=num
                maxx=max(maxx,num)
            else:
                ll=re.findall(r'第([1-9]\d*)话',idict['name'])
                if(len(ll)>0):
                    s:str=ll[0]
                    num=int(s)
                    idict['num']=num
                    maxx=max(num,maxx)
                else:
                    maxx+=1
                    idict['num']=maxx
            id_dicts.append(idict)
        id_dicts.sort(key=lambda it:it['num'])
        tmp_list:typing.List[dict]=[]
        for i in range(len(id_dicts)):
            id_dicts[i]['i']=str(i)
            tmp_list.append(id_dicts[i])
        return tmp_list
    
    def getIdList(self,li_list):
        id_list:typing.List[str]=[str(it['id']) for it in li_list]
        return id_list
    
if __name__=="__main__":
    print("爬取开始")
    # po=pool.Pool(5)
    # li_url='https://novels.novel-backup.cf/index/1558018541.json'
    es=Espider()
    # li_list=es.getDictList(li_url);
    # # print(li_list)
    # id_dicts=es.getFilter(li_list)
    # print(id_dicts)
    print(es.getContent('112353'))
    print(es.getContent('16733'))
    # print(es.img_list)
    print('爬取结束')

# ebook.py
import os
import threading
import typing
from ebooklib import epub
from spider import Espider

toc = []
spine = ['nav']
book = epub.EpubBook()

chp_list = []


def init(title, author):
    # set metadata
    book.set_identifier('id123456')
    book.set_title(title)
    book.set_language('zh')
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous',
                    role='ill', uid='coauthor')

    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # define CSS style
    style = 'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;color:#263238;line-height:1.6;font-size:13px;border-radius:3px;margin-top:0;margin-bottom:1em;overflow:auto}b,strong{font-weight:bolder}#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    nav_css = epub.EpubItem(
        uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

    # add CSS file
    book.add_item(nav_css)


def saveChapter():
    c1=getChapter(
        '前言', '<p>使用python ebooklib整合,数据来源https://novel-backup.cf/,仅供参考请勿商用</p>', '000')
    book.add_item(c1)
    toc.append(epub.Link(c1.file_name,c1.title,c1.title))
    spine.append(c1)
    for it in chp_list:
        # For each chapter add chapter to the book, TOC and spine
        book.add_item(it['chapter'])
        toc.append(epub.Link(it['chapter'].file_name,
                   it['chapter'].title, it['chapter'].title))
        spine.append(it['chapter'])

    # print('save c', chapter.file_name)


def saveImage(img_list: typing.List[dict]):
    for info in img_list:
        with open(info['src'], 'rb') as f:
            image_content = f.read()
        img = epub.EpubImage(uid=info['uid'], file_name=info['src'],
                             media_type='image/png', content=image_content)
        book.add_item(img)


def saveEpub(file_name):
    # define Table Of Contents
    book.toc = tuple(toc)
    # basic spine
    book.spine = spine
    # write to the file (make sure the output directory exists first)
    os.makedirs('epub', exist_ok=True)
    epub.write_epub('epub/'+file_name, book, {})


def getChapter(title, content, id):
    c1 = epub.EpubHtml(title=title,
                       file_name='chap_'+id+'.xhtml', lang='zh')
    c1.content = '<h1>'+title+'</h1>'+content

    print("g", c1.file_name, c1.title, id)
    return c1


def poChapter(it, llen):
    # print("开始进程", it['i'])
    i = int(it['i'])+1
    c = getChapter(it['name'], es.getContent(
        str(it['id'])), str(i).zfill(llen))
    chp_list.append({
        'chapter': c,
        'id': i
    })
    # saveChapter(c, it['i'])


if __name__ == '__main__':
    init('魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~', '自动叉积·整合')
    es = Espider()
    li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    li_list = es.getDictList(li_url)
    id_dicts = es.getFilter(li_list)
    llen = len(str(len(id_dicts)))
    # poChapter(id_dicts[0],llen)

    # create the threads, 4 per batch (guard the last batch, which may be short)
    index = [i for i in range(0, len(id_dicts), 4)]
    threads = []
    for i in index:
        for j in range(0, 4):
            if i+j >= len(id_dicts):
                break
            threads.append(threading.Thread(
                target=poChapter, args=(id_dicts[i+j], llen)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('Main thread has ended!')
    chp_list.sort(key=lambda it: it['id'])
    saveChapter()
    # es.img_list.append('Images/第6卷插圖/51154283631_826ee93727_o.png')
    saveImage(es.img_list)
    saveEpub('《魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~》.epub')
