# -*- coding: cp936 -*-
import sys,re,zipfile,HTMLParser,os
class GetContent(HTMLParser.HTMLParser):
    """HTML parser that accumulates the text passed to ``handle_data``.

    Only ``handle_data`` is overridden, so tags themselves are not recorded.
    The collected text is exposed through the ``content`` attribute.
    """

    def __init__(self):
        # Per the original note, HTMLParser is an old-style class here, so
        # super() cannot be used; call the base initializer directly.
        HTMLParser.HTMLParser.__init__(self)
        self.content = ""

    def handle_data(self, data):
        """Append one run of text reported by the parser to ``content``."""
        self.content = self.content + data
# Matches a run of decimal digits; the capturing group makes re.split keep
# the digit runs in its result.
re_digits = re.compile(r'(\d+)')


def embedded_numbers(s):
    """Split *s* into alternating text and integer pieces.

    The result always starts with a (possibly empty) string; every
    odd-indexed element is a digit run converted to ``int``. Lists built
    this way compare numerically on the digit runs, e.g. the pieces for
    "chap2" sort before the pieces for "chap10".
    """
    tokens = re_digits.split(s)
    # Splitting on a capturing group puts the captured digit runs at the
    # odd indexes of the result.
    for idx in range(1, len(tokens), 2):
        tokens[idx] = int(tokens[idx])
    return tokens
def sort_with_embedded_numbers(zipinfo_list):
    """Return the entries of *zipinfo_list* in natural filename order.

    Each element must have a ``filename`` attribute. Digit runs inside the
    filename are compared as integers (via ``embedded_numbers``), so
    "chap2.html" comes before "chap10.html".

    A new list is returned; the input is not modified. The sort is keyed on
    the filename pieces only and is stable, so entries whose keys are equal
    keep their original relative order and the entry objects themselves are
    never compared with each other.
    """
    return sorted(zipinfo_list,
                  key=lambda zipinfo: embedded_numbers(zipinfo.filename))
def _extract_epub_text(epub_path):
    """Return the concatenated text of the HTML documents in an EPUB file.

    Archive members whose names end in "html" or "htm" are processed in
    natural filename order (see ``sort_with_embedded_numbers``). The archive
    is closed before returning, including when reading or parsing fails.
    """
    with zipfile.ZipFile(epub_path) as archive:
        html_list = [zip_info
                     for zip_info in archive.infolist()
                     if zip_info.filename.endswith(("html", "htm"))]
        html_list = sort_with_embedded_numbers(html_list)
        parts = []
        for html in html_list:
            # A fresh parser per document: feed() buffers incomplete markup,
            # so a shared parser would merge whatever one file leaves
            # unfinished into the start of the next file.
            parser = GetContent()
            parser.feed(archive.read(html))
            parts.append(parser.content)
    return "".join(parts)


# Convert every *.epub in the current directory that does not already have
# a matching .txt file next to it.
Files = os.listdir(os.getcwd())
_existing_names = set(Files)
Files = [f for f in Files
         if f.endswith(".epub")
         and os.path.splitext(f)[0] + ".txt" not in _existing_names]
for fname in Files:
    # Swap only the extension; str.replace would also rewrite any ".epub"
    # that happens to occur earlier in the name.
    output_filename = os.path.splitext(fname)[0] + ".txt"
    text = _extract_epub_text(fname)
    # Extract first, then open the output, so a failed extraction does not
    # leave an empty .txt behind that would make later runs skip the book.
    with open(output_filename, 'w') as out_fh:
        out_fh.write(text)
    # Same output as the original `print fname," done!"` (two spaces).
    print("%s  done!" % fname)
# EPUB to TXT converter
# (Source page footer: latest recommended article published 2024-04-23 14:00:00)