"""Convert the ``.htm`` files in INPUTDIR into plain-text files in OUTPUTDIR.

Each page is parsed with ``htmllib.HTMLParser``; the flowing text emitted by
the formatter is collected and written to ``OUTPUTDIR/<page title>.txt``.

NOTE: this is Python 2 code (2.6+ because of the ``with`` statement).
``htmllib`` and the ``unicode`` builtin do not exist in Python 3.
"""
import os
import os.path
import sys
from formatter import AbstractFormatter, NullWriter
from htmllib import HTMLParser

OUTPUTDIR = "./txt"
INPUTDIR = "."


def _(str, in_encoder="gbk", out_encoder="utf8"):
    """Re-encode a byte string from ``in_encoder`` to ``out_encoder``.

    The parameter is still called ``str`` (shadowing the builtin inside this
    function only) so that existing keyword callers keep working.
    """
    return unicode(str, in_encoder).encode(out_encoder)


class myWriter(NullWriter):
    """Formatter writer that records flowing text and ignores everything else."""

    def __init__(self):
        NullWriter.__init__(self)
        # Text chunks in the order the formatter delivered them.
        self._bodyText = []

    def send_flowing_data(self, data):
        """Store one chunk of flowing text handed over by the formatter."""
        self._bodyText.append(data)

    def _get_bodyText(self):
        """Return all recorded chunks joined with newlines."""
        # The original joined with the two characters '/n', a typo for the
        # newline escape.
        return '\n'.join(self._bodyText)

    bodyText = property(_get_bodyText, None, None, 'plain text from body')


class myHTMLParser(HTMLParser):
    """HTMLParser that also remembers the attributes of the last <meta> tag."""

    # Default so that reading ``metas`` on a page without any <meta> tag does
    # not raise AttributeError.
    metas = None

    def do_meta(self, attrs):
        """Keep the attributes of the <meta> tag just seen.

        Each call overwrites the previous value, so only the last <meta> tag
        of the document is retained.
        """
        self.metas = attrs


def convertFile(filename):
    """Parse the HTML file ``filename`` and return ``(title, body_text)``.

    ``title`` is the page title re-encoded by ``_`` (gbk -> utf8 by default).
    When the page has no title, or an empty one, the base name of
    ``filename`` without its extension is returned instead, unconverted.
    ``body_text`` is the collected flowing text; it is NOT re-encoded and
    stays in whatever encoding the input file uses.
    """
    writer = myWriter()
    parser = myHTMLParser(AbstractFormatter(writer))

    with open(filename) as source:
        parser.feed(source.read())
    # feed() may leave trailing markup buffered; close() forces it through.
    parser.close()

    title = parser.title
    if title:
        title = _(title)
    else:
        # parser.title stays None when the page has no <title>; passing that
        # to unicode() would raise TypeError.
        title = os.path.splitext(os.path.basename(filename))[0]

    return (title, writer.bodyText)


def main():
    """Convert every ``*.htm`` file found in INPUTDIR into OUTPUTDIR."""
    if not os.path.exists(OUTPUTDIR):
        os.mkdir(OUTPUTDIR)

    for name in os.listdir(INPUTDIR):
        if not name.endswith('.htm'):
            continue

        sys.stdout.write("Coverting %s " % name)

        # listdir() yields bare names; resolve them against INPUTDIR so the
        # script keeps working if INPUTDIR is not the current directory.
        title, text = convertFile(os.path.join(INPUTDIR, name))

        # A path separator inside the title would point the output outside
        # OUTPUTDIR or at a directory that does not exist.
        safe_title = title.replace("/", "_").replace(os.sep, "_")
        outfullname = os.path.join(OUTPUTDIR, safe_title + '.txt')

        with open(outfullname, "wt") as out:
            out.write(text)

        sys.stdout.write("Done!\n")


if __name__ == "__main__":
    main()