批量html转text
(转载请注明来源于金庆的专栏)
原来的代码是参考“Recipe 12.11. Using MSHTML to Parse XML or HTML”,利用htmlfile提取文本。
将当前目录下的所有html文件转换为text文件。
但是发现MSHTML解析文件可能会出错,造成文本提取失败。
jigloo经过对10W+个html文件的测试,得出结论,htmlfile的容错比InternetExplorer.Application要差很多。
原文见:http://groups.google.com/group/python-cn/msg/c9221764bcafbc21
他的代码大致如下,IE使用稍烦:
对于我的简单任务,这就足够了。
有一个问题,如果有资源管理器打开着,运行这段代码会关闭资源管理器,并出错退出。比较奇怪,但应该不难解决,可能是IE控件的使用上还有问题。
self.__ie.Document.close()
File "C:/Python25/Lib/site-packages/win32com/client/dynamic.py", line 496, in __getattr__
raise AttributeError, "%s.%s" % (self._username_, attr)
AttributeError: Document.close
(转载请注明来源于金庆的专栏)
原来的代码是参考“Recipe 12.11. Using MSHTML to Parse XML or HTML”,利用htmlfile提取文本。
将当前目录下的所有html文件转换为text文件。
def extractHtmlFile(htmlFilePath):
    ''' Extract html text and save to text file.

    Reads the HTML file at htmlFilePath, feeds it to the MSHTML
    "htmlfile" COM object and takes the inner text of the parsed body,
    encoded as GBK (characters GBK cannot represent are dropped).

    The original listing is truncated after the text is extracted, so the
    step that saves the text is not shown here.
    '''
    import win32com.client

    # Close the input file deterministically instead of leaking the handle.
    htmlFile = open(htmlFilePath, 'r')
    try:
        htmlData = htmlFile.read()
    finally:
        htmlFile.close()

    html = win32com.client.Dispatch('htmlfile')
    html.writeln(htmlData)
    text = html.body.innerText.encode('gbk', 'ignore')
    # ... (remainder elided in the original listing: write `text` to a file)
# NOTE(review): headerless repeat of the extractHtmlFile body shown above;
# the `def` line is missing here, so this fragment cannot run on its own.
# NOTE(review): the padding spaces inside the string literals below
# (' r ', ' htmlfile ', ' gbk ', ' ignore ') look like extraction artifacts
# and would not be accepted as a file mode / ProgID / codec name - confirm.
''' Extract html text and save to text file.
'''
htmlData = file(htmlFilePath, ' r ' ).read()
import win32com.client
html = win32com.client.Dispatch( ' htmlfile ' )
html.writeln(htmlData)
text = html.body.innerText.encode( ' gbk ' , ' ignore ' )
# The listing is truncated here; the save-to-file step is not shown.
...
但是发现MSHTML解析文件可能会出错,造成文本提取失败。
jigloo经过对10W+个html文件的测试,得出结论,htmlfile的容错比InternetExplorer.Application要差很多。
原文见:http://groups.google.com/group/python-cn/msg/c9221764bcafbc21
他的代码大致如下,IE使用稍烦:
#!/usr/bin/env python
import sys, os, re, codecs
import time
import win32com.client
class htmlfile:
    '''Read the title and visible text of local HTML files through IE.

    One "InternetExplorer.Application" COM object is created per instance
    and reused for every file.  The most recently loaded document is
    cached, so calling gettitle() and then gettext() for the same file
    navigates only once.
    '''

    # READYSTATE_COMPLETE in the Internet Explorer automation model.
    _READYSTATE_COMPLETE = 4

    def __init__(self):
        # Define every attribute before Dispatch so that __del__ stays safe
        # even when creating the COM object fails.
        self.__ie = None
        self.__filename = ''
        self.__document = None
        self.__ie = win32com.client.Dispatch('InternetExplorer.Application')
        self.__ie.Silent = True

    def __del__(self):
        # Quit the browser instance, if one was actually created.
        ie = getattr(self, '_htmlfile__ie', None)
        if ie is not None:
            ie.Quit()

    def __getdocument(self, filename):
        '''Return the loaded document for filename, navigating if needed.'''
        filename = os.path.abspath(filename)
        if self.__filename != filename:
            # Drop the cache first: if navigation raises, the next call
            # must retry instead of returning the previous document.
            self.__filename = ''
            self.__document = None
            self.__ie.Navigate2(filename)
            # Wait until the browser reports the page as fully loaded.
            # The former Document.close() call was removed: it raised
            # "AttributeError: Document.close", and polling Document.Body
            # could touch the document before it existed.
            while (self.__ie.Busy or
                   self.__ie.ReadyState != self._READYSTATE_COMPLETE):
                time.sleep(0.1)
            self.__document = self.__ie.Document
            self.__filename = filename
        return self.__document

    def gettext(self, filename):
        '''Return the inner text of the body of the HTML file.'''
        return self.__getdocument(filename).Body.innerText

    def gettitle(self, filename):
        '''Return the title of the HTML file.'''
        return self.__getdocument(filename).title
def formattextpath(dir, htmlfile, htmltitle):
    ''' Format the text file path and return.

    The file name is built from the first 6 characters of htmltitle (with
    characters that are illegal in Windows file names replaced by "-"),
    an underscore, and the base name of htmlfile with a ".txt" extension.
    The result is placed inside dir.

    dir       -- directory that will contain the text file.
    htmlfile  -- name of the source html file; only its stem is used.
    htmltitle -- title of the html document; may be empty or None.
    '''
    fname = (htmltitle or '')[:6]
    # One character class covers every reserved character: \ / : * ? < > | "
    fname = re.sub(r'[\\/:*?<>|"]', '-', fname)
    fname = fname + '_' + os.path.splitext(htmlfile)[0] + '.txt'
    # Join with the dir argument, not with a loop variable of the caller.
    return os.path.join(dir, fname)
# Convert every .htm/.html file below the current directory to a text file
# that is written next to it.
if __name__ == '__main__':
    hf = htmlfile()
    try:
        for root, dirs, names in os.walk(u'.'):
            for name in names:
                # Require a real extension and ignore its case.
                if not name.lower().endswith(('.htm', '.html')):
                    continue
                htmlpath = os.path.join(root, name)
                textpath = formattextpath(root, name, hf.gettitle(htmlpath))
                print('%s -> %s' % (htmlpath, textpath))
                # 'replace' keeps one unencodable character from aborting
                # the whole batch.
                data = hf.gettext(htmlpath).encode('mbcs', 'replace')
                out = open(textpath, 'wb')
                try:
                    out.write(data)
                finally:
                    out.close()
    finally:
        # Release the browser even when a file fails to convert.
        del hf
import sys, os, re, codecs
import time
import win32com.client
class htmlfile:
    '''Read the title and visible text of local HTML files through IE.

    One "InternetExplorer.Application" COM object is created per instance
    and reused for every file.  The most recently loaded document is
    cached, so calling gettitle() and then gettext() for the same file
    navigates only once.
    '''

    # READYSTATE_COMPLETE in the Internet Explorer automation model.
    _READYSTATE_COMPLETE = 4

    def __init__(self):
        # Define every attribute before Dispatch so that __del__ stays safe
        # even when creating the COM object fails.
        self.__ie = None
        self.__filename = ''
        self.__document = None
        self.__ie = win32com.client.Dispatch('InternetExplorer.Application')
        self.__ie.Silent = True

    def __del__(self):
        # Quit the browser instance, if one was actually created.
        ie = getattr(self, '_htmlfile__ie', None)
        if ie is not None:
            ie.Quit()

    def __getdocument(self, filename):
        '''Return the loaded document for filename, navigating if needed.'''
        filename = os.path.abspath(filename)
        if self.__filename != filename:
            # Drop the cache first: if navigation raises, the next call
            # must retry instead of returning the previous document.
            self.__filename = ''
            self.__document = None
            self.__ie.Navigate2(filename)
            # Wait until the browser reports the page as fully loaded.
            # The former Document.close() call was removed: it raised
            # "AttributeError: Document.close", and polling Document.Body
            # could touch the document before it existed.
            while (self.__ie.Busy or
                   self.__ie.ReadyState != self._READYSTATE_COMPLETE):
                time.sleep(0.1)
            self.__document = self.__ie.Document
            self.__filename = filename
        return self.__document

    def gettext(self, filename):
        '''Return the inner text of the body of the HTML file.'''
        return self.__getdocument(filename).Body.innerText

    def gettitle(self, filename):
        '''Return the title of the HTML file.'''
        return self.__getdocument(filename).title
def formattextpath(dir, htmlfile, htmltitle):
    ''' Format the text file path and return.

    The file name is built from the first 6 characters of htmltitle (with
    characters that are illegal in Windows file names replaced by "-"),
    an underscore, and the base name of htmlfile with a ".txt" extension.
    The result is placed inside dir.

    dir       -- directory that will contain the text file.
    htmlfile  -- name of the source html file; only its stem is used.
    htmltitle -- title of the html document; may be empty or None.
    '''
    fname = (htmltitle or '')[:6]
    # One character class covers every reserved character: \ / : * ? < > | "
    fname = re.sub(r'[\\/:*?<>|"]', '-', fname)
    fname = fname + '_' + os.path.splitext(htmlfile)[0] + '.txt'
    # Join with the dir argument, not with a loop variable of the caller.
    return os.path.join(dir, fname)
# Convert every .htm/.html file below the current directory to a text file
# that is written next to it.
if __name__ == '__main__':
    hf = htmlfile()
    try:
        for root, dirs, names in os.walk(u'.'):
            for name in names:
                # Require a real extension and ignore its case.
                if not name.lower().endswith(('.htm', '.html')):
                    continue
                htmlpath = os.path.join(root, name)
                textpath = formattextpath(root, name, hf.gettitle(htmlpath))
                print('%s -> %s' % (htmlpath, textpath))
                # 'replace' keeps one unencodable character from aborting
                # the whole batch.
                data = hf.gettext(htmlpath).encode('mbcs', 'replace')
                out = open(textpath, 'wb')
                try:
                    out.write(data)
                finally:
                    out.close()
    finally:
        # Release the browser even when a file fails to convert.
        del hf
对于我的简单任务,这就足够了。
有一个问题,如果有资源管理器打开着,运行这段代码会关闭资源管理器,并出错退出。比较奇怪,但应该不难解决,可能是IE控件的使用上还有问题。
self.__ie.Document.close()
File "C:/Python25/Lib/site-packages/win32com/client/dynamic.py", line 496, in __getattr__
raise AttributeError, "%s.%s" % (self._username_, attr)
AttributeError: Document.close