tags
- convert html entities (&name; and ref;) to Ascii.
'''
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
self.CSV = '' # The CSV data
self.CSVrow = '' # The current CSV row beeing constructed from HTML
self.inTD = 0 # Used to track if we are inside or outside a ... tag.
self.inTR = 0 # Used to track if we are inside or outside a
... tag.
self.re_multiplespaces = re.compile('\s+') # regular expression used to remove spaces in excess
self.rowCount = 0 # CSV output line counter.
def handle_starttag(self, tag, attrs):
if tag == 'tr': self.start_tr()
elif tag == 'td': self.start_td()
def handle_endtag(self, tag):
if tag == 'tr': self.end_tr()
elif tag == 'td': self.end_td()
def start_tr(self):
if self.inTR: self.end_tr() #
implies
self.inTR = 1
def end_tr(self):
if self.inTD: self.end_td() # implies
self.inTR = 0
if len(self.CSVrow) > 0:
self.CSV += self.CSVrow[:-1]
self.CSVrow = ''
self.CSV += '\n'
self.rowCount += 1
def start_td(self):
if not self.inTR: self.start_tr() # implies
self.CSVrow += '"'
self.inTD = 1
def end_td(self):
if self.inTD:
self.CSVrow += '",'
self.inTD = 0
def handle_data(self, data):
if self.inTD:
self.CSVrow += self.re_multiplespaces.sub(' ',data.replace('\t',' ').replace('\n','').replace('\r','').replace('"','""'))
def getCSV(self,purge=False):
''' Get output CSV.
If purge is true, getCSV() will return all remaining data,
even if or
are not properly closed.
(You would typically call getCSV with purge=True when you do not have
any more HTML to feed and you suspect dirty HTML (unclosed tags). '''
if purge and self.inTR: self.end_tr() # This will also end_td and append last CSV row to output CSV.
dataout = self.CSV[:]
self.CSV = ''
return dataout
if __name__ == "__main__":
try: # Put getopt in place for future usage.
opts, args = getopt.getopt(sys.argv[1:],None)
except getopt.GetoptError:
print usage(sys.argv[0]) # print help information and exit:
sys.exit(2)
if len(args) == 0:
print usage(sys.argv[0]) # print help information and exit:
sys.exit(2)
print programname
html_files = glob.glob(args[0])
for htmlfilename in html_files:
outputfilename = os.path.splitext(htmlfilename)[0]+'.csv'
parser = html2csv()
print 'Reading %s, writing %s...' % (htmlfilename, outputfilename)
try:
htmlfile = open(htmlfilename, 'rb')
csvfile = open( outputfilename, 'w+b')
data = htmlfile.read(8192)
while data:
parser.feed( data )
csvfile.write( parser.getCSV() )
sys.stdout.write('%d CSV rows written.\r' % parser.rowCount)
data = htmlfile.read(8192)
csvfile.write( parser.getCSV(True) )
csvfile.close()
htmlfile.close()
except:
print 'Error converting %s ' % htmlfilename
try: htmlfile.close()
except: pass
try: csvfile.close()
except: pass
print 'All done. '
希望本文所述对大家的Python程序设计有所帮助。
声明:本文原创发布php中文网,转载请注明出处,感谢您的尊重!如有疑问,请联系admin@php.cn处理
相关文章
相关视频
网友评论
文明上网理性发言,请遵守 新闻评论服务协议我要评论
立即提交
专题推荐独孤九贱-php全栈开发教程
全栈 100W+
主讲:Peter-Zhu 轻松幽默、简短易学,非常适合PHP学习入门
玉女心经-web前端开发教程
入门 50W+
主讲:灭绝师太 由浅入深、明快简洁,非常适合前端学习入门
天龙八部-实战开发教程
实战 80W+
主讲:西门大官人 思路清晰、严谨规范,适合有一定web编程基础学习
php中文网:公益在线php培训,帮助PHP学习者快速成长!
Copyright 2014-2021 https://www.php.cn/ All Rights Reserved | 苏ICP备2020058653号-1