"""Extract per-article fields from a locally saved HTML export.

The module strips a handful of presentational tags from the raw HTML and
then walks the remaining markup with ``MyHTMLParser``, which collects the
first text chunk found under a fixed chain of open tags for each table that
follows an ``<a name=...>`` anchor.
"""
import json
import re
# For python 3.x
from html.parser import HTMLParser

# Tag patterns removed by clean_content(). They are applied one after the
# other, in exactly this order, each over the output of the previous one.
_STRIP_PATTERNS = tuple(
    re.compile(pattern, re.I)
    for pattern in (
        '<font([^>]*)>|</font>',
        '<p>|</p>',
        '<br>',
        '<hr([^>]*)>',
        '<img([^>]*)>',
        '<body([^>]*)>',
        '<meta([^>]*)>',
        '<head>',
    )
)


def read_htmlFile(address):
    """Return the full text of the file at *address*.

    The file is opened in text mode with the platform default encoding and
    is closed before returning, even if reading fails.
    """
    with open(address, 'r') as html_file:
        return html_file.read()


def clean_content(content):
    """Return *content* with selected tags removed (case-insensitive).

    Removed: ``<font ...>``/``</font>``, ``<p>``/``</p>``, ``<br>``,
    ``<hr ...>``, ``<img ...>``, ``<body ...>``, ``<meta ...>`` and
    ``<head>``. Only the tags are dropped; the text between them is kept.
    The matching closing tags of ``<body>`` and ``<head>`` are left in place.
    """
    for pattern in _STRIP_PATTERNS:
        content = pattern.sub('', content)
    return content


def print_items(items):
    """Print *items*, then the backlog rows found in JSON payload items.

    Every item that starts with ``{"columns"`` is decoded as JSON; for each
    row in ``payload["rows"]`` whose second field is "Product Backlog Item"
    or "Bug", the third and fourth fields are printed.

    The rows are read from the *items* argument. The previous version read
    them from a module-level ``parser`` object, which only exists when this
    file is run as a script.
    """
    print(items)
    # Post-process the parsed data and print the matching rows.
    for item in items:
        if not item.startswith("{\"columns\""):
            continue
        payload = json.loads(item)
        for backlog in payload["payload"]["rows"]:
            if backlog[1] in ("Product Backlog Item", "Bug"):
                print(backlog[2], " Point: ", backlog[3])


# Subclass of HTMLParser that overrides its handler methods.
class MyHTMLParser(HTMLParser):
    """Collect text that sits under predefined chains of open tags.

    An ``<a>`` tag whose first attribute is ``name`` (with a value other than
    ``TOC``) starts a new record. Each following ``<table>`` advances
    ``index``; while a record is active, the stack of open tags is compared
    with ``aim_state[index]`` on every start tag, and on a match the next
    text chunk is appended to ``data``.
    """

    def __init__(self):
        """Initialise the parser state and the tag-chain patterns."""
        super().__init__()
        self.data = []           # collected output strings
        self.c_items = 5         # table count at which a record is finished
        self.flag_begin = 0      # 1 while a record is being processed
        self.flag_finish = 0     # 1 once the c_items-th table was reached
        self.flag_get_data = 0   # 1 means "store the next text chunk"
        self.s_input_tag = []    # stack of currently open tags
        self.cnt = 0             # number of records started so far
        self.index = 0           # tables seen since the record started
        self.aim_state = [
            # Slot 0 is reserved for future use: every <table> increments
            # index, so the real patterns are counted from 1.
            [],
            ['html', 'table', 'tr', 'td', 'nowrap'],  # info
            ['html', 'table', 'tr', 'td', 'b'],       # title
            ['html', 'table', 'tr', 'td'],            # text
            ['html', 'table', 'tr', 'td'],            # No.
        ]

    def handle_starttag(self, tag, attrs):
        """Update the record state and the tag stack for a start tag."""
        # Only the first attribute is inspected; the `attrs` truthiness test
        # keeps an attribute-less <a> from raising IndexError.
        if tag == 'a' and attrs:
            if attrs[0][0] == 'name' and attrs[0][1] != 'TOC':
                self.cnt += 1
                self.data.append('begin:\n')
                # Begin processing the next record.
                self.flag_begin = 1
                self.flag_finish = 0
                self.index = 0
        if tag == 'table':
            # Each field of a record is stored in its own table.
            self.index += 1
            if self.index == self.c_items:
                self.flag_begin = 0
                self.flag_finish = 1
        # Every start tag re-arms the capture flag; check_state() may set it.
        self.flag_get_data = 0
        self.s_input_tag.append(tag)
        if self.flag_begin == 1:
            self.check_state()

    def handle_endtag(self, tag):
        """Pop *tag* from the stack if it is the innermost open tag.

        End tags that do not match the top of the stack are ignored, and so
        are end tags that arrive while the stack is empty.
        """
        if self.s_input_tag and self.s_input_tag[-1] == tag:
            self.s_input_tag.pop()

    def handle_data(self, data):
        """Store *data* (newlines removed) if a capture is pending."""
        if self.flag_get_data == 1:
            text = str(data).replace('\n', '')
            self.data.append(text + '\n')
            # Move past 1 so later chunks under the same tag are skipped.
            self.flag_get_data += 1

    # Decide whether to capture data by comparing the stack of open tags
    # with the predefined pattern for the current table.
    def check_state(self):
        """Arm the capture flag when the tag stack matches the pattern."""
        if self.flag_finish != 0:
            return
        # Guard against an index beyond the pattern table.
        if self.index >= len(self.aim_state):
            return
        if self.s_input_tag == self.aim_state[self.index]:
            self.flag_get_data += 1

    def get_data(self):
        """Return the list of collected strings."""
        return self.data


if __name__ == '__main__':
    # Read the HTML file from the local disk.
    content = read_htmlFile(r"/home/raleve/PycharmProjects/xml_xls/bc/bc_3.html")
    content = clean_content(content)
    # The context manager closes the output file even if parsing fails.
    with open(r'/home/raleve/PycharmProjects/xml_xls/bc/output1.txt', 'w') as outfile:
        parser = MyHTMLParser()  # make a subclass instance
        # Feed the HTML text to the parser.
        parser.feed(content)
        items = parser.get_data()
        for item in items:
            print(item, end='')
            outfile.write(item)
        print('finish')
# step 1-1
# Batch driver: parse bc_1.html, bc_2.html, ... from address_dir until the
# next numbered file is not readable, and append every collected item to a
# single output.txt in the same directory.
import parse_html as ph
import os

address_dir = r'/home/raleve/PycharmProjects/xml_xls/bc/'
address_filename = r'bc_'
address_filetype = r'.html'
address_fileid = 1


def _build_address(file_id):
    """Return the path of the numbered input file *file_id*."""
    return address_dir + address_filename + str(file_id) + address_filetype


htmlFile_address = _build_address(address_fileid)

# The context manager closes output.txt even if one of the inputs fails to
# read or parse part-way through the batch.
with open(address_dir + 'output.txt', 'w') as outfile:
    while os.access(htmlFile_address, mode=os.R_OK):
        print('Processing', htmlFile_address, '\n')
        content = ph.read_htmlFile(htmlFile_address)
        content = ph.clean_content(content)

        parser = ph.MyHTMLParser()  # make a subclass instance
        # Feed the HTML text to the parser.
        parser.feed(content)
        items = parser.get_data()
        for item in items:
            print(item, end='')
            outfile.write(item)

        # Build the address of the next numbered file.
        address_fileid += 1
        htmlFile_address = _build_address(address_fileid)