1.HTML解析(html.parser.HTMLParser)
①、自定义一个继承自html.parser.HTMLParser的类
②、重写相关方法
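def handle_starttag(self, tag, attrs) —— 遇到开始标签时调用;attrs 为该标签的属性 (键, 值) 列表
def handle_data(self, data) —— 遇到标签之间的文本内容时调用
def handle_endtag(self, tag) —— 遇到结束标签时调用
def handle_startendtag(self, tag, attrs) —— 遇到自闭合单标签(如 <img ... />)时调用;attrs 为属性 (键, 值) 列表
def handle_decl(self, decl) —— 遇到文档类型声明(如 <!DOCTYPE html>)时调用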
2.HTML解析使用
#!/usr/bin/python
# -*- coding: UTF-8 -*-
__author__ = 'apple'
import html.parser
import enum
import re
class HTMLParserType(enum.Enum):
    """Kinds of HTML content for which a preset tag configuration exists."""

    # Table rows and their cells.
    Table = 0
    # Anchor elements.
    Links = 1
    # Image elements.
    Images = 2
class zHTMLParser(html.parser.HTMLParser):
    """Collect the text and selected attributes of repeated "row" elements.

    Each time a row element ends, one entry is appended to ``results``.
    An entry is a list holding the whitespace-stripped text of every data
    tag seen since the previous row, followed (when ``attr_dict`` is not
    None) by a dict snapshot of the tracked attributes of the row's tag.

    Limitation: only the most recently opened tag is tracked, so text that
    sits inside another element nested in a data tag (for example
    ``<td><a>text</a></td>``) is not collected.
    """

    # Marks "attr_dict was omitted", which differs from an explicit None
    # (None disables the attribute snapshot altogether).
    _UNSET = object()
    # Compiled once; used to strip every whitespace character from text.
    _WHITESPACE = re.compile(r'\s')

    def __init__(self, row_tag=None, data_tag=None, attr_dict=_UNSET):
        """Create a parser.

        Args:
            row_tag: Tag name that delimits one result entry. A collection
                of tag names is also accepted. None matches no tag.
            data_tag: Tag names whose text is collected. Defaults to an
                empty list.
            attr_dict: Dict whose keys name the row-tag attributes to
                capture. Omitted means an empty dict (an empty snapshot is
                still appended to each entry); None means no snapshot.
        """
        html.parser.HTMLParser.__init__(self)
        # Text chunks that are skipped outright.
        self.exclude_list = zHTMLParser.Illegal_list()
        # Tags to watch for.
        self.row_tag = row_tag
        self.data_tag = [] if data_tag is None else data_tag
        if attr_dict is zHTMLParser._UNSET:
            attr_dict = {}
        elif isinstance(attr_dict, dict):
            # Work on a private copy so the caller's dict is never mutated.
            attr_dict = dict(attr_dict)
        self.attr_dict = attr_dict
        # Most recently opened tag.
        self._cur_tag = ''
        # Values collected for the row currently being parsed.
        self._items = []
        # Parsed entries, one list per completed row.
        self.results = []

    def start_parser(self, raw_data):
        """Feed ``raw_data`` to the parser and close it.

        Args:
            raw_data: An HTML string, a list of HTML strings (fed in
                order), or None (nothing is done).

        Raises:
            ValueError: If ``raw_data`` is neither None, a str nor a list.
        """
        if raw_data is None:
            return
        if isinstance(raw_data, str):
            self.feed(raw_data)
        elif isinstance(raw_data, list):
            for data in raw_data:
                self.feed(data)
        else:
            raise ValueError('HTMLParser原始数据必须是字符串/数组!')
        self.close()

    @classmethod
    def tag_by_type(cls, type):
        """Return the preset ``(row_tag, data_tag, attr_dict)`` for ``type``.

        A fresh list/dict is built on every call. Returns None when
        ``type`` is not one of the known ``HTMLParserType`` members.
        """
        if type == HTMLParserType.Table:
            return ('tr', ['th', 'td'], None)
        if type == HTMLParserType.Links:
            return ('a', [], {'href': None})
        if type == HTMLParserType.Images:
            return ('img', [], {'src': None})
        return None

    # ---- html.parser.HTMLParser overrides ----

    def handle_starttag(self, tag, attrs):
        """Remember the opened tag and capture row-tag attributes."""
        self._cur_tag = tag
        self.update_attr_dict(tag, attrs)

    def handle_data(self, data):
        """Collect text that belongs to a data tag, with whitespace removed."""
        if data in self.exclude_list:
            return
        if self._cur_tag in self.data_tag:
            data = self._WHITESPACE.sub('', data)
            if data == '':
                return
            self._items.append(data)

    def handle_endtag(self, tag):
        """Finish the current row when its closing tag is reached."""
        if self._is_row_tag(tag):
            self._finish_row()

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as ``<img ... />``.

        Attributes are captured as for a start tag. A self-closing row tag
        has no separate end tag, so its row is finished here; otherwise it
        would never reach ``results``.
        """
        self.update_attr_dict(tag, attrs)
        if self._is_row_tag(tag):
            self._finish_row()

    def handle_decl(self, decl):
        """Print the document type declaration."""
        print('描述信息:',decl)

    def update_attr_dict(self, tag, attrs):
        """Refresh the tracked attribute values from a row tag's attributes.

        Every key of ``self.attr_dict`` is set to the value found in
        ``attrs`` or to None when the tag does not carry that attribute.
        Nothing happens for non-row tags or when ``attr_dict`` is not a
        dict.
        """
        if not (isinstance(self.attr_dict, dict) and self._is_row_tag(tag)):
            return
        found = dict(attrs)
        for key in self.attr_dict:
            self.attr_dict[key] = found.get(key)

    def _is_row_tag(self, tag):
        """Return True when ``tag`` is (one of) the configured row tag(s)."""
        row_tag = self.row_tag
        if row_tag is None:
            return False
        if isinstance(row_tag, str):
            # Exact comparison: a substring test would let e.g. 'a' match
            # a row tag of 'table'.
            return tag == row_tag
        return tag in row_tag

    def _finish_row(self):
        """Append the collected row to ``results`` and reset per-row state."""
        if self.attr_dict is not None:
            # Snapshot, because attr_dict is overwritten by the next row.
            self._items.append(self.attr_dict.copy())
        self.results.append(self._items.copy())
        self._cur_tag = ''
        self._items.clear()

    @staticmethod
    def Illegal_list():
        """Return the text chunks that ``handle_data`` ignores outright."""
        # NOTE(review): both entries read as a single space in this listing;
        # one may originally have been another character or an HTML entity
        # that was lost -- confirm against the original source.
        return [' ', ' ']
3.结果
# Train timetable example: download the page, cut out the table rows and
# parse them with zHTMLParser.
request = zRequest.zHttpRequest.urlopen_opener('http://qq.ip138.com/train/anhui/HeFei.htm')
results = request.result('gb2312')
# print('response:', results, '\n', type(results))

# NOTE(review): the two regex literals below were missing from the original
# listing (their contents appear to have been stripped, leaving unterminated
# string literals). These patterns are a reconstruction -- confirm them
# against the page markup, in particular which table holds the timetable.
main_table = re.findall(re.compile(r"<table.*?</table>", re.S), results)
# HTML of each table row; empty when no table was found, so that
# main_table[0] cannot raise IndexError.
tr_ary = re.findall(re.compile(r"<tr.*?</tr>", re.S), main_table[0]) if main_table else []

# Parse the rows, capturing the 'style' and 'onmouseover' attributes of each
# <tr> instead of the preset attribute dict returned by tag_by_type.
row_tag,data_tag,attr_dict = zParseHelp.zHTMLParser.tag_by_type(zParseHelp.HTMLParserType.Table)
html_parser = zParseHelp.zHTMLParser(row_tag,data_tag,{'style':None,'onmouseover':None})
html_parser.start_parser(tr_ary)

# Print the timetable, one parsed row per line.
for item in html_parser.results:
    print(item)
print('\n\n')
#['车次', '列车类型', '始发站', '始发时间', '经过站', '经过站', '经过站', '终点站', '到达时间', {'style': "cursor:hand;'", 'onmouseover': None}]
#['普快', '杭州', '12:38', '合肥', '当天18:37', '18:57', '西安', '10:42', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '西安', '13:23', '合肥', '第2日04:58', '05:22', '杭州', '11:21', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '郑州', '23:52', '合肥', '第2日07:55', '08:08', '杭州', '14:06', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '杭州', '15:31', '合肥', '当天21:42', '21:58', '郑州', '07:54', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '合肥', '08:40', '合肥', '当天08:40', '08:40', '库尔勒', '05:11', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '库尔勒', '16:37', '合肥', '第4日14:03', '14:03', '合肥', '14:03', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '成都', '03:20', '合肥', '第2日02:20', '02:32', '上海', '13:16', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '上海', '13:32', '合肥', '当天23:59', '00:14', '成都', '00:25', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '成都', '16:10', '合肥', '第3日04:52', '05:04', '上海', '14:54', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '上海', '16:25', '合肥', '第2日02:05', '02:20', '成都', '15:43', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '信阳', '13:00', '合肥', '当天18:53', '19:07', '上海', '04:49', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '上海', '05:49', '合肥', '当天14:32', '14:49', '信阳', '20:09', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '东莞东', '01:20', '合肥', '第2日00:21', '00:40', '徐州', '08:25', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]