17-Python 3.x urllib.request使用(2)

最新推荐文章于 2024-07-12 16:16:27 发布

chitian6393

最新推荐文章于 2024-07-12 16:16:27 发布

阅读量108

点赞数

文章标签： python

原文链接：https://my.oschina.net/CoderW/blog/1068767

版权

1.HTML解析(html.parser.HTMLParser)

①、自定义一个继承自html.parser.HTMLParser的类

②、重写相关方法

def handle_starttag(self, tag, attrs):开始标签+属性键值对

def handle_data(self, data):标签中的数据内容

def handle_endtag(self, tag):结束标签

def handle_startendtag(self, tag, attrs):自闭合的单标签(如 <br/>)+属性键值对

def handle_decl(self, decl):文档类型声明(如 <!DOCTYPE html>)

2.HTML解析使用

#!/usr/bin/python
# -*- coding: UTF-8 -*-

__author__ = 'apple'

import html.parser
import enum
import re

class HTMLParserType(enum.Enum):
    """Kinds of HTML content that zHTMLParser.tag_by_type has presets for."""

    # Table rows: <tr> elements with <th>/<td> cell text.
    Table = 0
    # Hyperlinks: <a> elements and their href attribute.
    Links = 1
    # Images: <img> elements and their src attribute.
    Images = 2

class zHTMLParser(html.parser.HTMLParser):
    """Collect text and attributes from HTML, one result list per "row" tag.

    Every time an element named ``row_tag`` is completed, a list is appended
    to ``results``. It contains the whitespace-stripped text chunks seen while
    the most recently opened tag was one of ``data_tag``, followed (unless
    ``attr_dict`` is None) by a dict holding the tracked attributes of the
    row's start tag.

    Text is attributed to the most recently *opened* tag only; there is no
    tag stack, so text following a nested child element is not attributed
    back to the parent.
    """

    # Marks "attr_dict not supplied"; None cannot be used because an explicit
    # None means "do not append an attribute dict to the rows".
    _UNSET = object()
    # Precompiled pattern used to strip every whitespace character from text.
    _WHITESPACE = re.compile(r'\s')
    # HTML void elements: they never get an end tag, so when one of them is
    # the row tag the row has to be completed as soon as the tag is seen.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'source', 'track', 'wbr',
    ))

    def __init__(self, row_tag=None, data_tag=None, attr_dict=_UNSET):
        """Create a parser.

        Args:
            row_tag: Name of the tag that delimits one result row.
            data_tag: Tag names whose text is collected. Omitted or None
                means no text is collected.
            attr_dict: Dict whose keys are the attribute names to capture
                from each row start tag. Omitted means an empty dict (an
                empty dict is still appended to every row); an explicit
                None means nothing is appended. A dict is copied so the
                caller's object is never mutated.
        """
        super().__init__()
        # Text chunks that are skipped outright.
        self.exclude_list = zHTMLParser.Illegal_list()
        # Tags to watch for.
        self.row_tag = row_tag
        self.data_tag = [] if data_tag is None else data_tag
        if attr_dict is self._UNSET:
            attr_dict = {}
        elif isinstance(attr_dict, dict):
            attr_dict = dict(attr_dict)
        self.attr_dict = attr_dict
        # Most recently opened tag and the text collected for the open row.
        self._cur_tag = ''
        self._items = []
        # Parsed rows.
        self.results = []

    def start_parser(self, raw_data):
        """Feed ``raw_data`` to the parser and close it.

        Args:
            raw_data: An HTML string, or a list/tuple of HTML strings that
                are fed in order. None is a no-op.

        Raises:
            ValueError: If ``raw_data`` is of any other type.
        """
        if raw_data is None:
            return
        if isinstance(raw_data, str):
            chunks = [raw_data]
        elif isinstance(raw_data, (list, tuple)):
            chunks = raw_data
        else:
            raise ValueError('HTMLParser原始数据必须是字符串/数组！')
        for chunk in chunks:
            self.feed(chunk)
        self.close()

    @classmethod
    def tag_by_type(cls, type):
        """Return the ``(row_tag, data_tag, attr_dict)`` preset for ``type``.

        Fresh list/dict objects are built on every call. For a value that is
        not one of the three HTMLParserType members, None is returned.
        """
        if type == HTMLParserType.Table:
            return ('tr', ['th', 'td'], None)
        elif type == HTMLParserType.Links:
            return ('a', [], {'href': None})
        elif type == HTMLParserType.Images:
            return ('img', [], {'src': None})
        return None

    # --- html.parser.HTMLParser overrides ---

    def handle_starttag(self, tag, attrs):
        """Remember the opened tag and capture tracked row attributes."""
        self._cur_tag = tag
        self.update_attr_dict(tag, attrs)
        # A void row tag (e.g. <img>) will never produce an end tag.
        if tag == self.row_tag and tag in self._VOID_TAGS:
            self._finish_row()

    def handle_data(self, data):
        """Collect text that belongs to one of the data tags."""
        if data in self.exclude_list:
            return
        if self._cur_tag in self.data_tag:
            text = self._WHITESPACE.sub('', data)
            if text:
                self._items.append(text)

    def handle_endtag(self, tag):
        """Complete the current row when the row tag is closed."""
        # Exact comparison: a substring test would let '</a>' close a
        # 'table' row and would raise TypeError when row_tag is None.
        # Void row tags were already completed in handle_starttag.
        if tag == self.row_tag and tag not in self._VOID_TAGS:
            self._finish_row()

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as ``<img ... />``."""
        self.update_attr_dict(tag, attrs)
        # No separate end-tag callback follows, so complete the row here.
        if tag == self.row_tag:
            self._finish_row()

    def handle_decl(self, decl):
        """Print the document type declaration."""
        print('描述信息：', decl)

    def update_attr_dict(self, tag, attrs):
        """Refresh the tracked attribute values from a row start tag.

        Only acts when ``tag`` equals ``row_tag`` and ``attr_dict`` is a
        dict. Each tracked key takes the tag's value for that attribute, or
        None when the tag does not carry it.
        """
        if tag != self.row_tag or not isinstance(self.attr_dict, dict):
            return
        found = dict(attrs)
        for key in self.attr_dict:
            self.attr_dict[key] = found.get(key)

    def _finish_row(self):
        """Append the collected row to ``results`` and reset row state."""
        row = list(self._items)
        if self.attr_dict is not None:
            row.append(self.attr_dict.copy())
        self.results.append(row)
        self._cur_tag = ''
        self._items.clear()

    @staticmethod
    def Illegal_list():
        """Return the text chunks that handle_data skips outright."""
        return [' ', '&nbsp']

3.结果

# Train timetable demo: download the page, cut out the timetable table,
# and parse its rows with zHTMLParser.
request = zRequest.zHttpRequest.urlopen_opener(
    'http://qq.ip138.com/train/anhui/HeFei.htm'
)
results = request.result('gb2312')
# Debug aid: print('请求结果:', results, '\n', type(results))

# The <table> whose id is "checilist".
main_table = re.compile(
    r"""<table[\w\W]+?id="checilist"[\w\W]+?</table>"""
).findall(results)
# HTML source of every row of the first matching table.
tr_ary = re.compile(r"<tr.+?</tr>").findall(main_table[0])

# Use the table preset for the row/data tags, but track the row's
# style and onmouseover attributes instead of the preset's attr_dict.
row_tag, data_tag, attr_dict = zParseHelp.zHTMLParser.tag_by_type(
    zParseHelp.HTMLParserType.Table
)
html_parser = zParseHelp.zHTMLParser(
    row_tag,
    data_tag,
    {'style': None, 'onmouseover': None},
)
html_parser.start_parser(tr_ary)

# Print the timetable, one parsed row per line.
for item in html_parser.results:
    print(item)
print('\n\n')


#['车次', '列车类型', '始发站', '始发时间', '经过站', '经过站', '经过站', '终点站', '到达时间', {'style': "cursor:hand;'", 'onmouseover': None}]
#['普快', '杭州', '12:38', '合肥', '当天18:37', '18:57', '西安', '10:42', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '西安', '13:23', '合肥', '第2日04:58', '05:22', '杭州', '11:21', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '郑州', '23:52', '合肥', '第2日07:55', '08:08', '杭州', '14:06', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '杭州', '15:31', '合肥', '当天21:42', '21:58', '郑州', '07:54', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '合肥', '08:40', '合肥', '当天08:40', '08:40', '库尔勒', '05:11', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '库尔勒', '16:37', '合肥', '第4日14:03', '14:03', '合肥', '14:03', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '成都', '03:20', '合肥', '第2日02:20', '02:32', '上海', '13:16', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '上海', '13:32', '合肥', '当天23:59', '00:14', '成都', '00:25', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '成都', '16:10', '合肥', '第3日04:52', '05:04', '上海', '14:54', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '上海', '16:25', '合肥', '第2日02:05', '02:20', '成都', '15:43', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '信阳', '13:00', '合肥', '当天18:53', '19:07', '上海', '04:49', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '上海', '05:49', '合肥', '当天14:32', '14:49', '信阳', '20:09', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]
#['普快', '东莞东', '01:20', '合肥', '第2日00:21', '00:40', '徐州', '08:25', {'style': None, 'onmouseover': "this.bgColor='#E6F2E7';"}]

转载于:https://my.oschina.net/CoderW/blog/1068767

chitian6393

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
17-Python 3.x urllib.request使用(2)

1.HTML解析(html.parser.HTMLParser) ①、自定义一个继承自html.parser.HTMLParser的类 ②、重写相关方法 def handle_starttag(self, tag, attrs):开始标签+属性键值对 ...
复制链接

扫一扫