#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
from urllib import request
import gzip
import zlib
import re
from enum import Enum
class HTMLNodeTypes(Enum):
    """Node kinds used by the parser tree.

    The numeric values follow the W3C DOM ``Node.nodeType`` constants,
    which is why 5-7 are skipped.
    """

    Element = 1
    Attribute = 2
    Text = 3
    CDATASection = 4
    Comment = 8
    Document = 9
    DocumentType = 10
# Void elements (no content, no end tag). Anchored on both ends so that
# ``match()`` is an exact test ("colgroup" must not pass because of "col"),
# and case-insensitive so that "<BR>" is recognised as well.
PTN_SINGLE_TAG = re.compile(r"^(area|base|br|col|command|embed|img|hr|keygen|link|meta|param|source|track|input|wbr)$", re.I)
# Elements whose end tag may be omitted in HTML5. Anchored for the same
# reason: "pre" must not pass because of "p", nor "track" because of "tr".
PTN_OMIT_TAG = re.compile(r"^(li|dt|dd|p|rt|rp|optgroup|option|colgroup|thead|tbody|tfoot|tr|td|th)$", re.I)
# One markup token: DOCTYPE | comment | whole <script> element | open/close tag.
# Kept as a plain string because callers pass their own flags to ``re``.
#  - <script> may carry no attributes at all, hence ``[^<>]*``.
#  - An attribute may be a bare name (``disabled``) or have an empty quoted
#    value (``alt=""``); quoted values may contain "<" and ">".
PTN_HTML_TAG = r"<\!DOCTYPE\s+(?P<type>HTML|XHTML|XML|WML)(?P<define>\s+PUBLIC\s+\"[^\"]+\"[\s\r\n]+\"[^\"]+\")?\s*>|<\!\-\-(?P<comment>[\s\S]+?)\-\->|<script(?P<scriptAttr>[^<>]*)>(?P<scriptContent>[\s\S]*?)<\/script>|<(?P<closeFlag>\/)?(?P<tag>[\w\-:]+)(?P<attr>(?:\s+[\w\-:]+(?:=(?:\'[^\']*\'|\"[^\"]*\"|[^\s<>]+))?)*)?\s*\/?\s*>"
# One attribute inside a tag: name=<quoted> | name=<unquoted> | bare name.
# Groups: (1 name, 2 quote, 3 value) | (4 name, 5 value) | (6 name).
PTN_HTML_ATTR = r"\s+([\w\-\:]+)=([\'\"])([\s\S]*?)\2|\s+([\w\-\:]+)=([^\s<>\'\"\=]+)|\s+([\w\-\:]+)"
# One simple selector: tag, .class / #id, :handle, [attr^='value'] - each
# part optional. ``handle`` is captured but no code in this file uses it.
PTN_SELECTOR = r"(?P<tag>[\w\-]+)?((?P<flag>[\.\#])(?P<key>[\w\-]+))?(\:(?P<handle>[\w\-]+))?(\[(?P<attr>[\w\-]+)(?P<compare>[\^\$\*]?)='(?P<value>[^']+)'\])?"
class HTMLAttribute:
    """A single attribute of an element.

    ``value`` is ``None`` for an attribute written without a value.
    """

    nodeType = HTMLNodeTypes.Attribute

    def __init__(self, name, value):
        self.name = name
        self.value = value

    def __repr__(self):
        return "%s(%r, %r)" % (type(self).__name__, self.name, self.value)
class HTMLTextNode:
    """Character data between tags.

    ``length`` is the number of characters in ``data`` at construction time.
    """

    nodeType = HTMLNodeTypes.Text

    def __init__(self, data):
        self.data = data
        self.length = len(data)
        # Defaults so a node that has not been appended or created through a
        # document can still be inspected without AttributeError.
        self.parentNode = None
        self.ownerDocument = None
        self.deep = 0

    def __repr__(self):
        return "%s(%r)" % (type(self).__name__, self.data)
class HTMLCDATASection(HTMLTextNode):
    """Text node variant serialised as ``<![CDATA[...]]>``."""

    nodeType = HTMLNodeTypes.CDATASection
class HTMLComment(HTMLTextNode):
    """Text node variant serialised as ``<!--...-->``."""

    nodeType = HTMLNodeTypes.Comment
class HTMLElement:
    """Element node with DOM-like children, attributes and lookup helpers.

    Attributes:
        all: every node that was appended to this element or to one of its
            descendants, in append order. Nodes are registered when
            ``append`` runs, so a subtree must be attached to its parent
            before its own children are added (which is how the parser
            builds the tree).
        attributes: ``HTMLAttribute`` objects keyed by lower-cased name.
        childNodes: direct children in document order.
        tag: tag name as written, or ``None`` for a document root.
        deep: nesting depth assigned by the parent's ``append``.
        single: ``True`` for void elements such as ``br`` or ``img``.
    """

    nodeType = HTMLNodeTypes.Element

    def __init__(self, tag=None):
        self.all = []
        self.attributes = {}
        self.childNodes = []
        self.tag = tag
        self.deep = 0
        # Exact, case-insensitive test: a prefix match would treat
        # "colgroup" as the void element "col".
        self.single = tag is not None and PTN_SINGLE_TAG.fullmatch(tag.lower()) is not None
        self.parentNode = None
        self.ownerDocument = None
        self.firstChild = None
        self.lastChild = None

    def set_attr(self, name, value):
        """Create or replace attribute *name* and return the attribute node."""
        attr = HTMLAttribute(name, value)
        self.attributes[name.lower()] = attr
        return attr

    def get_attr(self, name):
        """Return the value of attribute *name* (case-insensitive) or ``None``."""
        attr = self.attributes.get(name.lower())
        return None if attr is None else attr.value

    def append(self, node):
        """Attach *node* as the last child and return it.

        The node is also recorded in ``all`` of this element and of every
        ancestor, which is what the ``get_element(s)_by_*`` lookups search.
        """
        node.parentNode = self
        node.deep = self.deep + 1
        if not self.childNodes:
            self.firstChild = node
        self.childNodes.append(node)
        self.lastChild = node
        ancestor = self
        while ancestor is not None:
            ancestor.all.append(node)
            ancestor = ancestor.parentNode
        return node

    @property
    def html(self):
        """Serialised children (like ``innerHTML``); ``None`` for void elements.

        Raises:
            Exception: if a child has a node type that cannot be serialised.
        """
        if self.single:
            return None
        parts = []
        for child in self.childNodes:
            kind = child.nodeType
            if kind == HTMLNodeTypes.Text:
                parts.append(child.data)
            elif kind == HTMLNodeTypes.Comment:
                parts.append("<!--%s-->" % child.data)
            elif kind == HTMLNodeTypes.CDATASection:
                parts.append("<![CDATA[%s]]>" % child.data)
            elif kind == HTMLNodeTypes.DocumentType:
                parts.append("<!DOCTYPE %s%s>" % (child.type, child.define or ""))
            elif kind == HTMLNodeTypes.Element:
                parts.append(child.htmlify)
            else:
                raise Exception("html(Unknown node type)", 0)
        return "".join(parts)

    @property
    def htmlify(self):
        """Serialised element including its own tag (like ``outerHTML``).

        End tags that were omitted in the source are written out here.
        """
        if self.tag is None:
            return self.html
        pieces = []
        for attr in self.attributes.values():
            if attr.value is not None:
                pieces.append(' %s="%s"' % (attr.name, attr.value))
            else:
                pieces.append(' %s' % attr.name)
        # Every piece already starts with a space, so join without a separator.
        attrs = "".join(pieces)
        if self.single:
            return "<%s%s/>" % (self.tag, attrs)
        return "<%(tag)s%(attr)s>%(html)s</%(tag)s>" % {"tag": self.tag, "attr": attrs, "html": self.html}

    @property
    def text(self):
        """Concatenated text of all descendant text nodes, in document order."""
        # Recurse through direct children only; ``all`` already contains the
        # nested nodes, so walking it *and* recursing would repeat text.
        parts = []
        for child in self.childNodes:
            if child.nodeType == HTMLNodeTypes.Text:
                parts.append(child.data)
            elif child.nodeType == HTMLNodeTypes.Element:
                parts.append(child.text)
        return "".join(parts)

    @staticmethod
    def _exact_pattern(value):
        """Return *value* if already compiled, else a case-insensitive ``^value$``."""
        if isinstance(value, re.Pattern):
            return value
        return re.compile(r"^%s$" % value, re.I)

    @staticmethod
    def _class_pattern(value):
        """Return *value* if already compiled, else a whole-word class matcher."""
        if isinstance(value, re.Pattern):
            return value
        return re.compile(r"(^|\s+)%s($|\s+)" % value, re.I)

    @staticmethod
    def _get_elements_by_attr(nodes, name, value):
        """Return the elements in *nodes* whose attribute *name* matches *value*.

        *value* is a compiled pattern, or a string that is compiled as a
        case-insensitive ``^value$`` regular expression.
        """
        ptn = HTMLElement._exact_pattern(value)
        found = []
        for n in nodes:
            if n.nodeType != HTMLNodeTypes.Element:
                continue
            actual = n.get_attr(name)
            if actual is not None and ptn.search(actual) is not None:
                found.append(n)
        return found

    @staticmethod
    def _get_element_by_attr(nodes, name, value):
        """Return the first element in *nodes* matching, or ``None``."""
        ptn = HTMLElement._exact_pattern(value)
        for n in nodes:
            if n.nodeType != HTMLNodeTypes.Element:
                continue
            actual = n.get_attr(name)
            if actual is not None and ptn.search(actual) is not None:
                return n
        return None

    @staticmethod
    def _get_elements_by_class(nodes, value):
        """Return the elements in *nodes* that carry CSS class *value*."""
        return HTMLElement._get_elements_by_attr(nodes, "class", HTMLElement._class_pattern(value))

    @staticmethod
    def __get_element_by_class(nodes, value):
        """Return the first element in *nodes* carrying CSS class *value*."""
        return HTMLElement._get_element_by_attr(nodes, "class", HTMLElement._class_pattern(value))

    @staticmethod
    def _get_elements_by_tag(nodes, value):
        """Return the elements in *nodes* whose tag name matches *value*."""
        ptn = HTMLElement._exact_pattern(value)
        return [
            n for n in nodes
            if n.nodeType == HTMLNodeTypes.Element
            and n.tag is not None
            and ptn.search(n.tag) is not None
        ]

    # --- lookups over all descendants -------------------------------------

    def get_elements_by_attr(self, name, value):
        return HTMLElement._get_elements_by_attr(self.all, name, value)

    def get_element_by_attr(self, name, value):
        return HTMLElement._get_element_by_attr(self.all, name, value)

    def get_elements_by_id(self, value):
        return HTMLElement._get_elements_by_attr(self.all, "id", value)

    def get_element_by_id(self, value):
        return HTMLElement._get_element_by_attr(self.all, "id", value)

    def get_elements_by_class(self, value):
        return HTMLElement._get_elements_by_class(self.all, value)

    def get_element_by_class(self, value):
        return HTMLElement.__get_element_by_class(self.all, value)

    def get_elements_by_tag(self, value):
        return HTMLElement._get_elements_by_tag(self.all, value)

    # --- lookups over direct children only --------------------------------

    def get_nodes_by_attr(self, name, value):
        return HTMLElement._get_elements_by_attr(self.childNodes, name, value)

    def get_node_by_attr(self, name, value):
        return HTMLElement._get_element_by_attr(self.childNodes, name, value)

    def get_nodes_by_id(self, value):
        return HTMLElement._get_elements_by_attr(self.childNodes, "id", value)

    def get_node_by_id(self, value):
        return HTMLElement._get_element_by_attr(self.childNodes, "id", value)

    def get_nodes_by_class(self, value):
        return HTMLElement._get_elements_by_class(self.childNodes, value)

    def get_node_by_class(self, value):
        return HTMLElement.__get_element_by_class(self.childNodes, value)

    def get_nodes_by_tag(self, value):
        return HTMLElement._get_elements_by_tag(self.childNodes, value)

    def query(self, selectors):
        """Return the elements matched by a comma-separated selector list.

        Each selector is a whitespace-separated chain of simple selectors
        (``tag``, ``.class``, ``#id``, ``[attr='v']``, ``[attr^='v']``,
        ``[attr$='v']``, ``[attr*='v']`` and combinations). A standalone
        ``>`` token restricts the next step to direct children; otherwise
        all descendants are searched. Because selectors are split on commas
        and whitespace, attribute values cannot contain either.

        An empty selector yields this element itself. Each element appears
        at most once in the result.
        """
        found = []
        seen = set()
        for selector in re.split(",", selectors):
            scope = None
            descend = True
            for token in re.split(r"\s+", selector.strip()):
                if ">" == token:
                    descend = False
                else:
                    scope = self.__query(scope, token, descend)
                    descend = True
            # ``scope`` stays None when the selector was a lone ">".
            for node in scope or ():
                if id(node) not in seen:
                    seen.add(id(node))
                    found.append(node)
        return found

    def __query(self, nodes, selector, _all):
        """Apply one simple selector to *nodes* (or to this element).

        Args:
            nodes: elements produced by the previous step, or ``None`` for
                the first step.
            selector: a single simple selector without whitespace.
            _all: search all descendants when true, direct children otherwise.
        """
        scope = [self] if nodes is None else nodes
        parts = re.match(PTN_SELECTOR, selector, re.I)
        if parts is None:
            return list(scope)
        tag, flag, key, attr = parts["tag"], parts["flag"], parts["key"], parts["attr"]
        if tag is None and flag is None and attr is None:
            # Nothing to filter on: the step leaves the scope unchanged.
            return list(scope)

        pool = "all" if _all else "childNodes"
        matched = []
        for n in scope:
            matched.extend(getattr(n, pool))
        # Each helper below also drops non-element nodes.
        if tag is not None:
            matched = HTMLElement._get_elements_by_tag(matched, tag)
        if "." == flag:
            matched = HTMLElement._get_elements_by_class(matched, key)
        elif "#" == flag:
            matched = HTMLElement._get_elements_by_attr(matched, "id", key)
        if attr is not None:
            matched = HTMLElement._get_elements_by_attr(
                matched, attr, self.__compare(parts["compare"], parts["value"])
            )
        return matched

    @staticmethod
    def __compare(cp, value):
        """Build the pattern for an attribute test.

        ``^`` = starts with, ``$`` = ends with, ``*`` = contains, and no
        operator = equals. The value is matched literally, ignoring case.
        """
        literal = re.escape(value)
        if "^" == cp:
            pattern = "^" + literal
        elif "$" == cp:
            pattern = literal + "$"
        elif "*" == cp:
            pattern = literal
        else:
            pattern = "^" + literal + "$"
        return re.compile(pattern, re.I)
class PYDocumentType:
    """A ``<!DOCTYPE ...>`` declaration.

    Attributes:
        type: the document type name captured from the declaration.
        define: the remainder of the declaration (the ``PUBLIC ...`` part,
            including its leading whitespace), or ``None`` when absent.
    """

    nodeType = HTMLNodeTypes.DocumentType

    def __init__(self, _type, define):
        self.type = _type
        self.define = define

    def __repr__(self):
        return "%s(%r, %r)" % (type(self).__name__, self.type, self.define)
class PYDocument:
    """Parsed document: a root element plus node factories and loaders.

    Attributes:
        images: every ``img`` element, in document order.
        links: every ``a`` and ``area`` element, in document order.
        documentElement: tag-less root element that holds the parsed tree.
    """

    nodeType = HTMLNodeTypes.Document

    # Start tag -> open elements it implicitly closes (HTML5 optional end
    # tags). Only these pairs close each other; closing *any* omittable
    # parent would, for example, move <tr> out of <tbody> or <p> out of <li>.
    _IMPLIED_END = {
        "li": ("li", "p"),
        "dt": ("dt", "dd", "p"),
        "dd": ("dt", "dd", "p"),
        "p": ("p",),
        "rt": ("rt", "rp"),
        "rp": ("rt", "rp"),
        "option": ("option",),
        "optgroup": ("optgroup", "option"),
        "colgroup": ("colgroup",),
        "thead": ("colgroup", "thead", "tbody", "tfoot", "tr", "td", "th", "p"),
        "tbody": ("colgroup", "thead", "tbody", "tfoot", "tr", "td", "th", "p"),
        "tfoot": ("colgroup", "thead", "tbody", "tfoot", "tr", "td", "th", "p"),
        "tr": ("colgroup", "tr", "td", "th", "p"),
        "td": ("td", "th", "p"),
        "th": ("td", "th", "p"),
    }

    def __init__(self):
        self.images = []
        self.links = []
        self.documentElement = self.create_element()

    def create_attr(self, name, value):
        """Return a new attribute node owned by this document."""
        node = HTMLAttribute(name, value)
        node.ownerDocument = self
        return node

    def create_text(self, data):
        """Return a new text node owned by this document."""
        node = HTMLTextNode(data)
        node.ownerDocument = self
        return node

    def create_comment(self, data):
        """Return a new comment node owned by this document."""
        node = HTMLComment(data)
        node.ownerDocument = self
        return node

    def create_cdata_section(self, data):
        """Return a new CDATA node owned by this document."""
        node = HTMLCDATASection(data)
        node.ownerDocument = self
        return node

    def create_document_type(self, _type, define):
        """Return a new DOCTYPE node owned by this document."""
        node = PYDocumentType(_type, define)
        node.ownerDocument = self
        return node

    def create_element(self, tag=None):
        """Return a new element owned by this document (``tag=None`` = root)."""
        node = HTMLElement(tag)
        node.ownerDocument = self
        return node

    @property
    def html(self):
        return self.documentElement.html

    @property
    def htmlify(self):
        return self.documentElement.htmlify

    @property
    def text(self):
        return self.documentElement.text

    def query(self, selector):
        """Run a selector query against the whole document."""
        return self.documentElement.query(selector)

    @staticmethod
    def load(url, callback=None, encoding="utf-8"):
        """Fetch and parse *url*.

        An ``http://`` or ``https://`` URL is downloaded; anything else is
        read as a local file using *encoding*. Returns whatever ``parse``
        returns.
        """
        if re.match(r"(http|https)://.+", url, re.I) is None:
            contents = PYDocument.get_text_contents(url, encoding)
        else:
            contents = PYDocument.get_http_contents(url)
        return PYDocument.parse(contents, callback)

    @staticmethod
    def get_http_contents(url):
        """Download *url* and return the body as decoded text."""
        req = request.Request(url)
        req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
        req.add_header("Accept-Encoding", "gzip, deflate")
        with request.urlopen(req) as res:
            data = res.read()
            content_encoding = res.getheader("Content-Encoding")
            content_type = res.getheader("Content-Type")
        return PYDocument._decode_body(data, content_encoding, content_type)

    @staticmethod
    def _decode_body(data, content_encoding, content_type):
        """Undo the transfer compression of *data* and decode it to ``str``."""
        coding = (content_encoding or "").strip().lower()
        if coding in ("gzip", "x-gzip"):
            raw = gzip.decompress(data)
        elif coding == "deflate":
            # Servers send either a raw deflate stream or a zlib-wrapped one.
            try:
                raw = zlib.decompress(data, -zlib.MAX_WBITS)
            except zlib.error:
                raw = zlib.decompress(data)
        else:
            raw = data
        charset = PYDocument._detect_charset(raw, content_type)
        try:
            return raw.decode(charset, "ignore")
        except LookupError:
            # The page announced a codec name Python does not know.
            return raw.decode("utf-8", "ignore")

    @staticmethod
    def _detect_charset(raw, content_type):
        """Pick a charset from the Content-Type header, else a <meta> tag, else utf-8."""
        if content_type:
            mt = re.search(r"charset\s*=\s*[\"']?([\w\-]+)", content_type, re.I)
            if mt is not None:
                return mt.group(1)
        # Lossy decode only to find the ASCII-compatible <meta> declaration.
        probe = raw.decode("utf-8", "ignore")
        mt = re.search(r"<meta\s+charset=([\'\"])([^\'\"<>]+)\1[^<>]*>", probe, re.I)
        if mt is None:
            mt = re.search(r"<meta[^<>]+content=([\'\"])[^\'\"<>]+charset=([^\'\"<>]+)\1[^<>]*>", probe, re.I)
        if mt is not None:
            return mt.group(2).strip()
        return "utf-8"

    @staticmethod
    def get_text_contents(path, encoding="utf-8"):
        """Return the contents of the local file *path* decoded with *encoding*."""
        with open(path, "r", encoding=encoding) as fp:
            return fp.read()

    @staticmethod
    def _apply_attrs(node, attrs):
        """Parse the raw attribute text *attrs* and set each attribute on *node*."""
        if not attrs:
            return
        for mt in re.finditer(PTN_HTML_ATTR, attrs):
            if mt.group(1) is not None:
                # Quoted value; may legitimately be the empty string.
                node.set_attr(mt.group(1), mt.group(3))
            elif mt.group(4) is not None:
                node.set_attr(mt.group(4), mt.group(5))
            else:
                # Bare attribute name without a value.
                node.set_attr(mt.group(6), None)

    @staticmethod
    def parse(data, callback=None):
        """Parse HTML text into a ``PYDocument``.

        Args:
            data: the markup as ``str``.
            callback: optional ``callback(err, doc)``; ``err`` is ``None`` on
                success or an error message string.

        Returns:
            The callback's return value when a callback is given; otherwise
            the document on success or the error message on failure.
        """
        pos = 0
        doc = PYDocument()
        parent = doc.documentElement
        err = None

        # re.I so that "<!doctype html>" and "<SCRIPT>" are recognised too.
        for res in re.finditer(PTN_HTML_TAG, data, re.I):
            start, end = res.span()
            if pos < start:
                parent.append(doc.create_text(data[pos:start]))
            pos = end
            tag = res.group("tag")
            attrs = res.group("scriptAttr") or res.group("attr")

            if res.group("type") is not None:
                parent.append(doc.create_document_type(res.group("type"), res.group("define")))
            elif res.group("comment") is not None:
                parent.append(doc.create_comment(res.group("comment")))
            elif res.group("scriptContent") is not None:
                # Script bodies are kept verbatim as one text node.
                node = parent.append(doc.create_element("script"))
                PYDocument._apply_attrs(node, attrs)
                node.append(doc.create_text(res.group("scriptContent")))
            elif "/" == res.group("closeFlag"):
                if parent.parentNode is None:
                    err = "Close tag missing parentNode"
                    break
                # Walk up to the nearest open element with this exact name.
                # If none exists the stray close tag is ignored (tolerance
                # for malformed markup).
                name = tag.lower()
                opened = parent
                while opened is not None and (opened.tag is None or opened.tag.lower() != name):
                    opened = opened.parentNode
                if opened is not None:
                    parent = opened.parentNode
            else:
                name = tag.lower()
                # HTML5 allows some end tags to be omitted, e.g.
                # "<p>hello<p>world": close the elements this start tag
                # implicitly ends before attaching the new one.
                closes = PYDocument._IMPLIED_END.get(name, ())
                while parent.tag is not None and parent.tag.lower() in closes:
                    parent = parent.parentNode
                node = parent.append(doc.create_element(tag))
                PYDocument._apply_attrs(node, attrs)
                if name == "img":
                    doc.images.append(node)
                elif name in ("a", "area"):
                    doc.links.append(node)
                if not node.single:
                    parent = node

        # Keep text that follows the last tag (or a document without tags).
        if err is None and pos < len(data):
            parent.append(doc.create_text(data[pos:]))

        if callback is not None:
            return callback(err, doc)
        return doc if err is None else err
if __name__ == '__main__':
    def test(err, doc):
        """Demo callback: print a few selector query results for the page."""
        if err:
            print("PYDocument error: %s" % err)
            return
        # print(doc.htmlify)
        # for n in doc.query("nav.nav a[data-clev^='10220']"):  # sohu.com
        for i, n in enumerate(doc.query(".gkklist"), 1):  # 163.com
            print("list %d => %s" % (i, n.htmlify))
            # Find every link whose href contains "4".
            for k, n2 in enumerate(n.query("a[href*='4']"), 1):
                print("a %d => %s" % (k, n2.htmlify))
        for n in doc.query("img[alt^='<']"):
            print("img : %s,%s" % (n.get_attr("alt"), n.htmlify))

    PYDocument.load("http://www.163.com", test)
    # PYDocument.load("D:\\www\\163.txt", test, "gbk")
昨天开始接触python,做了个简单的html解析器,支持http和本地文件,类似dom,加入了一些简单的jQuery选择器语法,大致就是这种:(tag)(.|#)(class|id)[(attr)(^|$|*)='(value)']
(tag) = html标签名,语句可以省略
(.|#) = css类或者标签id标记,语句可以省略
(class|id) = css类名或id,语句可以省略
[(attr)(^|$|*)='(value)'] = (attr)为属性名,(value)为属性值查询条件,(^|$|*)为对比符号,^表示属性值开头与(value)相等,$表示属性值末尾与(value)相等,*表示属性值包括(value),(^|$|*)可以省略,直接用=符号表示属性值与(value)相等。语句可以省略。
选择器运用于PYDocument类和HTMLElement类的query方法,执行后返回一个列表[]。如果使用get_elements_by_***(多选)或get_element_by_***(单选)方法,查询条件还支持re.Pattern对象,比如:get_elements_by_class(re.compile(r"(^|\s+)(a1|a2|a3)($|\s+)", re.I))会检索所有类名中含有a1、a2或a3的元素。
解析器支持html5,比如可以省略结束标签的li、 dt、 dd、 p、 option、 thead、 tbody、 tr、 td、 th、 rt、 rp、 optgroup、 colgroup、 tfoot,可以自动分辨。163主页里,div.gkklist标签里就有数条p没有结束标签,因此使用它作为解析例子,最终在htmlify(类似outerHTML)和html(类似innerHTML)方法里面,解析器已经为其自动补充结束标记。
PS:代码做了两更,算是比较完善,先是用editplus写的,没有语法提示,今天放在Pycharm里,发现很多不规则的语法,于是更改过来,比如:函数名推荐小写而不推荐驼峰命名方式,None的判断推荐is None而不推荐!= None,if条件不推荐括弧。
另外标签的正则更改了一下,<\/?\w+[^<>]*>这种方式不太满足特殊标签,163就有个img是这样的:
<img ne-lazy="effect: fadeIn" data-original="http://cms-bucket.ws.126.net/2020/0305/603d58d9j00q6p7fo0056c000gn00k5c.jpg?imageView&thumbnail=250y250&quality=85" alt="<%=bohe.stitle%>"/>
alt属性值是<%=bohe.stitle%>,因此为了应付这种特殊写法,html标签的正则改为
<(\/)?([\w\-:]+)((?:\s+[\w\-:]+=(?:\'[^\']+\'|\"[^\"]+\"|[^\s<>]+))*)?\s*\/?\s*>
这样可以通配<a href="***">与<a href='***'>与</a>与<br/>