速度对比
- selectolax对比lxml(也就是xpath)
- 对同一段HTML文本各解析10000次进行对比
- 效率约为lxml的两倍
# -*- coding: utf-8 -*-
# @Author : markadc
import time
import requests
from lxml import etree
from selectolax.parser import HTMLParser
# Benchmark: parse the same HTML document 10000 times with lxml and then with
# selectolax, printing the elapsed seconds for each library.
url = "https://www.baidu.com"
# A timeout keeps the script from hanging forever if the host never responds.
html = requests.get(url, timeout=10).text

print("使用lxml")
# perf_counter() is a monotonic, high-resolution clock; time.time() can jump
# when the system clock is adjusted, which would corrupt the measurement.
begin = time.perf_counter()
for _ in range(10000):
    tree = etree.HTML(html)
print("耗时 {}秒".format(time.perf_counter() - begin))

print("使用selectolax")
begin = time.perf_counter()
for _ in range(10000):
    tree = HTMLParser(html)
print("耗时 {}秒".format(time.perf_counter() - begin))
先安装一下
pip install selectolax
- 不支持xpath语法,我的心好痛
- 被迫学习了css选择器
实战
# Sample document parsed by the selector examples below: a <demo> root holding
# an <h1> and two <div> containers of <a> links with various href/id attributes.
# NOTE(review): the first link's href starts with "htt://" (not "http://"), so
# it does not contain the substring "http" — possibly a typo; confirm intent.
html = '''
<demo>
<h1>============欢迎靓仔进来学习============</h1>
<div id='div1'>
<a href="htt://blog.csdn.net/MarkAdc/article/details/107736445">小明</a>
<a href="https://blog.csdn.net/MarkAdc">小白</a>
<a href="2535.png">大牛</a>
<a href="1241.jpg" id="python375">小黄</a>
<a id="python275">小红</a>
</div>
<div id='div2'>
<a id="python3"></a>
<a href="2267.png"></a>
</div>
</demo>
'''
准备工作
from selectolax.parser import HTMLParser
# Parse the sample markup once; every selector example below queries this tree.
tree = HTMLParser(html)
查找demo节点下的所有子节点
- demo > *
# Print the tag name of every direct child of <demo> (child combinator ">").
direct_children = tree.css("demo > *")
for child in direct_children:
    print(child.tag)
查找demo节点下的所有子孙节点
- demo >> *
# Print the tag name of every node matched by the "demo >> *" selector.
# NOTE(review): ">>" is not part of the current CSS Selectors standard; the
# portable descendant selector is "demo *" — confirm the parser accepts ">>".
descendants = tree.css("demo >> *")
for descendant in descendants:
    print(descendant.tag)
查找href属性值含有http的所有节点
- [href*="http"]
# Report every node whose href attribute contains the substring "http".
for node in tree.css('[href*="http"]'):
    # Pad tag and text to 10 characters so the printed columns line up.
    tag_col = node.tag.ljust(10)
    text_col = node.text(strip=True).ljust(10)
    print("标签={} 文本={} 属性={}".format(tag_col, text_col, node.attributes))
查找所有href属性值以https开头的a节点
- a[href^='https']
# Report every <a> node whose href attribute starts with "https".
for node in tree.css("a[href^='https']"):
    # Pad tag and text to 10 characters so the printed columns line up.
    tag_col = node.tag.ljust(10)
    text_col = node.text(strip=True).ljust(10)
    print("标签={} 文本={} 属性={}".format(tag_col, text_col, node.attributes))
查找所有href属性值以png结尾的节点
- [href$='png']
# Report every node whose href attribute ends with "png".
for node in tree.css("[href$='png']"):
    # Pad tag and text to 10 characters so the printed columns line up.
    tag_col = node.tag.ljust(10)
    text_col = node.text(strip=True).ljust(10)
    print("标签={} 文本={} 属性={}".format(tag_col, text_col, node.attributes))
查找有id属性的a节点或者li节点
- a[id], li[id]
# Report every <a> or <li> node that carries an id attribute (selector list).
for node in tree.css("a[id], li[id]"):
    # Pad tag and text to 10 characters so the printed columns line up.
    tag_col = node.tag.ljust(10)
    text_col = node.text(strip=True).ljust(10)
    print("标签={} 文本={} 属性={}".format(tag_col, text_col, node.attributes))
查找id值含有python的所有节点
- *[id*='python']
# Report every node, of any tag, whose id attribute contains "python".
for node in tree.css("*[id*='python']"):
    # Pad tag and text to 10 characters so the printed columns line up.
    tag_col = node.tag.ljust(10)
    text_col = node.text(strip=True).ljust(10)
    print("标签={} 文本={} 属性={}".format(tag_col, text_col, node.attributes))