PyQuery
强大又灵活的网页解析库。如果你觉得正则写起来太麻烦,如果你觉得BeautifulSoup语法太难记,如果你熟悉jQuery的语法,那么PyQuery就是你的最佳选择。
字符串初始化
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html) # 声明pyquery对象
print(doc('li')) #同样使用CSS选择器,规则相似
输出
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
URL初始化
from pyquery import PyQuery as pq
doc = pq(url="http://www.baidu.com") #自动请求链接,并返回html
# 另外还可以通过filename参数传入本地HTML文件的路径,从文件初始化,例如 pq(filename="demo.html")
print(doc('head'))
输出
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head>
基本CSS选择器
from pyquery import PyQuery as pq
doc = pq(html) # html见前例
print(doc("#container .list li"))
# 选择器中用空格分隔的各部分表示后代关系:它们之间不一定是直接父子,只要存在层级(祖先—后代)关系即可
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
查找元素
1.子元素
from pyquery import PyQuery as pq
doc = pq(html)
items = doc(".list")
print(type(items))
print(items)
lis = items.find("li") # 所有结果
print(type(lis))
print(lis)
输出
<class 'pyquery.pyquery.PyQuery'>
# 结果仍是PyQuery对象,说明可以继续对它调用PyQuery的各种方法
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
from pyquery import PyQuery as pq
doc = pq(html)
items = doc(".list")
lis = items.children()
# 查找直接子元素,也可以向children中传递参数用以筛选
print(items)
print(lis)
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
2.父元素
from pyquery import PyQuery as pq
doc = pq(html)
items = doc(".list")
container = items.parent() #有且仅有一个父节点
print(type(container))
print(container)
输出
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc(".list")
parents = items.parents()
# parents()返回所有祖先节点,打印时每个祖先节点各输出一遍;也可传入CSS选择器对祖先节点再进行筛选
print(type(parents))
print(parents)
输出
<class 'pyquery.pyquery.PyQuery'>
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div><div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
3.兄弟元素
from pyquery import PyQuery as pq
doc = pq(html)
li = doc(".list .item-0.active")
# 前面的 .list 加空格,表示在class为list的元素内部查找;
# 后面的 .item-0.active 两个class之间没有空格,表示元素必须同时具有item-0和active两个class
print(li.siblings()) # 输出除被选中元素自身以外的所有兄弟元素;
# 还可以传入CSS选择器,例如 '.active',则只保留class中带有active的兄弟元素
输出
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
遍历
from pyquery import PyQuery as pq
doc = pq(html)
li = doc("li").items()
print(type(li))
for i in li:
    print(i) # 每一个i又是一个PyQuery对象,可以继续使用PyQuery的方法
输出
<class 'generator'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
获取信息
获取属性
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a') #空格表示a位于前者之内
print(a)
print(a.attr('href')) #获取属性
print(a.attr.href) #获取属性
输出
<a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html
获取文本
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text()) # 选中文字
输出
<a href="link3.html"><span class="bold">third item</span></a>
third item
获取html
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active')
print(a)
print(a.html()) # 返回li标签内部的HTML内容(不包含li标签本身)
输出
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<a href="link3.html"><span class="bold">third item</span></a>
DOM操作
addClass、removeClass
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.remove_class('active') # 移除
print(li)
li.add_class('active') # 增添
print(li)
输出
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
attr、css
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name','link') #若name属性不存在,则会添加;
#若已存在,则改变
print(li)
li.css('font-size','14px') #添加style属性
print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
remove等等其他DOM方法
伪类选择器
from pyquery import PyQuery as pq
doc = pq(html)
li1 = doc('li:first-child') #第一个子标签
li2 = doc('li:last-child') #最后一个子标签
li3 = doc('li:nth-child(2)') #第2个子标签(nth-child从1开始计数)
li4 = doc('li:gt(2)') #索引比2大的标签,从0计数
li5 = doc('li:nth-child(2n)') #第偶数个子标签(第2、4、…个,从1开始计数)
li6 = doc('li:contains(second)') #子标签中含有second字样的标签
print(li1,"\n",li2,"\n",li3,"\n",li4,"\n",li5,"\n",li6)
输出
<li class="item-0">first item</li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>