pyquery的使用
pyquery能够很好地利用CSS选择器对网页进行解析和查询
初始化
1.字符串初始化
from pyquery import PyQuery as pq

# Sample document that is handed to PyQuery as a plain string.
markup = '''<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
# Build a PyQuery object by passing the long string to the PyQuery class.
doc = pq(markup)
# Call the initialized object with a CSS selector to pick every <li> node.
li_nodes = doc('li')
print(li_nodes)
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
2.URL初始化
from pyquery import PyQuery as pq

# PyQuery requests the URL first and then initializes itself from the HTML it receives.
target_url = "http://www.cuiqingcai.com"
doc = pq(url=target_url)
page_title = doc('title')
print(page_title)
<title>静觅丨崔庆才的个人博客</title>
3.文件初始化
from pyquery import PyQuery as pq

# The local file is read first and its content is then parsed as the document.
source_file = 'test.html'
doc = pq(filename=source_file)
li_nodes = doc('li')
print(li_nodes)
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li>
基本CSS选择器
from pyquery import PyQuery as pq

markup = '''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
# Initialize the PyQuery object.
doc = pq(markup)
# Select every <li> inside the class 'list' node that sits inside the id 'container' node.
matched = doc('#container .list li')
print(matched)
# The selection result is itself a PyQuery object.
print(type(matched))
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<class 'pyquery.pyquery.PyQuery'>
查找节点
1.子节点
find()的查找范围是节点的所有子孙节点
from pyquery import PyQuery as pq

markup = '''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(markup)
# Start from the node whose class is 'list'.
items = doc('.list')
print(type(items), items, sep='\n')
# find() searches all descendants of the selection for nodes matching the selector.
lis = items.find('li')
print(type(lis), lis, sep='\n')
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
children()方法只查找直接子节点,而不包括更深层的子孙节点
# children() returns the child nodes of the current selection.
child_nodes = items.children()
print(type(child_nodes), child_nodes, sep='\n')
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
2.父节点
parent()获取某个节点的父节点
from pyquery import PyQuery as pq

markup = '''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(markup)
li_nodes = doc('li')
# parent() returns the parent node of the selection (the <ul> for these <li> nodes).
container = li_nodes.parent()
print(type(container), container, sep='\n')
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
parents()方法能够返回某个节点的祖先节点
from pyquery import PyQuery as pq
html='''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc=pq(html)
items=doc('li')
# parents() returns the ancestor nodes of the selection (here the <div> and the <ul>).
parents=items.parents()
print(type(parents))
print(parents)
# To filter the ancestors, pass the CSS selector to parents() itself.
# Calling the ancestor collection with a selector (parents('div')) would instead
# run a new search inside those ancestors rather than filter them.
print(items.parents('div'))
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div><ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
3.兄弟节点
用siblings()方法返回该节点所有的兄弟节点
from pyquery import PyQuery as pq

markup = '''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(markup)
li = doc('.list .item-0.active')
# siblings() without arguments returns every sibling of the selected node.
all_siblings = li.siblings()
print(all_siblings)
# Passing a selector keeps only the siblings whose class includes 'active'.
active_siblings = li.siblings('.active')
print(active_siblings)
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
遍历
html='''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
from pyquery import PyQuery as pq
doc=pq(html)
# items() returns a generator that yields the matched nodes one at a time.
lis=doc('li').items()
print(type(lis))
# The loop body must be indented; without it the snippet raises IndentationError.
for i in lis:
    print(i)
<class 'generator'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
获取信息
1.获取属性
html='''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
from pyquery import PyQuery as pq
doc=pq(html)
a=doc('.item-0.active a')
print(a,type(a))
# After selecting a node, call attr() with the attribute name to get its value.
print(a.attr('href'))
# The same value is also reachable through the attr property.
print(a.attr.href)
print("-------------------------------")
a=doc('a')
# When the selection holds several nodes, attr() returns only the first node's value.
print(a.attr('href'))
print("-------------------------------")
# Iterate over items() to read the attribute of every <a> node.
# The loop body must be indented; without it the snippet raises IndentationError.
for item in a.items():
    print(item.attr('href'))
<a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'>
link3.html
link3.html
-------------------------------
link2.html
-------------------------------
link2.html
link3.html
link4.html
link5.html
2.获取文本
from pyquery import PyQuery as pq

markup = '''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(markup)
link = doc('.item-0.active a')
print(link)
# text() returns the plain text inside the selected node.
plain_text = link.text()
print(plain_text)
# html() returns the HTML markup inside the selected node.
inner_html = link.html()
print(inner_html)
<a href="link3.html"><span class="bold">third item</span></a>
third item
<span class="bold">third item</span>
from pyquery import PyQuery as pq

markup = '''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(markup)
li = doc('li')
# With several nodes selected, html() returns only the first node's inner HTML;
# iterate over the nodes to collect all of them.
print(li.html())
# text() instead returns the text of every node joined by spaces into one string.
joined_text = li.text()
print(joined_text)
print(type(joined_text))
first item
first item second item third item fourth item fifth item
<class 'str'>
节点操作
1.addClass 和 removeClass
用于对节点中class进行移除或添加
from pyquery import PyQuery as pq

markup = '''<div id='container'>
<ul class='list'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(markup)
li = doc('.item-0.active')
print(li)
# Drop the 'active' class from the node.
li.removeClass('active')
print(li)
# Put the 'active' class back on the node.
li.addClass('active')
print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
2.attr、text 和 html
分别用于设置节点的属性、文本内容和html文本(传入参数时为赋值,不传参数时为取值)
from pyquery import PyQuery as pq

markup = '''
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>'''
doc = pq(markup)
li = doc('.item-0.active')
print(li)
# Add the attribute name='link' to the node.
li.attr('name', 'link')
print(li)
# Replace the node's content with the text 'hello word'.
li.text('hello word')
print(li)
# Replace the node's content with the HTML '<span>hhhhhh</span>'.
li.html('<span>hhhhhh</span>')
print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link">hello word</li>
<li class="item-0 active" name="link"><span>hhhhhh</span></li>
3.remove()
移除某个节点
from pyquery import PyQuery as pq

markup = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
doc = pq(markup)
wrap = doc('.wrap')
# Before removal the text includes the paragraph's content.
print(wrap.text())
# Remove the <p> node inside the div, then extract the text again.
paragraph = wrap.find('p')
paragraph.remove()
print(wrap.text())
Hello, World
This is a paragraph.
Hello, World
伪类选择器
from pyquery import PyQuery as pq

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
# Pseudo-class selectors demonstrated, in the order their results are printed.
pseudo_selectors = (
    'li:first-child',       # <li> that is the first child of its parent
    'li:last-child',        # <li> that is the last child of its parent
    'li:nth-child(2)',      # <li> that is the second child of its parent
    'li:gt(2)',             # <li> nodes whose index (starting at 0) is greater than 2
    'li:nth-child(2n)',     # <li> nodes at even positions under their parent
    'li:contains(second)',  # <li> nodes containing the text 'second'
)
for selector in pseudo_selectors:
    li = doc(selector)
    print(li)
<li class="item-0">first item</li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
参考:崔庆才《python3网络爬虫开发实战》