pyquery 的初步了解(实例引入)
简单举例
from pyquery import PyQuery as pq

# Sample markup passed to PyQuery as a plain string.
# Note that the last <li> has no closing tag in this literal.
html = """
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""

# Build the document from the string and print its serialized form.
doc = pq(html)
print(str(doc))
# 输出:
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li></ul>
</div>
字符串
from pyquery import PyQuery as pq
import requests

# doc1 and doc2 are equivalent ways to load the same page:
# doc1 lets PyQuery fetch the URL itself, while doc2 is built from the
# HTML text downloaded with requests.
doc1 = pq(url="https://www.cnblogs.com/liyihua/")
print(doc1("title"))
doc2 = pq(requests.get("https://www.cnblogs.com/liyihua/").text)
# Print doc2 here (not doc1 again) so both loading paths are actually shown.
print(doc2("title"))
# 输出:
<title>李亦华 - 博客园</title>
<title>李亦华 - 博客园</title>
URL
from pyquery import PyQuery as pq

# Load the document from the local file test.html.
doc = pq(filename="test.html")

# Select every <li> node and print the selection.
list_items = doc("li")
print(list_items)
# 输出:
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li>
# 文件内容:
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
pyquery 中的基本CSS选择器
实例切入:
from pyquery import PyQuery as pq

html = """
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
doc = pq(html)

# Descendant selector: every <li> below an element with class "list"
# that itself sits inside the element with id "container".
selected = doc("#container .list li")

# Show the matched nodes, then the type of the selection object.
print(selected)
print(type(selected))
# 输出:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<class 'pyquery.pyquery.PyQuery'>
查找节点
获取子孙节点
说明:find()方法查找的是所有子孙节点,如果只查找子节点,可以使用children()方法。
from pyquery import PyQuery

html = """
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
doc = PyQuery(html)
items = doc(".list")

# Print the type and markup of the .list selection first, then of the
# <li> nodes that find() locates beneath it.
for selection in (items, items.find("li")):
    print(type(selection), selection, sep="\n")
# 输出:
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
获取父节点
from pyquery import PyQuery

# Sample markup. Stray line-number residue ("1" before </div> and "13 "
# before the closing quotes) has been removed from the literal so that it
# matches the output shown for this example.
html = """
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
doc = PyQuery(html)
items = doc(".list")

# Print the selected <ul> followed by an extra newline as a separator.
print(items, "\n")

# parent() returns the direct parent of the selection, again as a PyQuery object.
print(
    type(items.parent()),
    items.parent(),
    sep="\n",
)
# 输出:
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
兄弟节点
from pyquery import PyQuery

html = """
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
doc = PyQuery(html)

# Select the node inside .list that carries both the item-0 and active classes.
items = doc(".list .item-0.active")

# All sibling nodes of the selection: print their type, then their markup.
all_siblings = items.siblings()
print(type(all_siblings), all_siblings, sep="\n")

# Passing a selector to siblings() keeps only the siblings that match it.
active_siblings = items.siblings(".active")
print("\n", active_siblings)
# 输出:
<class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
遍历节点
from pyquery import PyQuery

html = """
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
doc = PyQuery(html)

# Calling items() returns a generator over the matched nodes.
lis = doc("li").items()

# The loop body must be indented; each yielded node is printed with its type.
for li in lis:
    print(
        li,
        type(li),
    )
# 输出:
<li class="item-0">first item</li>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<class 'pyquery.pyquery.PyQuery'>
获取信息
- attr()方法获取属性
from pyquery import PyQuery

html = """
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
doc = PyQuery(html)

# Select the <a> inside the <li> that has both the item-0 and active classes.
a = doc(".item-0.active a")

# Print the node, its type, and the value of its href attribute.
print(
    a,
    type(a),
    a.attr("href"),  # a.attr.href can be used as well; both do the same thing
    sep="\n",
)
# 输出:
<a href="link3.html"><span class="bold">third item</span></a>
<class 'pyquery.pyquery.PyQuery'>
link3.html
- text()方法获取文本
from pyquery import PyQuery

html = """
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
doc = PyQuery(html)
li = doc("li")
print(
    li.html(),  # inner content of the node
    li.text(),  # text of the node; the result is plain text only
    type(li.text()),
    sep="\n",
)
# 输出:
first item
first item second item third item fourth item fifth item
<class 'str'>
节点操作
添加和移除class
add_class() 和 remove_class() ---- 添加class、移除class
from pyquery import PyQuery

html = """
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
doc = PyQuery(html)

# Start from the <li> that has both the item-0 and active classes.
li = doc(".item-0.active")
print(li)

# Drop the "active" class and print what remove_class() returns.
without_active = li.remove_class("active")
print(without_active)

# Put the "active" class back and print what add_class() returns.
with_active = li.add_class("active")
print(with_active)
# 输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
attr、text 和 html 方法
from pyquery import PyQuery

html = """
<div id="container">
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>
</div>
"""
doc = PyQuery(html)
li = doc(".item-0.active")

# The node as parsed, before any modification.
print(li)

# Add an attribute called "name" whose value is "link".
li.attr("name", "link")
print(li)

# Replace the node's inner content with the text 'change item'.
li.text("change item")
print(li)

# Replace the node's inner content with the markup '<span>change item</span>'.
li.html("<span>change item</span>")
print(li)
# 输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link">change item</li>
<li class="item-0 active" name="link"><span>change item</span></li>
# 输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link">change item</li>
<li class="item-0 active" name="link"><span>change item</span></li>
删除节点
from pyquery import PyQuery

html = """
<div class="LeeHua">
LiYihua
<ul class="201802004731">liyihua</ul>
</div>
"""
doc = PyQuery(html)
Leehua = doc(".LeeHua")

# Text of the wrapper while the nested <ul> is still present.
text_before = Leehua.text()
print("移除节点ul前的输出:\n" + text_before)

# Remove the nested <ul>, then read the wrapper's text again.
Leehua.find("ul").remove()
text_after = Leehua.text()
print("移除节点ul后的输出:\n" + text_after)
# 输出:
移除节点ul前的输出:
LiYihua
liyihua
移除节点ul后的输出:
LiYihua
伪选择器
示例:
from pyquery import PyQuery

html = """
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
"""
doc = PyQuery(html)

# Pseudo-class selectors demonstrated below, applied in this order.
selectors = (
    "li:first-child",  # every <li> that is the first child of its parent
    "li:last-child",  # every <li> that is the last child of its parent
    "li:nth-child(2)",  # every <li> that is the second child of its parent
    "li:gt(2)",  # <li> nodes with an index above 2 (the last two in this list)
    "li:nth-child(2n)",  # every <li> at an even child position
    "li:contains(second)",  # every <li> whose text contains 'second'
)

# Run each selector against the document and print what it matches.
for selector in selectors:
    li = doc(selector)
    print(li)
# 输出:
<li class="item-0">first item</li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
CSS 选择器的用法:http://www.w3school.com.cn/cssref/css_selectors.asp
作者:Lee Hua