1、初始化
1.1、字符串初始化
# Build a PyQuery document from an HTML string and select every <li>.
from pyquery import PyQuery as py

html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
# Parse the markup; the resulting object is called with a CSS selector.
doc = py(html)
# Print all <li> elements.
print(doc('li'))
1.2、URL初始化
# Build a PyQuery document from a URL.
from pyquery import PyQuery as py

# Passing url= loads the document from that address (requires network access).
doc = py(url='http://www.baidu.com')
# Prints: <class 'pyquery.pyquery.PyQuery'>
print(type(doc('title')))
# Print the selected <head> element.
print(doc('head'))
1.3、文件的初始化
# Build a PyQuery document from a local file.
from pyquery import PyQuery as py

# Passing filename= loads the document from 'demo1.html'
# (the file must exist relative to the working directory).
doc = py(filename='demo1.html')
# Prints: <class 'pyquery.pyquery.PyQuery'>
print(type(doc('li')))
# Print all <li> elements.
print(doc('li'))
2、基本的CSS选择器
# Select elements with a compound CSS selector.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the <li> elements inside the element with class "list",
# which itself is inside the element with id "container".
print(doc('#container .list li'))
3、查找元素
3.1、子元素
# Find descendant and child elements of a selection.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the element with class "list".
items = doc('.list')
# Prints: <class 'pyquery.pyquery.PyQuery'>
print(type(items))
print(items)
# Search for <li> elements within the previous selection.
lis = items.find('li')
# Prints: <class 'pyquery.pyquery.PyQuery'>
print(type(lis))
print(lis)
# Get all child elements of the previous selection.
lis2 = items.children()
print(type(lis2))
print(lis2)
# Get only the children of the previous selection that have class "active".
li3 = items.children('.active')
print(li3)
3.2、父元素
# Find the parent and ancestor elements of a selection.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the element with class "list".
items = doc('.list')
# Get the direct parent of the selection.
container = items.parent()
print(type(container))
print(container)
print("==========================")
# Get all ancestors of the selection.
parents = items.parents()
print(type(parents))
print(parents)
print("==========================")
# Filter the ancestors down to the one with id "container".
# The markup declares id="container" (not a class), so the selector must be
# '#container'; '.container' would match nothing and print an empty result.
parent = items.parents('#container')
print(parent)
3.3、兄弟元素
# Find the sibling elements of a selection.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the element with class "list" (not used further in this snippet).
items = doc('.list')
# Select the element carrying both "item-0" and "active" inside ".list".
li = doc('.list .item-0.active')
# All siblings of the selected element (the element itself is excluded).
print(li.siblings())
# Only the siblings that have class "active" (the element itself is excluded).
print(li.siblings('.active'))
4、遍历
4.1、单个元素
# Work with a selection that matches a single element.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Only one element carries both "item-0" and "active", so this selects a single <li>.
li = doc('.item-0.active')
print(li)
4.2、多个元素
# Iterate over a selection that matches several elements.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select every <li>; .items() yields the matched elements one at a time.
lis = doc('li').items()
# Prints: <class 'generator'>
print(type(lis))
for li in lis:
    print(li)
5、获取信息
5.1、获取属性
# Read an attribute from a selected element.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
a = doc('.item-0.active a')
# Prints: <a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a>
print(a)
# Read the href attribute by calling attr() with the attribute name.
# Prints: https://ask.hellobi.com/link3.html
print(a.attr('href'))
# The same attribute via attribute-style access.
# Prints: https://ask.hellobi.com/link3.html
print(a.attr.href)
5.2、获取文本
# Read the text content of a selected element.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
a = doc('.item-0.active a')
# Prints: <a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a>
print(a)
# Print the text content of the <a> element.
print(a.text())
5.3、获取HTML
# Read the inner HTML of a selected element.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
li = doc('.item-1.active')
# Prints: <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
print(li)
# Print the HTML contained in the <li> element.
print(li.html())
6、DOM操作
6.1、addClass、removeClass
# Remove a class from a selected element and add it back.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
li = doc('.item-0.active')
print(li)
# Remove the "active" class; the element is printed again to show the change.
li.removeClass('active')
print(li)
# Add the "active" class back.
li.addClass('active')
print(li)
6.2、attr、css
# Set an attribute and a CSS style on a selected element.
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
li = doc('.item-0.active')
print(li)
# Add a name attribute with the value "link".
li.attr('name', 'link')
print(li)
# Add a CSS style: font-size of 14px.
li.css('font-size', '14px')
print(li)
6.3、remove
# Remove a child element and compare the text before and after.
from pyquery import PyQuery as py

html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
doc = py(html)
wrap = doc('.wrap')
# Text of the wrapper before the removal.
print(wrap.text())
# Find the <p> element inside the selection and remove it.
wrap.find('p').remove()
# Text of the wrapper after the <p> has been removed.
print(wrap.text())
6.4、其他DOM方法
更多 DOM 操作方法请参考 pyquery 的 API 文档：https://pyquery.readthedocs.io/en/latest/api.html
7、伪类选择器
# Select elements with pseudo-class selectors.
from pyquery import PyQuery as py

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = py(html)
# The first <li> (an <li> that is the first child of its parent).
li = doc('li:first-child')
print(li)
# The last <li> (an <li> that is the last child of its parent).
li = doc('li:last-child')
print(li)
# The second <li>.
li = doc('li:nth-child(2)')
print(li)
# The <li> elements whose zero-based index is greater than 2,
# i.e. the fourth item onward (not the third).
li = doc('li:gt(2)')
print(li)
# The <li> elements at even positions (second, fourth, ...).
li = doc('li:nth-child(2n)')
print(li)
# The <li> elements whose text contains "second".
li = doc('li:contains(second)')
print(li)
更多的选择器请参考 pyquery 的伪类选择器文档：https://pyquery.readthedocs.io/en/latest/pseudo_classes.html
8、官方文档
https://pyquery.readthedocs.io/