一、初始化
(一)html代码初始化
from pyquery import PyQuery as pq

# Build a PyQuery document directly from an HTML string.
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(html)
# Calling the document with a CSS selector returns the matching elements.
div_nodes = doc("div")
print(div_nodes)
# Expected output:
# <div>
# <ul>
# <li class="item-0">first item</li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
(二)url初始化
# Build a PyQuery document from the page served at the given URL.
doc = pq(url="http://www.baidu.com")
root_node = doc("html")
print(root_node)
# Output begins with (truncated):
# <html> <head><meta http-equiv=
(三)文件初始化
from pyquery import PyQuery as pq

# Build a PyQuery document from the contents of a local file.
file_name = "html.txt"
doc = pq(filename=file_name)
div_nodes = doc("div")
print(div_nodes)
二、返回的pyquery对象可以接受CSS选择器参数
from pyquery import PyQuery as pq

html = '''
<div id="aaa">
<ul class="bbb">
<ccc class="item-0">first item</ccc>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(html)
# Descendant selector: id "aaa" -> class "bbb" -> tag <ccc>.
matched = doc("#aaa .bbb ccc")
print(matched)
# Expected output:
# <ccc class="item-0">first item</ccc>
三、查找元素
1、PyQuery.find()查找所有后代元素(子孙元素)
# Narrow the selection step by step: element with id "aaa", then class "bbb"
# inside it, then every <li> found beneath that.
doc = pq(doc)
item = doc("#aaa")
sub_item = item(".bbb")
lis = sub_item.find("li")
# The selections do not modify the document, so they can be printed afterwards
# in the same order they were made.
for selection in (item, sub_item, lis):
    print(selection)
2、Pyquery.children()查找所有直接子元素
from pyquery import PyQuery as pq

html = '''
<div id="aaa">
<ul class="bbb">
<ccc class="item-0">first item</ccc>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(html)
# Locate the list first, then keep only its direct children carrying the
# "active" class.
list_node = doc.find(".bbb")
active_children = list_node.children(".active")
print(active_children)
# Expected output:
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
#
3、PyQuery.parent()查找直接父元素,由于父元素唯一,因此可以不提供参数
from pyquery import PyQuery as pq

html = '''
<div id="aaa">
<ul class="bbb">
<ccc class="item-0">first item</ccc>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(html)
item = doc.find("ccc")
# parent() returns the direct parent, so no selector argument is needed.
# Here that is the <ul class="bbb"> element that contains <ccc>.
# (parents(selector), by contrast, searches all ancestors.)
print(item.parent())
4、PyQuery.siblings()查找所有的兄弟元素
from pyquery import PyQuery as pq

html = '''
<div id="aaa">
<ul class="bbb">
<ccc class="item-0">first item</ccc>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(html)
item = doc.find("ccc")
# Among the siblings of <ccc>, keep those having both the "item-0" and
# "active" classes.
matching_siblings = item.siblings(".item-0.active")
print(matching_siblings)
# Expected output:
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
四、遍历
含有多个元素的PyQuery对象,可以使用.items()获取一个生成器,从而用for循环遍历所有元素
# Iterate over every matched <li>; .items() yields the matches one at a time
# so each can be handled individually inside the loop.
doc = pq(html)
for item in doc.find("li").items():
    # The loop body must be indented; without it the snippet is a syntax error.
    print(item)
# Expected output:
# <li class="item-1"><a href="link2.html">second item</a></li>
#
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
#
# <li class="item-0"><a href="link5.html">fifth item</a></li>
五、获取信息
(一)PyQuery.attr["属性名"]、PyQuery.attr.属性名 获取属性
doc = pq(html)
item = doc.find("ccc")
# Two spellings for reading the "class" attribute: attribute-style access
# (with a trailing underscore because `class` is a Python keyword) and
# key-style access.
via_attribute = item.attr.class_
via_key = item.attr["class"]
print(via_attribute)
print(via_key)
(二)PyQuery.text()获取标签内文本
# text() returns the text content held inside the element.
doc = pq('<ccc class="item-0">first item</ccc>')
inner_text = doc.text()
print(inner_text)
(三)PyQuery.html()获取所有子标签的html文本
from pyquery import PyQuery as pq

html = '''
<div id="aaa">
<ul class="bbb ddd">
<ccc class="item-0">first item</ccc>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(html).find(".item-0.active")
# First print the element itself, then only the markup nested inside it.
print(doc)
print(doc.html())
# Expected output:
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#
# <a href="link3.html"><span class="bold">third item</span></a>
六、DOM操作
(一)pq.add_class()、pq.remove_class()增删标签的class属性成员
from pyquery import PyQuery as pq

html = '''
<div id="aaa">
<ul class="bbb ddd">
<ccc class="item-0">first item</ccc>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(html)
item = doc.find(".item-0.active")
class_to_add = "extra_class"
class_to_drop = "active"

# Starting state.
print(item)
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

# Append one more entry to the element's class list.
item.add_class(class_to_add)
print(item)
# <li class="item-0 active extra_class"><a href="link3.html"><span class="bold">third item</span></a></li>

# Drop one entry from the class list; the other entries stay.
item.remove_class(class_to_drop)
print(item)
# <li class="item-0 extra_class"><a href="link3.html"><span class="bold">third item</span></a></li>
(二)pq.attr("attr", "val")给标签增加属性attr="val"
item = pq('<li class="item-0 active"></li>')
print(item)
# <li class="item-0 active"/>

# Calling attr() with a name and a value sets that attribute on the element.
attr_name, attr_value = "id", "00852"
item.attr(attr_name, attr_value)
print(item)
# <li class="item-0 active" id="00852"/>
(三)pq.css("val_1", "val_2")给标签增加style属性,style="val_1: val_2"
item = pq('<li class="item-0 active"></li>')
print(item)
# <li class="item-0 active"/>

# css(name, value) writes the pair into the element's style attribute.
# The CSS property is spelled "font-size" ("front-size" is not a CSS property).
item.css("font-size", "14px")
print(item)
# <li class="item-0 active" style="font-size: 14px"/>
(四)pq.remove("css-selector")移除标签
from pyquery import PyQuery as pq
'''
提取Hello, World文本
'''
# Goal: extract only the "Hello, World" text, without the paragraph's text.
html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>'''
doc = pq(html)
# Remove the <p> element from the document so its text no longer appears.
paragraphs = doc.find("p")
paragraphs.remove()
print(doc)
# <div class="wrap">
# Hello, World
#
# </div>
remaining_text = doc.text()
print(remaining_text)
# Hello, World
(五)伪类选择器
from pyquery import PyQuery as pq

html = '''
<div id="aaa">
<ul class="bbb ddd">
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>'''
doc = pq(html)

# The <li> that is the first child of its parent.
first_li = doc('li:first-child')
print(first_li)
# <li class="item-1"><a href="link2.html">second item</a></li>

# The <li> that is the second child of its parent.
second_li = doc("li:nth-child(2)")
print(second_li)
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

# The <li> that is the last child of its parent.
last_li = doc("li:last-child")
print(last_li)
# <li class="item-0"><a href="link5.html">fifth item</a></li>

# Every <li> at an even position (2nd, 4th, ...).
even_lis = doc("li:nth-child(2n)")
print(even_lis)
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>