PyQuery 库学习笔记

1、初始化
1.1、字符串初始化
from pyquery import PyQuery as py

html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''

# Build a PyQuery document straight from the HTML string,
# then select and print every <li> element in it.
doc = py(html)
li_elements = doc("li")
print(li_elements)
1.2、URL初始化
from pyquery import PyQuery as py

# Build the document from a URL instead of a string.
doc = py(url="http://www.baidu.com")
# A selection is itself a PyQuery object: <class 'pyquery.pyquery.PyQuery'>
print(type(doc("title")))
# Print the selected <head> element.
print(doc("head"))
1.3、文件的初始化
from pyquery import PyQuery as py

# Build the document from a local HTML file.
doc = py(filename="demo1.html")
# A selection is itself a PyQuery object: <class 'pyquery.pyquery.PyQuery'>
print(type(doc("li")))
# Print every <li> element found in the file.
print(doc("li"))

2、基本的CSS选择器
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''

doc = py(html)
# Descendant selector: the <li> tags inside the element with class "list",
# which is itself inside the element with id "container".
print(doc("#container .list li"))

3、查找元素
3.1、子元素
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)

# Select the element whose class is "list" (the <ul>).
items = doc(".list")
# The selection is itself a PyQuery object: <class 'pyquery.pyquery.PyQuery'>
print(type(items))
print(items)

# find() looks inside the current selection for matching elements,
# here the <li> tags.
lis = items.find("li")
# <class 'pyquery.pyquery.PyQuery'>
print(type(lis))
print(lis)

# children() returns the child elements of the current selection.
lis2 = items.children()
print(type(lis2))
print(lis2)

# With a selector argument, only the children carrying class "active" are kept.
li3 = items.children(".active")
print(li3)
3.2、父元素
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the element whose class is "list" (the <ul>).
items = doc(".list")

# parent() returns the direct parent of the selection.
container = items.parent()
print(type(container))
print(container)
print("==========================")

# parents() returns the ancestors of the selection.
parents = items.parents()
print(type(parents))
print(parents)
print("==========================")

# parents() also accepts a selector to filter the ancestors. The wrapping
# <div> in this snippet's HTML is marked with id="container" (it has no
# class), so the id selector "#container" is needed here; a class selector
# such as ".container" would match nothing.
parent = items.parents("#container")
print(parent)
3.3、兄弟元素
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the element whose class is "list" (not used further in this snippet).
items = doc(".list")
# Select the <li> under ".list" that carries both "item-0" and "active".
li = doc(".list .item-0.active")

# All sibling elements of the selection (the selection itself is excluded).
print(li.siblings())
# Only the siblings that carry class "active" (the selection itself is excluded).
print(li.siblings(".active"))


4、遍历
4.1、单个元素
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)

# Select a single element: the one carrying both "item-0" and "active".
li = doc(".item-0.active")
print(li)
4.2、多个元素
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)

# Select every <li>; items() lets us loop over the matches one at a time.
lis = doc("li").items()
# <class 'generator'>
print(type(lis))
# The loop body must be indented, otherwise Python raises IndentationError.
for li in lis:
    print(li)


5、获取信息
5.1、获取属性
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the <a> inside the <li> carrying both "item-0" and "active":
# <a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a>
a = doc(".item-0.active a")
print(a)
# Read the href attribute of the selected tag via attr(name):
# https://ask.hellobi.com/link3.html
print(a.attr("href"))
# Same attribute, read with attribute-style access:
# https://ask.hellobi.com/link3.html
print(a.attr.href)
5.2、获取文本
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the <a> inside the <li> carrying both "item-0" and "active":
# <a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a>
a = doc(".item-0.active a")
print(a)
# text() gives the text content of the <a> tag.
print(a.text())
5.3、获取HTML
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
# Select the <li> carrying both "item-1" and "active":
# <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
li = doc(".item-1.active")
print(li)
# html() gives the HTML of the selected <li> tag.
print(li.html())


6、DOM操作
6.1、addClass、removeClass
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
li = doc(".item-0.active")
print(li)

# Remove the "active" class, then print again to see the change.
li.removeClass("active")
print(li)

# Add the "active" class back, then print once more.
li.addClass("active")
print(li)
6.2、attr、css
from pyquery import PyQuery as py

html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = py(html)
li = doc(".item-0.active")
print(li)

# Set a "name" attribute on the selected element.
li.attr("name", "link")
print(li)

# Set a CSS style (font-size) on the selected element.
li.css("font-size", "14px")
print(li)
6.3、remove
from pyquery import PyQuery as py

html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
doc = py(html)
wrap = doc(".wrap")
# Text of the wrapper before anything is removed.
print(wrap.text())

# Find the <p> tag inside the selection and remove it,
# then print the wrapper's text again.
wrap.find("p").remove()
print(wrap.text())
6.4、其他DOM方法
更多 DOM 操作方法请参考官方 API 文档：https://pyquery.readthedocs.io/en/latest/api.html


7、伪类选择器
from pyquery import PyQuery as py

html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li>
<li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>
<li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''

doc = py(html)

# The first <li> (first child of its parent).
li = doc("li:first-child")
print(li)

# The last <li> (last child of its parent).
li = doc("li:last-child")
print(li)

# The second <li>.
li = doc("li:nth-child(2)")
print(li)

# The <li> elements after the third one: :gt(2) keeps matches whose
# zero-based index is greater than 2, so the third <li> itself is excluded.
li = doc("li:gt(2)")
print(li)

# The <li> elements at even positions (2nd, 4th, ...).
li = doc("li:nth-child(2n)")
print(li)

# The <li> whose text contains "second".
li = doc("li:contains(second)")
print(li)
更多的选择器用法可参考 CSS 选择器参考手册，以及 PyQuery 官方文档中关于伪类选择器的说明。


8、官方文档
https://pyquery.readthedocs.io/
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值