PyQuery库学习笔记
PyQuery 是一个强大又灵活的网页解析库。正则表达式写起来太麻烦,BeautifulSoup 的语法又太难记;如果你熟悉 jQuery 语法,那么 PyQuery 就是最好的选择。
初始化
字符串初始化
# Build a PyQuery document directly from an HTML string.
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# Selector syntax: prefix an id with '#', prefix a class with '.', and write a tag name with no prefix.
print(doc('li')) # the type of doc('li') is <class 'pyquery.pyquery.PyQuery'>
URL初始化
# Build a PyQuery document from a URL.
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com') # pass the URL directly; a request is sent automatically
print(doc('head'))
文件初始化
# Build a PyQuery document from a local HTML file.
from pyquery import PyQuery as pq
doc = pq(filename='demo.html') # point directly at the path of the HTML file; the result can be queried as usual
print(doc('li'))
基本CSS选择器
# Basic CSS selectors: descendant lookup, children, parents and siblings.
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# Basic lookup
print(doc('#container .list li')) # a space expresses a nesting (descendant) relationship
# Finding child elements
items = doc('.list')
lists = items.find('li') # narrow the selection further with find()
print(lists)
lists = lists.children('a') # children() matches direct children only, whereas find() searches all descendants
print(lists)
# Finding parent elements
items = doc('li')
container = items.parent() # parent() returns the direct parent element
print(container)
pts = items.parents() # returns the ancestor nodes; two in this example, because there are two enclosing levels
print(pts)
pts = items.parents('#container') # an optional selector filters the ancestors
print(pts)
# Getting sibling elements
li = doc('.list .item-0.active') # no space between the classes means both must be on the same element
print(li) # only one element satisfies the condition
print(li.siblings()) # siblings() returns the sibling elements, not including the element itself
遍历
# Traversal: walk over every matched element one at a time.
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
lists = doc('li').items() # items() produces a generator over the matched elements
print(type(lists)) # the type is <class 'generator'>
for i in lists: # a generator can be walked with a plain for loop
    # The loop body must be indented; without it the snippet raises IndentationError.
    print(i)
获取信息
# Getting information out of a selection: attributes, text and inner HTML.
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# Getting an attribute
a = doc('.item-0.active a')
print(a.attr('href')) # read an attribute with attr()
print(a.attr.href) # attribute-style access gives exactly the same result as the line above
# Getting text
print(a.text()) # text() returns the text contained in the tag
# Getting HTML
a = doc('.item-0.active')
print(a.html()) # html() returns the inner markup, here the a tag inside the li
DOM操作
# DOM manipulation: toggling classes and setting attributes / inline styles.
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
# removeClass, addClass
li.removeClass('active') # remove this class
print(li)
li.addClass('active') # add a class
print(li)
# attr, css
li.attr('name', 'link') # the output now also contains name="link"
print(li)
li.css('font-size', '14px') # the output now also contains style="font-size: 14px"
print(li)
#-------------------------------------分割线------------------------------------------------#
# DOM manipulation: removing a child node before extracting text.
html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text()) # this output also includes the text inside the p tag
# remove
wrap.find('p').remove() # remove the p tag so that its text no longer appears in the output
print(wrap.text())
伪类选择器
# Pseudo-class selectors.
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child') # get the first li tag
print(li)
li = doc('li:last-child') # get the last li tag
print(li)
li = doc('li:nth-child(2)') # pick by position; positions start at 1, not 0
print(li)
li = doc('li:gt(2)') # :gt uses a zero-based index, so this matches items with index > 2, i.e. the fourth li onward
print(li)
li = doc('li:nth-child(2n)') # get the tags at even positions
print(li)
li = doc('li:contains(second)') # find tags whose text contains "second"
print(li)
一些文档链接: