网络爬虫学习第七弹:pyquery库使用

pyquery的使用

pyquery能够很好地利用CSS选择器对网页进行解析和查询

初始化

1.字符串初始化
html='''<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html) # 构造一个PyQuery对象,长字符串当做参数传入PyQuery类
print(doc('li')) # 将初始化对象传入CSS选择器
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
2.URL初始化
from pyquery import PyQuery as pq

# PyQuery对象会首先请求url,然后用得到的HTML内容来完成初始化
doc=pq(url="http://www.cuiqingcai.com")
print(doc('title'))
<title>静觅丨崔庆才的个人博客</title>&#13;
3.文件初始化
from pyquery import PyQuery as pq
# 先读取本地文件内容,再以字符串的形式传入PyQuery对象进行解析
doc=pq(filename='test.html')
print(doc('li'))
<li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </li>

基本CSS选择器

html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html) # 初始化PyQuery对象
print(doc('#container .list li')) # 选取id为'container'的节点内部class为'list'的所有li节点
print(type(doc('#container .list li')))
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
<class 'pyquery.pyquery.PyQuery'>

查找节点

1.子节点

find()的查找范围是节点的所有子孙节点

html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html)
items=doc('.list')
print(type(items))
print(items)
lis=items.find('li')
print(type(lis))
print(lis)
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

children()方法只查找直接子节点,不包含更深层的子孙节点

lis=items.children()
print(type(lis))
print(lis)
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
2.父节点

parent()获取某个节点的父节点

html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html)
items=doc('li')
container=items.parent()
print(type(container))
print(container)
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>

parents()方法能够返回某个节点的所有祖先节点

from pyquery import PyQuery as pq

html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''
doc=pq(html)
items=doc('li')
parents=items.parents()
print(type(parents))
print(parents)
print(parents('div')) # 筛选祖先节点时,依然传入CSS选择器
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div><ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
3.兄弟节点

用siblings()方法返回该节点所有的兄弟节点

html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html)
li=doc('.list .item-0.active')
print(li.siblings())
print(li.siblings('.active')) # 挑选出兄弟节点中class含有active的节点
<li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0">first item</li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
<li class="item-1 active"><a href="link4.html">fourth item</a></li>

遍历

html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html)
lis=doc('li').items() # 调用items()方法得到一个生成器,再进行遍历
print(type(lis)) 
for i in lis:
    print(i)
<class 'generator'>
<li class="item-0">first item</li>
         
<li class="item-1"><a href="link2.html">second item</a></li>
         
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
         
<li class="item-0"><a href="link5.html">fifth item</a></li>

获取信息

1.获取属性
html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html)
a=doc('.item-0.active a')
print(a,type(a))
print(a.attr('href')) # 选定一个节点后,用方法attr()传入属性名称得到属性值
print(a.attr.href) # 也可以用属性attr来获取属性值
print("-------------------------------")
a=doc('a')
print(a.attr('href')) # 当要获取属性的节点有多个同名节点时,只返回第一个节点的属性值
print("-------------------------------")
for item in a.items():
    print(item.attr('href')) # 可以通过遍历的方法来获取所有a的属性
<a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'>
link3.html
link3.html
-------------------------------
link2.html
-------------------------------
link2.html
link3.html
link4.html
link5.html
2.获取文本
html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html)
a=doc('.item-0.active a')
print(a)
print(a.text()) # 先获取节点再调用text()方法,返回的是节点内部的纯文本
print(a.html()) # 如果要获取节点中html文本,就要使用html()方法
<a href="link3.html"><span class="bold">third item</span></a>
third item
<span class="bold">third item</span>
html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html)
li=doc('li')
print(li.html())
print(li.text()) # 当选择多个节点时,text()方法返回所有节点的文本并用空格连接为一个字符串
print(type(li.text())) # 而html()方法,只返回第一个节点的html文本,如果要获取全部,需要遍历获取
first item
first item second item third item fourth item fifth item
<class 'str'>

节点操作

1.addClass 和 removeClass

用于对节点中class进行移除或添加

html='''<div id='container'>
    <ul class='list'>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>'''

from pyquery import PyQuery as pq

doc=pq(html)
li=doc('.item-0.active')
print(li)
li.removeClass('active') # 删除节点中active这个class
print(li)
li.addClass('active') # 在节点中添加active这个class
print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
2.attr、text 和 html

分别用于设置(添加或修改)节点的属性,文本内容和html文本

html='''
<ul class="list">
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
</ul>'''
from pyquery import PyQuery as pq

doc=pq(html)
li=doc('.item-0.active')
print(li)
li.attr('name','link') # 添加属性name='link'
print(li)
li.text('hello word') # 将li节点内部替换为'hello word'的文本
print(li)
li.html('<span>hhhhhh</span>')# 将li节点内部替换为'<span>hhhhhh</span>'的html文本
print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link">hello word</li>

<li class="item-0 active" name="link"><span>hhhhhh</span></li>
3.remove()

移除某个节点

html = '''
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
 </div>
'''
from pyquery import PyQuery as pq

doc=pq(html)
wrap=doc('.wrap')
print(wrap.text())
wrap.find('p').remove() #去掉div节点内部的p节点,再进行文本提取
print(wrap.text())
Hello, World
This is a paragraph.
Hello, World

伪类选择器

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc=pq(html)
li=doc('li:first-child') # first-child 伪类来选择元素的第一个子元素。
print(li)
li=doc('li:last-child') # last-child 伪类来选择元素的最后一个子元素。
print(li)
li=doc('li:nth-child(2)') # 选择所有li元素的父元素的第二个子元素
print(li)
li=doc('li:gt(2)') # 选中li中index值(从0开始)大于2的元素
print(li)
li=doc('li:nth-child(2n)') # 选择所有li元素的父元素的第偶数个子元素
print(li)
li=doc('li:contains(second)') # 包含某一文本的节点
print(li)
<li class="item-0">first item</li>
             
<li class="item-0"><a href="link5.html">fifth item</a></li>
         
<li class="item-1"><a href="link2.html">second item</a></li>
             
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         
<li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             
<li class="item-1"><a href="link2.html">second item</a></li>

参考:崔庆才《Python3网络爬虫开发实战》

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值