Python爬虫理论Pro | (6) PyQuery

在本篇博客中,我们将介绍PyQuery的用法。它是一个强大又灵活的网页解析库:如果你觉得正则表达式太麻烦、BeautifulSoup的语法太难记,而又熟悉jQuery的语法,那么PyQuery是最佳的选择。

安装: pip install pyquery

目录

1. 初始化

2. 基本CSS选择器

3. 遍历

4. 获取信息

5. DOM操作

6. 伪类选择器

7. 官方文档


1. 初始化

  • 字符串初始化
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html) #使用字符串初始化 pq对象 最常用
print(doc('li')) #通过标签名 选择所有li标签

  • URL初始化
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com') #通过url初始化pq对象 会自动请求url 得到html
print(doc('head'))#通过标签名 选择所有head标签
  • 文件初始化
from pyquery import PyQuery as pq
doc = pq(filename='demo.html')#通过文件初始化pq对象 
print(doc('li'))#通过标签名 选择所有li标签

 

2. 基本CSS选择器

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li')) # 选择所有id="container"的标签下所有class="list"的标签下的所有li标签 不一定是直接子节点 有层级关系就行

  • 查找元素

子元素:

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list') #class="list"的所有标签
print(type(items))
print(items)
lis = items.find('li') #查找items下所有li标签(不用是直接子节点 在items下就行) 嵌套调用 
print(type(lis))
print(lis)

lis = items.children() #查找所有直接子节点
print(type(lis))
print(lis)

lis = items.children('.active') #查找items下 class="active"的所有直接子节点
print(lis)

 

父元素:

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')#找class="list"的所有标签
container = items.parent() #找items的直接父标签
print(type(container))
print(container)

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')#找class="list"的所有标签
parents = items.parents()#查找items的所有祖先标签
print(type(parents))
print(parents)
parent = items.parents('.wrap') #查找items 满足class="wrap"的所有祖先节点
print(parent)

兄弟元素:

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')#查找class="list"的标签下 同时具有item-0和active两个class的所有标签 选择器中间没有空格表示并列(同一个标签同时满足)
print(li.siblings()) #查找li的所有兄弟标签

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')
print(li.siblings('.active'))#查找li满足class="active"的所有兄弟标签

 

3. 遍历

  • 单个元素
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')#查找class="item-0 active"的所有标签 并列
print(li)

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items() #查找所有li标签 如果想对每个结果单独操作 需要先调用.items()得到一个生成器
print(type(lis))
for li in lis: #用for循环遍历生成器 每次取出的li都是一个独立的PyQuery对象 可以单独操作
    print(li)

4. 获取信息

  • 获取属性
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')#查找class="item-0 active"的所有标签 下的所有a标签
print(a)
print(a.attr('href'))#选择href属性值
print(a.attr.href) #另一种写法

  • 获取文本
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text()) #不管a标签中包含了什么其他标签 .text()都能获取a标签包含的所有文本

  • 获取HTML
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
print(li.html())#获取li里面的HTML代码

 

5. DOM操作

  • addClass、removeClass
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')#查找class="item-0 active"的所有标签 
print(li)
li.removeClass('active')#去除class="active"
print(li)
li.addClass('active')#添加class="active"
print(li)

  • attr、css
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link') #没有的话 添加属性 name="link",有的话修改name属性值
print(li)
li.css('font-size', '14px') #添加style属性
print(li)

  • remove
html = '''
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')  #查找class="wrap"的所有标签
print(wrap.text())#获取标签内的所有文本
wrap.find('p').remove() #移除p标签
print(wrap.text()) #获取文本 此时就可以只获取Hello,World了

  • 其他DOM方法

http://pyquery.readthedocs.io/en/latest/api.html

 

6. 伪类选择器

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child') #选择作为其父节点第一个子节点的li标签(本例中即第一个li)
print(li)
li = doc('li:last-child')#选择作为其父节点最后一个子节点的li标签(本例中即最后一个li)
print(li)
li = doc('li:nth-child(2)')#选择作为其父节点第二个子节点的li标签(位置从1开始计数)
print(li)
li = doc('li:gt(2)') #选择序号大于2的li标签(序号从0开始 不包含序号2 即第4个及之后的li)
print(li)
li = doc('li:nth-child(2n)')#选择位置为偶数的li标签(位置从1开始计数) 即第2,4...个
print(li)
li = doc('li:contains(second)') #选择包含second文本的li标签
print(li)

  • 更多css选择器

http://www.w3school.com.cn/css/index.asp

 

7. 官方文档

http://pyquery.readthedocs.io/

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值