Python3爬虫之PyQuery库的使用

1.PyQuery库的介绍
PyQuery库是一个强大又灵活的网页解析库。如果不熟悉正则表达式,对BeautifulSoup的语法也不熟,但是熟悉jQuery的语法,那么使用PyQuery是最佳的选择。

2.PyQuery的安装
使用下面命令的前提是已经安装了Python和pip(例如通过anaconda安装),并且环境变量配置正确。
cmd命令:pip install pyquery

3.用法详解

html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
# 导入PyQuery库
from pyquery import PyQuery as pq
# 传入html,构造PyQuery对象
doc = pq(html)
# 传入选择器,PyQuery使用的是CSS选择器,就可以选择文档中满足条件的内容了
# css选择器,选择class 使用 .号,选择id使用#号,使用标签名什么都不用加
print(doc('li'))
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

4.URL初始化

from pyquery import PyQuery as pq
# 传入一个url,pq会访问url,并将收到的html文档构造成一个PyQuery对象
doc = pq(url='http://www.baidu.com')
# 使用选择器进行筛选想要的内容
print(doc('head'))
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head> 

5.文件初始化

from pyquery import PyQuery as pq
# 传入一个本地html文件,文件应该和py脚本在同一路径下,不在的话指定为全路径
# 构造成为PyQuery对象
doc = pq(filename='demo.html')
print(doc('li'))

6.基本的CSS选择器

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# 使用CSS选择器进行筛选,前后使用空格表示层次递进关系(包含或者叫嵌套关系),不一定是直接的父子关系
# 只要包含在内都在筛选范围内
print(doc('#container .list li'))
# 类型是一个PyQuery对象
print(type(doc('#container .list li')))
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
<class 'pyquery.pyquery.PyQuery'>

7.查找元素

查找子元素

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
# 导入库
from pyquery import PyQuery as pq
doc = pq(html)
# 使用css 选择器,直接输入选择器
items = doc('.list')
print(type(items))
print(items)
# items是一个PyQuery对象,还可以继续使用find(选择器)查找子元素内容
# 只要是在该标签内的子元素或者子孙元素,都会被find()查找到
lis = items.find('li')
print(type(lis))
print(lis)
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

# children()查找直接子元素
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
# 导入库
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
lis = items.children()
print(type(lis))
print(lis)
# 查找子元素中class=active的内容
lis = items.children('.active')
print(lis)
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>   

8.父元素
一个节点只有一个直接的父元素

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 传入css选择器,class = list
items = doc('.list')
# 获取父元素
container = items.parent()
print(type(container))
print(container)
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>

9.查找所有的父元素(即祖先元素)

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
# 构造PyQuery对象
doc = pq(html)
# 使用css选择器找到指定元素
items = doc('.list')
# 找到 class = list元素的所有祖先元素
parents = items.parents()
# 判断类型
print(type(parents))
# 输出内容
print(parents)
# 查找祖先元素中class=wrap的元素
parent = items.parents('.wrap')
print(parent)

<class 'pyquery.pyquery.PyQuery'>
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div><div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>

<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>

10.兄弟元素

# 使用siblings()查找所有的兄弟元素
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# 传入选择器,选中class=list元素中,同时具有class=item-0和class=active的元素
li = doc('.list .item-0.active')
# 查找所有的兄弟元素
print(li.siblings())
<li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0">first item</li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>

# 在siblings()方法中传入一个css选择器,从指定元素的兄弟节点中筛选
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 传入选择器,选中class=list元素中,同时具有class=item-0和class=active的元素
li = doc('.list .item-0.active')
# 传入css选择器,对指定元素的兄弟节点进行筛选
print(li.siblings('.active'))
<li class="item-1 active"><a href="link4.html">fourth item</a></li>    

11.遍历

单个元素

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 传入选择器,只检索一个的情况,css选择器中没有空格表示并列(同时满足)
lis = doc('.item-0.active')
print(lis)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

12.多个元素
选择器选中多个元素,如何遍历这些元素

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 传入选择器,然后有多个结果的话,使用items()方法可以让结果变成生成器对象
lis = doc('li').items()
print(type(lis))
# 遍历这个生成器,每个元素都是一个PyQuery对象
for li in lis:
    print(li)
<class 'generator'>
<li class="item-0">first item</li>
             
<li class="item-1"><a href="link2.html">second item</a></li>
             
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
             
<li class="item-0"><a href="link5.html">fifth item</a></li>  

13.获取信息,在获取到指定标签后,获取指定的信息和文本

获取属性

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 使用选择器获取指定标签
a = doc(".item-0.active a")
print(a)
# 使用attr指定属性名称,获取属性内容
print(a.attr('href'))
print(a.attr.href)
<a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html

14.获取文本,文本指的是标签中包含的文字

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 使用选择器获取指定标签
a = doc(".item-0.active a")
print(a)
# 使用 标签.text()方法来获取标签的文本
print(a.text())
<a href="link3.html"><span class="bold">third item</span></a>
third item

15.获取HTML

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 获取指定标签
li = doc('.item-0.active')
print(li)
# 获取这个标签内部的所有html内容(不包含标签自身),当一篇文章被一个标签包裹的时候,就可以使用这种方法
# 提取出其中的整段html
print(li.html())

16.DOM操作
对指定的节点(元素)的操作
addClass, removeClass

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 查找到指定元素
li = doc('.item-0.active')
print(li)
# 移除指定的标签中,class=active的属性
li.removeClass('active')
print(li)
# 添加class=active属性
li.addClass('active')
print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

17.元素中 attr,css的操作
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 找到指定的标签
li = doc('.item-0.active')
print(li)
# 给指定的标签添加属性,有就修改,没有就添加
li.attr('name','link')
# 
print(li)
# 添加css,即style属性
li.css('font-size','14px')
print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
             
<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>

18.remove
删除指定的标签

# 找到指定标签并删除
html = '''
<div class="wrap">
    Hello,World
    <p>This is a paragraph.</p>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
# 打印文本,会找到所有文本,但是我们只要'Hello,World'文本
print(wrap.text())
# 使用.remove方法删除p标签内容
wrap.find('p').remove()
# 然后打印文本
print(wrap.text())

Hello,World
This is a paragraph.
Hello,World

其他DOM方法
http://pyquery.readthedocs.io/en/latest/api.html

19.伪类选择器
使用css3中的伪类选择器定位元素

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
</div>
'''

from pyquery import PyQuery as pq
doc = pq(html)
# 查找含有li标签元素中的第一个元素
li = doc('li:first-child')
# 打印
print(li)
# 查找含有li标签元素中的最后一个标签
li = doc('li:last-child')
print(li)
# 查找含有li标签中的第二个标签
li = doc('li:nth-child(2)')
print(li)
# 查找下标大于2的标签(下标从0开始,即第4个及之后的li)
li = doc('li:gt(2)')
print(li)
# 查找序号为偶数的标签(nth-child的序号从1开始),查找奇数号就用2n+1
li = doc('li:nth-child(2n)')
print(li)
# 查找含有li标签中,内容包含second的元素
li = doc('li:contains(second)')
print(li)
<li class="item-0">first item</li>
             
<li class="item-0"><a href="link5.html">fifth item</a></li>
         
<li class="item-1"><a href="link2.html">second item</a></li>
             
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         
<li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             
<li class="item-1"><a href="link2.html">second item</a></li>

更多CSS选择器可以看 http://www.w3school.com.cn/css/index.asp

官方文档
http://pyquery.readthedocs.io/

  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值