PyQuery

PyQuery

强大又灵活的网页解析库。如果你觉得正则写起来太麻烦,如果你觉得BeautifulSoup语法太难记,如果你熟悉jQuery的语法,那么PyQuery就是你的最佳选择。

字符串初始化

html = '''
<div id="container">
    <ul class="list">
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html) # 声明pyquery对象
print(doc('li'))   #同样使用CSS选择器,规则相似

输出

<li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>

URL初始化

from pyquery import PyQuery as pq
doc = pq(url="http://www.baidu.com")  #自动请求链接,并返回html
#另外还可以通过向filename传递参数进行文件初始化
print(doc('head'))

输出

<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下，你就知道</title></head>

基本CSS选择器

from pyquery import PyQuery as pq
doc = pq(html)  # html见前例
print(doc("#container .list li"))  
#选择器各部分以空格分隔,表示后代(层级)关系,不要求是直接的父子关系
<li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>

查找元素

1.子元素

from pyquery import PyQuery as pq
doc = pq(html)
items = doc(".list")
print(type(items))
print(items)
lis = items.find("li")  # 所有结果
print(type(lis))
print(lis)

输出

<class 'pyquery.pyquery.PyQuery'>   
#为pyquery对象说明可以调用与之相关的一切方法
<ul class="list">
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
from pyquery import PyQuery as pq
doc = pq(html)
items = doc(".list")
lis = items.children()  
# 查找直接子元素,也可以向children中传递参数用以筛选
print(items)
print(lis)
<ul class="list">
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>

<li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>

2.父元素

from pyquery import PyQuery as pq
doc = pq(html)
items = doc(".list")
container = items.parent()  #有且仅有一个父节点
print(type(container))
print(container)

输出

<class 'pyquery.pyquery.PyQuery'>
<div id="container">
    <ul class="list">
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc(".list")
parents = items.parents()
#返回所有的祖先节点,每个祖先节点依次输出一遍;可传入CSS选择器再进行筛选
print(type(parents))
print(parents)

输出

<class 'pyquery.pyquery.PyQuery'>
<div class="wrap">
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div><div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>

3.兄弟元素

from pyquery import PyQuery as pq
doc = pq(html)
li = doc(".list .item-0.active")   
#前面class=list代表要在此标签中寻找,
#后面两个class之间无空格代表要求class同时满足item-0与active
print(li.siblings())  #此时会输出除筛选标签以外的所有兄弟元素,
#还可以传入CSS选择器作为参数,例如".active",则会在结果中筛选class带有active的兄弟元素

输出

<li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0">first item</li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>

遍历

from pyquery import PyQuery as pq
doc = pq(html)
li = doc("li").items()
print(type(li))
for i in li:
    print(i)  #每一个i又是一个pyquery元素,可以使用pyquery方法

输出

<class 'generator'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>

获取信息

获取属性

from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')  #空格表示a位于前者之内
print(a)
print(a.attr('href'))   #获取属性
print(a.attr.href)   #获取属性

输出

<a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html

获取文本

from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text())  # 选中文字

输出

<a href="link3.html"><span class="bold">third item</span></a>
third item

获取html

from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active')
print(a)
print(a.html()) #除去li标签,剩下的内容

输出

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<a href="link3.html"><span class="bold">third item</span></a>

DOM操作

addClass、removeClass

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.remove_class('active')  # 移除
print(li)
li.add_class('active')  # 增添
print(li)

输出

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

attr、css

from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name','link')   #若name属性不存在,则会添加;
                         #若已存在,则改变
print(li)
li.css('font-size','14px')   #添加style属性
print(li)
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>

remove等等其他DOM方法

伪类选择器

from pyquery import PyQuery as pq
doc = pq(html)
li1 = doc('li:first-child')  #第一个子标签
li2 = doc('li:last-child')   #最后一个子标签
li3 = doc('li:nth-child(2)')   #第2个子标签(nth-child从1计数)
li4 = doc('li:gt(2)')  #索引比2大的标签,从0计数
li5 = doc('li:nth-child(2n)')   #位置为偶数的子标签(第2、4…个,从1计数)
li6 = doc('li:contains(second)')    #子标签中含有second字样的标签
print(li1,"\n",li2,"\n",li3,"\n",li4,"\n",li5,"\n",li6)

输出

<li class="item-0">first item</li>
         
 <li class="item-0"><a href="link5.html">fifth item</a></li>
     
 <li class="item-1"><a href="link2.html">second item</a></li>
         
 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
     
 <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         
 <li class="item-1"><a href="link2.html">second item</a></li>
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值