Python爬虫常用库(三)pyquery

一、初始化
(一)html代码初始化

from pyquery import PyQuery as pq

html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>'''
# Build a PyQuery document from the HTML string, then query it with a CSS selector.
doc = pq(html)
print(doc("div"))
# Output:
# <div>
#     <ul>
#          <li class="item-0">first item</li>
#          <li class="item-1"><a href="link2.html">second item</a></li>
#          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
#          <li class="item-0"><a href="link5.html">fifth item</a></li>
#      </ul>
# </div>

(二)url初始化

# Initialise from a URL instead of a string (this needs network access).
doc = pq(url="http://www.baidu.com")
print(doc("html"))
# Output (truncated):
# <html> <head><meta http-equiv=

(三)文件初始化

from pyquery import PyQuery as pq

# Initialise from a local file by passing its path as filename=.
file_name = "html.txt"
doc = pq(filename=file_name)
print(doc("div"))

二、返回的pyquery对象可以接受CSS选择器参数

from pyquery import PyQuery as pq

html = '''
<div id="aaa">
    <ul class="bbb">
         <ccc class="item-0">first item</ccc>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>'''

doc = pq(html)
# Descendant selector: id "aaa" -> class "bbb" -> tag <ccc>.
print(doc("#aaa .bbb ccc"))
# Output:
# <ccc class="item-0">first item</ccc>

三、查找元素
1、PyQuery.find()查找所有符合条件的子孙元素(不限于直接子元素)

# Parse the HTML fixture from the previous example, as every other snippet
# does, instead of re-wrapping the leftover PyQuery object.
doc = pq(html)

# Calling a PyQuery object with a selector returns the matching elements.
item = doc("#aaa")
print(item)

# The same call on a selection looks for matches inside that selection.
sub_item = item(".bbb")
print(sub_item)

# find() collects the <li> elements located under the selection.
lis = sub_item.find("li")
print(lis)

2、PyQuery.children()查找所有直接子元素

from pyquery import PyQuery as pq

html = '''
<div id="aaa">
    <ul class="bbb">
         <ccc class="item-0">first item</ccc>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>'''

doc = pq(html)
# children() looks only at direct children; the ".active" selector filters them.
print(doc.find(".bbb").children(".active"))
# Output:
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
#

3、PyQuery.parent()查找直接父元素,由于父元素唯一,因此可以不提供参数;PyQuery.parents()查找所有祖先元素,可以传入CSS选择器进行筛选

from pyquery import PyQuery as pq

html = '''
<div id="aaa">
    <ul class="bbb">
         <ccc class="item-0">first item</ccc>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>'''

doc = pq(html)
item = doc.find("ccc")
# parent() returns the direct parent; it is unique, so no selector is needed.
print(item.parent())
# parents() considers every ancestor; the selector keeps only those matching
# ".bbb".
print(item.parents(".bbb"))

4、PyQuery.siblings()查找所有的兄弟元素

from pyquery import PyQuery as pq

html = '''
<div id="aaa">
    <ul class="bbb">
         <ccc class="item-0">first item</ccc>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>'''

doc = pq(html)
item = doc.find("ccc")
# Only the sibling carrying both classes "item-0" and "active" is kept.
print(item.siblings(".item-0.active"))
# Output:
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

四、遍历
含有多个元素的PyQuery对象,可以使用.items()获取一个生成器,从而用for循环遍历所有元素

doc = pq(html)
# items() yields the matched elements one by one, so each can be handled
# separately inside the loop.
for item in doc.find("li").items():
    print(item)
# Output:
# <li class="item-1"><a href="link2.html">second item</a></li>
#
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
#
# <li class="item-0"><a href="link5.html">fifth item</a></li>

五、获取信息
(一)PyQuery.attr[“属性名”]、PyQuery.attr.属性名 获取属性(class是Python关键字,用属性名方式时需写成class_)

doc = pq(html)
item = doc.find("ccc")
# Two spellings for reading the same attribute. "class" is a Python keyword,
# so the attribute-style access uses a trailing underscore.
print(item.attr.class_)
print(item.attr["class"])

(二)PyQuery.text()获取标签内文本

doc = pq('<ccc class="item-0">first item</ccc>')
# text() gives the text contained in the tag.
print(doc.text())

(三)PyQuery.html()获取所有子标签的html文本

from pyquery import PyQuery as pq

html = '''
<div id="aaa">
    <ul class="bbb ddd">
         <ccc class="item-0">first item</ccc>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>'''
# Note: doc is bound to the selected <li> here, not to the whole document.
doc = pq(html).find(".item-0.active")
# Print the element itself, then html(), which gives only the markup inside it.
print(doc, doc.html(), sep = "\n")
# Output:
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#
# <a href="link3.html"><span class="bold">third item</span></a>

六、DOM操作
(一)pq.add_class()、pq.remove_class()增删标签的class属性成员

from pyquery import PyQuery as pq

html = '''
<div id="aaa">
    <ul class="bbb ddd">
         <ccc class="item-0">first item</ccc>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>'''

doc = pq(html)
item = doc.find(".item-0.active")
print(item)
# Output:
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
# add_class() appends a name to the element's class attribute.
item.add_class("extra_class")
print(item)
# Output:
# <li class="item-0 active extra_class"><a href="link3.html"><span class="bold">third item</span></a></li>
# remove_class() drops one name and leaves the remaining classes in place.
item.remove_class("active")
print(item)
# Output:
# <li class="item-0 extra_class"><a href="link3.html"><span class="bold">third item</span></a></li>

(二)pq.attr(“attr”, “val”)给标签增加属性attr=“val”

item = pq('<li class="item-0 active"></li>')
print(item)
# Output:
# <li class="item-0 active"/>
# attr(name, value) sets the attribute on the element.
item.attr("id", "00852")
print(item)
# Output:
# <li class="item-0 active" id="00852"/>

(三)pq.css(“val_1”, “val_2”)给标签增加style属性,style=“val_1: val_2”

item = pq('<li class="item-0 active"></li>')
print(item)
# Output:
# <li class="item-0 active"/>
# css(name, value) writes the declaration into the element's style attribute.
item.css("font-size", "14px")
print(item)
# Output:
# <li class="item-0 active" style="font-size: 14px"/>

(四)pq.remove(“css-selector”)移除标签

from pyquery import PyQuery as pq

'''
提取Hello, World文本
'''

html = '''
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
 </div>'''

doc = pq(html)
# Remove the <p> child so that only the wrapper's own text is left.
doc.find("p").remove()
print(doc)
# Output:
# <div class="wrap">
#     Hello, World
#
#  </div>
print(doc.text())
# Output:
# Hello, World

(五)伪类选择器

from pyquery import PyQuery as pq

html = '''
<div id="aaa">
    <ul class="bbb ddd">
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>'''

doc = pq(html)
print(doc('li:first-child'))# <li> that is the first child of its parent
# <li class="item-1"><a href="link2.html">second item</a></li>
print(doc("li:nth-child(2)"))# <li> that is the second child of its parent
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
print(doc("li:last-child"))# <li> that is the last child of its parent
# <li class="item-0"><a href="link5.html">fifth item</a></li>
print(doc("li:nth-child(2n)"))# <li> elements at even positions (2nd, 4th, ...)
# <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
#          <li class="item-0"><a href="link5.html">fifth item</a></li>
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值