python3 2018分布式爬虫教程 -7 PyQuery 库详解

PyQuery:网页解析库,相比于BeautifulSoup语法更简单

安装命令:

pip install pyquery

pyquery 初始化对象的三种方式:

1.字符串初始化:

#coding=utf-8

from pyquery import PyQuery as pq

html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
# Build a PyQuery document from the HTML string defined above.
doc = pq(html)
# Select every <li> element in the document, then print the matched set.
li_elements = doc('li')
print(li_elements)

'''
<li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        <li class="element">Foo</li>
            <li class="element">Bar</li>
'''

2.url初始化:当请求返回为gbk编码时可以设置 encoding='gbk' 

#coding=utf-8

from pyquery import PyQuery as pq

# Build a PyQuery document from a URL. encoding='gbk' is passed because the
# response from this site is expected to be GBK-encoded.
search_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
doc = pq(url=search_url, encoding='gbk')
# Select the page's <title> element and print it.
page_title = doc('title')
print(page_title)


'''
<title>【全国招聘,求职】-前程无忧</title>
'''

3.文件初始化:

#coding=utf-8

from pyquery import PyQuery as pq
# Build a PyQuery document from the local file demo.html.
doc = pq(filename='demo.html')
# Select every <li> element found in that file and print the matched set.
li_elements = doc('li')
print(li_elements)

'''
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
'''

4.基本CSS选择器

. 代表class

# 代表id

中间以空格隔开

当一个 class 或者 id 的内容有空格时:比如:class="item-1 active"

获取元素时 用: .item-1.active    (注意中间没有空格)

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
doc = pq(html)
# Descendant selector: inside #container, find the element carrying BOTH the
# item-0 and active classes (no space between them), then take the <a>
# elements beneath it.
selector = '#container .item-0.active a'
print(doc(selector))

'''
<a href="link3.html"><span class="bold">third item</span></a>
'''

5.查找元素:

查找后代元素: doc.find('li') 查找所有后代元素(不限于直接子元素)中的 li 标签

查找子元素: doc.children() 查找所有子元素

查找子元素: doc.children('a') 查找所有子元素为 a 标签的元素

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html">fourth item</a>
     </ul>
 </div>
'''
doc = pq(html)
item = doc('#container .list')

# find('li'): the <li> elements located beneath the <ul>.
li_matches = item.find('li')
print(li_matches)
print("----------------------------")
# children('a'): only the direct children of the <ul> that are <a> elements.
a_children = item.children('a')
print(a_children)
print("----------------------------")
# children() without a selector: every direct child of the <ul>.
all_children = item.children()
print(all_children)

'''
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         
----------------------------
<a href="link4.html">fourth item</a>
     
----------------------------
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html">fourth item</a>
'''

查找父元素:

items.parent(): 查找直接父元素

items.parents():查找所有祖先元素

items.parents('.wrap'):查找指定祖先元素

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html">fourth item</a>
     </ul>
 </div>
'''
doc = pq(html)
item = doc('#container .list .item-0')

# parent(): the direct parent of the matched <li> elements.
direct_parent = item.parent()
print(direct_parent)
print("----------------------------")
# parents(): every ancestor of the matched elements.
all_ancestors = item.parents()
print(all_ancestors)
print("----------------------------")
# parents('.list'): only the ancestors that match the given selector.
list_ancestors = item.parents('.list')
print(list_ancestors)

'''
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html">fourth item</a>
     </ul>
 
----------------------------
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html">fourth item</a>
     </ul>
 </div><ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html">fourth item</a>
     </ul>
 
----------------------------
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html">fourth item</a>
     </ul>
'''

查找兄弟节点:

siblings():查找所有兄弟节点

siblings('.item-0'):查找所有 class 为 'item-0' 的兄弟节点

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html">fourth item</a>
     </ul>
 </div>
'''
doc = pq(html)
item = doc('#container .list a')

# siblings(): all sibling nodes of the matched elements.
all_siblings = item.siblings()
print(all_siblings)
print("-------------------------------")
# siblings('.item-0'): only the siblings whose class includes item-0.
item0_siblings = item.siblings('.item-0')
print(item0_siblings)


'''
<li class="item-0"><a href="link5.html">fifth item</a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0">first item</li>
         
-------------------------------
<li class="item-0"><a href="link5.html">fifth item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-0">first item</li>
'''

6.遍历:

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html" id="test">fourth item</a>
     </ul>
 </div>
'''
doc = pq(html)
li_selection = doc('#container .list li')
# items() lets us walk the matched <li> elements one at a time; each one is
# printed on its own.
for one in li_selection.items():
    print(one)

'''
<li class="item-0">first item</li>
         
<li class="item-1"><a href="link2.html">second item</a></li>
         
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
         
<li class="item-0"><a href="link5.html">fifth item</a></li>
'''

7.获取信息:

获取属性:

获取文本:

获取html:

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html" id="test">fourth item</a>
     </ul>
 </div>
'''
doc = pq(html)
item = doc('#container .list .item-0')
a = item('a')

# Attribute access: both spellings read the href attribute.
href_via_property = a.attr.href
href_via_call = a.attr('href')
print(href_via_property)
print(href_via_call)
print('----------------------------------')
# text(): the text content of the matched <a> elements.
link_text = a.text()
print(link_text)
print('----------------------------------')
# html(): the inner HTML markup.
inner_markup = a.html()
print(inner_markup)

'''
link3.html
link3.html
----------------------------------
third item fifth item
----------------------------------
<span class="bold">third item</span>
'''

8.dom操作:

addClass、removeClass:此操作会直接修改原对象,修改永久生效

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a href="link4.html" id="test">fourth item</a>
     </ul>
 </div>
'''
doc = pq(html)
li = doc('.item-0.active')

# removeClass() changes the selected element itself: printing the call's
# result and printing li afterwards show the same markup.
after_remove = li.removeClass('active')
print(after_remove)
print(li)
print("------------------------------")
# addClass() likewise updates li in place.
after_add = li.addClass('sub')
print(after_add)
print(li)


'''
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
         
------------------------------
<li class="item-0 sub"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0 sub"><a href="link3.html"><span class="bold">third item</span></a></li>
'''

9.增加,修改,删除属性:attr、css

增加或者修改属性: a.attr('id','test')        添加 id 属性

增加或者修改样式: a.css('font-size','14px')  设置 style 属性中的 CSS 样式(css 用于样式,不能用来添加 id 属性)

删除属性:a.remove_attr('class')             删除 class

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
         <a class="item" href="link4.html" id="test">fourth item</a>
     </ul>
 </div>
'''
doc = pq(html)
a = doc('.list .item')

# attr(name, value) sets an attribute; remove_attr(name) deletes it.
with_id = a.attr('id', 'test')
print(with_id)
without_id = a.remove_attr('id')
print(without_id)
print("------------------------------")
# Same pair of operations, this time on the class attribute.
with_class = a.attr('class', 'test')
print(with_class)
without_class = a.remove_attr('class')
print(without_class)


'''
<a class="item" href="link4.html" id="test">fourth item</a>
     
<a class="item" href="link4.html">fourth item</a>
     
------------------------------
<a class="test" href="link4.html">fourth item</a>
     
<a href="link4.html">fourth item</a>
'''

10.删除标签

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div class="sss">
    Hello, World
    <p>This is a paragraph.</p>
 </div>
'''

doc = pq(html)
div_text = doc('.sss')

# Text of the <div> while the <p> is still present.
text_before = div_text.text()
print(text_before)
print("-------------------------")
# remove() takes the <p> out of the <div>; the printed value is the removed
# element.
removed_paragraph = div_text('p').remove()
print(removed_paragraph)
# The paragraph's text is no longer part of the <div>'s text.
text_after = div_text.text()
print(text_after)


'''
Hello, World
This is a paragraph.
-------------------------
<p>This is a paragraph.</p>
 
Hello, World

'''

after()在节点后添加值

before()在节点之前插入值

append()将值添加到每个节点

contents()返回节点的全部内容(包含文本节点)

empty()删除节点内容

val()设置或获取 value 属性值

其他DOM方法:http://pyquery.readthedocs.io/en/latest/api.html

11.伪类选择器

# coding=utf-8

from pyquery import PyQuery as pq

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
doc = pq(html)

# Pseudo-class selectors to demonstrate, evaluated and printed in order.
pseudo_selectors = (
    'li:first-child',       # <li> that is the first child of its parent
    'li:last-child',        # <li> that is the last child of its parent
    'li:nth-child(2)',      # <li> that is the second child
    'li:gt(2)',             # <li> elements whose zero-based index is > 2
    'li:nth-child(2n)',     # <li> elements at even child positions
    'li:contains(second)',  # <li> elements whose text contains "second"
)
for selector in pseudo_selectors:
    li = doc(selector)
    print(li)


'''
<li class="item-0">first item</li>
             
<li class="item-0"><a href="link5.html">fifth item</a></li>
         
<li class="item-1"><a href="link2.html">second item</a></li>
             
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         
<li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             
<li class="item-1"><a href="link2.html">second item</a></li>
'''

 

 

 

 

 

 

 

 

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Toroidals

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值