PyQuery:网页解析库,相比于BeautifulSoup语法更简单
安装命令:
pip install pyquery
pyquery 初始化对象的三种方式:
1.字符串初始化:
#coding=utf-8
from pyquery import PyQuery as pq
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
# Parse the HTML string defined above into a PyQuery document
doc = pq(html)
# Select every <li> element in the document, then print the matched markup
li_elements = doc('li')
print(li_elements)
'''
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
<li class="element">Foo</li>
<li class="element">Bar</li>
'''
2.url初始化:当请求返回为gbk编码时可以设置 encoding='gbk'
#coding=utf-8
from pyquery import PyQuery as pq
# Build a PyQuery document by fetching the page at the given URL.
# The response is GBK-encoded, so encoding='gbk' is passed to decode it.
# NOTE: the query string previously contained "°reefrom=99", an HTML-entity
# mangling of "&degreefrom=99"; the parameter separator is restored here.
doc = pq(url="https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=",
         encoding='gbk')
# Select the page's <title> element and print it
print(doc('title'))
'''
<title>【全国招聘,求职】-前程无忧</title>
'''
3.文件初始化:
#coding=utf-8
from pyquery import PyQuery as pq
# Load and parse the local file demo.html into a PyQuery document
doc = pq(filename='demo.html')
# Print every <li> element found in that file
li_elements = doc('li')
print(li_elements)
'''
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
'''
4.基本CSS选择器
. 代表class
# 代表id
选择器之间以空格隔开,表示后代(嵌套)关系
当一个元素同时有多个 class(class 属性值中以空格分隔)时,比如:class="item-1 active"
获取元素时 用: .item-1.active (注意中间没有空格)
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = pq(html)
# Descendant selector: inside #container, take the element carrying both the
# item-0 and active classes, then the <a> nested inside it
active_link_selector = '#container .item-0.active a'
print(doc(active_link_selector))
'''
<a href="link3.html"><span class="bold">third item</span></a>
'''
5.查找元素:
查找后代元素: doc.find('li') 在所有后代节点中查找 li 标签(不限于直接子元素)
查找子元素: doc.children() 查找所有直接子元素
查找子元素: doc.children('a') 查找直接子元素中的 a 标签
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html">fourth item</a>
</ul>
</div>
'''
doc = pq(html)
item = doc('#container .list')
divider = "----------------------------"
# Three lookups relative to the <ul class="list"> selection, printed one per
# line with a divider between them:
#   find('li')     - <li> elements below the list
#   children('a')  - child elements that are <a> tags
#   children()     - every child element
print(
    item.find('li'),
    divider,
    item.children('a'),
    divider,
    item.children(),
    sep='\n',
)
'''
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
----------------------------
<a href="link4.html">fourth item</a>
----------------------------
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html">fourth item</a>
'''
查找父元素:
items.parent(): 查找直接父元素
items.parents():查找所有祖先元素
items.parents('.wrap'):查找指定祖先元素
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html">fourth item</a>
</ul>
</div>
'''
doc = pq(html)
item = doc('#container .list .item-0')
# Direct parent, every ancestor, and only the ancestors matching '.list'
direct_parent = item.parent()
all_ancestors = item.parents()
list_ancestors = item.parents('.list')
divider = "----------------------------"
for section in (direct_parent, divider, all_ancestors, divider, list_ancestors):
    print(section)
'''
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html">fourth item</a>
</ul>
----------------------------
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html">fourth item</a>
</ul>
</div><ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html">fourth item</a>
</ul>
----------------------------
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html">fourth item</a>
</ul>
'''
查找兄弟节点:
siblings():查找所有兄弟节点
siblings('.item-0'):查找所有 class 为 'item-0' 的兄弟节点
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html">fourth item</a>
</ul>
</div>
'''
doc = pq(html)
item = doc('#container .list a')
# All siblings of the selected <a> elements
every_sibling = item.siblings()
print(every_sibling)
print("-------------------------------")
# Only the siblings that carry the item-0 class
item0_siblings = item.siblings('.item-0')
print(item0_siblings)
'''
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
-------------------------------
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0">first item</li>
'''
6.遍历:
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html" id="test">fourth item</a>
</ul>
</div>
'''
doc = pq(html)
# .items() yields each matched <li> as its own PyQuery object, so every
# element can be processed individually
item = doc('#container .list li').items()
for one in item:
    # The loop body must be indented; without it Python raises IndentationError
    print(one)
'''
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
'''
7.获取信息:
获取属性:
获取文本:
获取html:
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html" id="test">fourth item</a>
</ul>
</div>
'''
doc = pq(html)
item = doc('#container .list .item-0')
a = item('a')
divider = '----------------------------------'
# Attribute access: two equivalent spellings for reading href
href_by_property = a.attr.href
print(href_by_property)
href_by_call = a.attr('href')
print(href_by_call)
print(divider)
# Text content of the selection
link_text = a.text()
print(link_text)
print(divider)
# Inner HTML of the selection
link_markup = a.html()
print(link_markup)
'''
link3.html
link3.html
----------------------------------
third item fifth item
----------------------------------
<span class="bold">third item</span>
'''
8.dom操作:
addClass、removeClass:此操作会永久生效(直接修改原对象)
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a href="link4.html" id="test">fourth item</a>
</ul>
</div>
'''
doc = pq(html)
li = doc('.item-0.active')
# removeClass() changes the selected element itself, so printing `li`
# afterwards shows the class already gone
without_active = li.removeClass('active')
print(without_active)
print(li)
print("------------------------------")
# addClass() likewise modifies the element in place
with_sub = li.addClass('sub')
print(with_sub)
print(li)
'''
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
------------------------------
<li class="item-0 sub"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 sub"><a href="link3.html"><span class="bold">third item</span></a></li>
'''
9.增加,修改,删除属性:attr、css
增加或者修改属性: a.attr('id','test') 添加或修改 id 属性
增加或者修改样式: a.css('font-size','14px') 设置 style 中的 CSS 样式(css() 操作的是 style 属性,而不是 id 等普通属性)
删除属性:a.remove_attr('class') 删除 class
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<a class="item" href="link4.html" id="test">fourth item</a>
</ul>
</div>
'''
doc = pq(html)
a = doc('.list .item')
# For each attribute in turn: set it with attr(), print the result, then drop
# it with remove_attr() and print again. A divider separates the two rounds.
for position, (attr_name, attr_value) in enumerate((('id', 'test'), ('class', 'test'))):
    if position:
        print("------------------------------")
    print(a.attr(attr_name, attr_value))
    print(a.remove_attr(attr_name))
'''
<a class="item" href="link4.html" id="test">fourth item</a>
<a class="item" href="link4.html">fourth item</a>
------------------------------
<a class="test" href="link4.html">fourth item</a>
<a href="link4.html">fourth item</a>
'''
10.删除标签
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div class="sss">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
doc = pq(html)
div_text = doc('.sss')
# Text of the <div> while the nested <p> is still present
print(div_text.text())
print("-------------------------")
# Remove the nested <p> from the div and print what remove() returned
removed_paragraph = div_text('p').remove()
print(removed_paragraph)
# Text of the <div> after the <p> has been removed
print(div_text.text())
'''
Hello, World
This is a paragraph.
-------------------------
<p>This is a paragraph.</p>
Hello, World
'''
after()在节点之后插入内容
before()在节点之前插入内容
append()将内容追加到每个节点内部的末尾
contents()返回所有子节点(包括文本节点)
empty()清空节点的内容(移除所有子节点)
val()设置或获取 value 属性的值
其他DOM方法:http://pyquery.readthedocs.io/en/latest/api.html
11.伪类选择器
# coding=utf-8
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
# Pseudo-class selectors demonstrated in order; each result is printed
pseudo_selectors = (
    'li:first-child',       # <li> that is the first child of its parent
    'li:last-child',        # <li> that is the last child of its parent
    'li:nth-child(2)',      # <li> that is the second child
    'li:gt(2)',             # <li> elements whose zero-based index is above 2
    'li:nth-child(2n)',     # <li> elements at even child positions
    'li:contains(second)',  # <li> elements whose text contains "second"
)
for selector in pseudo_selectors:
    li = doc(selector)
    print(li)
'''
<li class="item-0">first item</li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
'''