- 什么是PyQuery
强大又灵活的网页解析库。如果熟悉 jQuery，PyQuery 将是绝佳的选择。
- 安装PyQuery
pip3 install pyquery
- 字符串初始化
# Build a PyQuery document directly from an HTML string.
from pyquery import PyQuery as pq
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = pq(html)
# Calling the document with a CSS selector returns the matching elements.
# print() is a function in Python 3 (the install step above uses pip3).
print(doc('li'))
- URL初始化
from pyquery import PyQuery as pq
# Passing url= makes PyQuery fetch the page and parse the response,
# so this snippet needs network access.
doc = pq(url='http://www.baidu.com')
print(doc("head"))
- 文件初始化
from pyquery import PyQuery as pq
# Passing filename= parses a local HTML file; demo.html must exist
# relative to the current working directory.
doc = pq(filename='demo.html')
print(doc("head"))
- 基本的CSS选择器
# Basic CSS selectors: id (#), class (.) and tag names can be combined.
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = pq(html)
# Select every <li> nested under class "list" inside the element with id "container".
print(doc("#container .list li"))
- 查找子元素
# Looking up child elements with find() and children().
# NOTE: the following examples (parents, siblings, iteration, attributes,
# text, HTML, DOM operations) reuse this `html` string.
from pyquery import PyQuery as pq
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
doc = pq(html)
items = doc(".list")
print(type(items))
print(items)
# find() searches all descendants that match the selector.
lis = items.find('li')
print(type(lis))
print(lis)
# children() only looks at direct children; the selector filters them.
lis = items.children('.active')
print(type(lis))
print(lis)
- 查找父元素
from pyquery import PyQuery as pq
# Uses the `html` string defined in the child-element example.
doc = pq(html)
# parent() returns the direct parent of the selection.
items = doc(".list").parent()
print(type(items))
print(items)
# parents() returns every ancestor of the selection.
items = doc(".list").parents()
print(type(items))
print(items)
- 查找兄弟元素
from pyquery import PyQuery as pq
# Uses the `html` string defined in the child-element example.
doc = pq(html)
# ".item-0.active" (no space) matches one element carrying both classes.
li = doc(".list .item-0.active")
# siblings() returns the other elements sharing the same parent.
print(type(li.siblings()))
print(li.siblings())
- 遍历
from pyquery import PyQuery as pq
# Uses the `html` string defined in the child-element example.
doc = pq(html)
# items() yields each matched element wrapped as its own PyQuery object.
lis = doc("li").items()
print(type(lis))
for li in lis:
    # The loop body must be indented, otherwise Python raises IndentationError.
    print(li)
- 获取属性
from pyquery import PyQuery as pq
# Uses the `html` string defined in the child-element example.
doc = pq(html)
a = doc(".item-0.active a")
print(type(a))
# Two equivalent ways to read an attribute: call style and attribute style.
print(a.attr('href'))
print(a.attr.href)
- 获取文本
from pyquery import PyQuery as pq
# Uses the `html` string defined in the child-element example.
doc = pq(html)
a = doc(".item-0.active a")
print(type(a))
# text() returns the text content with the markup stripped.
print(a.text())
- 获取HTML
from pyquery import PyQuery as pq
# Uses the `html` string defined in the child-element example.
doc = pq(html)
li = doc(".item-0.active")
# Printing the selection shows the element itself (outer markup) ...
print(li)
# ... while html() returns only the markup inside the element.
print(li.html())
- DOM操作
from pyquery import PyQuery as pq
# Uses the `html` string defined in the child-element example.
doc = pq(html)
li = doc(".item-0.active")
print(li)
# removeClass()/addClass() modify the class attribute of the selection in place.
li.removeClass('active')
print(li)
li.addClass('active')
print(li)
- 修改属性,CSS
from pyquery import PyQuery as pq
# Uses the `html` string defined in the child-element example.
doc = pq(html)
li = doc(".item-0.active")
print(li)
# attr(name, value) sets an attribute on the selection.
li.attr('name','link')
print(li)
# css(property, value) writes the property into the element's inline style.
li.css('font-size','14px')
print(li)
- remove
from pyquery import PyQuery as pq
# This example needs markup that contains a ".wrap" element with a <p> child;
# the `html` from the earlier examples has neither, so define a dedicated fixture.
html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
doc = pq(html)
wrap = doc('.wrap')
print(wrap)
# Before removal, text() includes the paragraph's text.
print(wrap.text())
# remove() deletes the matched <p> from the document in place.
wrap.find('p').remove()
# After removal only the wrapper's own text remains.
print(wrap.text())
- 伪类选择器
# Pseudo-class selectors: run each selector against the same document
# and print its matches in turn.
from pyquery import PyQuery as pq
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)
for selector in (
    'li:first-child',       # <li> that is the first child of its parent
    'li:last-child',        # <li> that is the last child of its parent
    'li:nth-child(2)',      # <li> that is the second child
    'li:gt(2)',             # jQuery-style: <li> with zero-based index greater than 2
    'li:nth-child(2n)',     # <li> at even positions
    'li:contains(second)',  # <li> whose text contains "second"
):
    li = doc(selector)
    print(li)