PyQuery详解

一:安装pyquery

pip install pyquery

二:初始化

1,字符串初始化

html='''
<div>
  <ul>
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li'))

2,URL初始化

from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
print(doc('head'))

3,文件初始化

from pyquery import PyQuery as pq
doc = pq(filename='demo.html')
print(doc('li')) #'li'为选择器

三:基本CSS选择器

html='''
<div id="container">
  <ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li')) #查找id为container里面的class为list的li标签

四:查找元素

1,查找子元素
html='''
<div id="container">
  <ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)
lis = items.find('li')   #find查找当前items元素里面的'li'
print(type(lis))
print(lis)
lis = items.children() #children查找所有直接子元素
print(type(lis))
print(lis)
lis = items.children('.active')
print(lis) # children传入选择器'.active'时,只保留符合条件的直接子元素
2,查找父元素
html='''
<div id="container">
  <ul class="list">
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
container = items.parent() # parent()只返回直接父元素(这里是id为container的div)
print(type(container))
print(container)
html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
container = items.parents() # parents()返回所有祖先元素(这里包括id为container的div和class为wrap的div)
print(type(container))
print(container)
parent = items.parents('.wrap')
print(parent)
3,查找兄弟元素
html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active') #选择'.list'里面的'.item-0.active'标签
#li = doc('.list.item-0.active') #没有空格时表示同一个元素同时具有list、item-0、active三个class;有空格时表示'.list'内部的后代元素
print(li.siblings()) # siblings()获取所有兄弟节点
html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')
print(li.siblings('.active'))

五:遍历

html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)

六:获取信息

1,获取属性
html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a') #a前面的空格表示里面的a标签
print(a)
print(a.attr('href'))
print(a.attr.href) # 结果同上
2,获取文本
html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a') #a前面的空格表示里面的a标签
print(a)
print(a.text())  # 获取文本
3,获取HTML
html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a') #a前面的空格表示里面的a标签
print(a)
print(a.html())  # 获取html

七:DOM操作(节点操作)

1,addClass添加class removeClass移除class
html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active') #移除名为active的class
print(li)
li.addClass('active') #添加名为active的class
print(li)
2,attr css
html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name','link') #attr把name=link属性添加覆盖到li标签
print(li)
li.css('font-size','14px') #css把style=font-size:14px的属性添加到li标签
print(li)
3, remove
html = '''
<div class="wrap">
  Hello,World
  <p>This is a paragraph.</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove() #remove()移除p标签,以便下一步打印Hello,World
print(wrap.text())
4,其他DOM方法

http://pyquery.readthedocs.io/en/latest/api.html

八:伪类选择器

html='''
<div class="wrap">
  <div id="container">
    <ul class="list">
      <li class="item-0">first item</li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
      <li class="item-1 active"><a href="link4.html">fourth item</a></li>
      <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
  </div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child') #选择作为其父元素第一个子元素的li标签
print(li)
li = doc('li:last-child')  #选择作为其父元素最后一个子元素的li标签
print(li)
li = doc('li:nth-child(2)')  #nth-child(2)选择作为其父元素第二个子元素的li标签
print(li)
li = doc('li:gt(2)') # 选择序号比2大的标签(序号从0开始)
print(li)
li = doc('li:nth-child(2n)') # nth-child(2n)选择位于偶数位置的li标签
print(li)
li = doc('li:contains(second)') # 查找包含second文本的标签
print(li)

更多CSS选择器可以查看http://www.w3school.com.cn/css/index.asp

九:官方文档

http://pyquery.readthedocs.io/

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值