一:安装
pip install pyquery
二:初始化
1,字符串初始化
# Build a PyQuery document directly from an HTML string.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div>'
    ' <ul>'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div> '
)

doc = pq(html)
print(doc('li'))  # select every <li> element
2,URL初始化
# Build a PyQuery document by fetching a URL (needs network access).
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
print(doc('head'))  # print the page's <head> element
3,文件初始化
# Build a PyQuery document from a local HTML file.
from pyquery import PyQuery as pq

doc = pq(filename='demo.html')
print(doc('li'))  # 'li' is the CSS selector
三:基本CSS选择器
# Select elements with a compound CSS selector.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div> '
)

doc = pq(html)
# <li> elements inside the class="list" element inside the id="container" element.
print(doc('#container .list li'))
四:查找元素
1,查找子元素
# Find descendants and direct children of a selection.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div> '
)

doc = pq(html)
items = doc('.list')
print(type(items))
print(items)

# find() searches inside the current `items` selection for 'li' elements.
lis = items.find('li')
print(type(lis))
print(lis)

# children() returns all direct children.
lis = items.children()
print(type(lis))
print(lis)

# children() also accepts a selector to filter the direct children.
lis = items.children('.active')
print(lis)
2,查找父元素
# Find the direct parent of a selection.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div> '
)

doc = pq(html)
items = doc('.list')
# .list has exactly one parent element here, so parent() is the right call.
container = items.parent()
print(type(container))
print(container)
# Find all ancestors of a selection, optionally filtered by a selector.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
items = doc('.list')

# .list has more than one ancestor here, so use parents() to get them all.
parents = items.parents()
print(type(parents))
print(parents)

# Pass a selector to keep only the matching ancestors.
parent = items.parents('.wrap')
print(parent)
3,查找兄弟元素
# Get every sibling of the selected element.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
# '.list .item-0.active' (with a space) selects the element carrying both
# item-0 and active that sits inside .list. Without the space,
# '.list.item-0.active' would instead require one element to carry all
# three classes.
li = doc('.list .item-0.active')
print(li.siblings())  # siblings() returns all sibling nodes
# Get only the siblings that match a selector.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
li = doc('.list .item-0.active')
# Keep only the siblings that carry the 'active' class.
print(li.siblings('.active'))
五:遍历
# Iterate over the matched elements one at a time.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
# items() lets us loop over the matched <li> elements individually.
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)
六:获取信息
1,获取属性
# Read an attribute from the selected element.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
# The space before 'a' means: the <a> inside the .item-0.active element.
a = doc('.item-0.active a')
print(a)
print(a.attr('href'))
print(a.attr.href)  # same result as the call above
2,获取文本
# Read the text content of the selected element.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
# The space before 'a' means: the <a> inside the .item-0.active element.
a = doc('.item-0.active a')
print(a)
print(a.text())  # get the text content
3,获取HTML
# Read the HTML of the selected element.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
# The space before 'a' means: the <a> inside the .item-0.active element.
a = doc('.item-0.active a')
print(a)
print(a.html())  # get the HTML
七:DOM操作(节点操作)
1,addClass添加class、removeClass移除class
# Remove a class from the selected element and add it back.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
li = doc('.item-0.active')
print(li)

li.removeClass('active')  # remove the 'active' class
print(li)

li.addClass('active')  # add the 'active' class back
print(li)
2,attr css
# Set an attribute and an inline CSS property on the selected element.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)
li = doc('.item-0.active')
print(li)

# attr(name, value) adds name="link" to the <li>, overwriting any old value.
li.attr('name', 'link')
print(li)

# css(property, value) adds the property to the element's style attribute.
# The value must be a string: a bare 14px is not valid Python.
li.css('font-size', '14px')
print(li)
3, remove
# Remove a child node so that only the parent's own text remains.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' Hello,World'
    ' <p>This is a paragraph.</p>'
    ' </div> '
)

doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())

# remove() drops the <p> element, so the next print shows only Hello,World.
wrap.find('p').remove()
print(wrap.text())
4,其他DOM方法
http://pyquery.readthedocs.io/en/latest/api.html
八:伪类选择器
# Select elements with CSS pseudo-class selectors.
from pyquery import PyQuery as pq

# Adjacent literals are concatenated, so the markup stays one string.
html = (
    ' <div class="wrap">'
    ' <div id="container">'
    ' <ul class="list">'
    ' <li class="item-0">first item</li>'
    ' <li class="item-1"><a href="link2.html">second item</a></li>'
    ' <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>'
    ' <li class="item-1 active"><a href="link4.html">fourth item</a></li>'
    ' <li class="item-0"><a href="link5.html">fifth item</a></li>'
    ' </ul>'
    ' </div>'
    ' </div> '
)

doc = pq(html)

li = doc('li:first-child')  # the <li> that is the first child
print(li)

li = doc('li:last-child')  # the <li> that is the last child
print(li)

li = doc('li:nth-child(2)')  # the <li> that is the second child
print(li)

li = doc('li:gt(2)')  # <li> elements whose index is greater than 2
print(li)

li = doc('li:nth-child(2n)')  # <li> elements at even positions
print(li)

li = doc('li:contains(second)')  # <li> elements containing the text "second"
print(li)
更多CSS选择器可以查看http://www.w3school.com.cn/css/index.asp
九:官方文档
http://pyquery.readthedocs.io/