PyQuery库详解

本文为大家介绍崔庆才老师对Python爬虫PyQuery库的讲解,包括基本原理及相关理论知识点

本文代码较多,建议阅读时间10分钟,并且注重理论与实践相结合

觉得文章比较枯燥或使用电脑观看的读者,可以点击“阅读原文”跳转到CSDN网页


目录:

一、什么是PyQuery库?

二、安装

三、PyQuery用法讲解



一、什么是PyQuery库?

强大而灵活的网页解析库。如果你觉得正则写起来太麻烦,如果你觉得BeautifulSoup语法太难记,如果你熟悉jQuery的语法,那么PyQuery就是你的绝佳选择!!!


二、安装

pip install pyquery

三、PyQuery用法讲解

  1. 初始化(3种)


  2. #字符串初始化
    html = '''
    <div>
        <ul>
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    
    print(doc('li'))#选择器实际上就是CSS选择器,即:选id就加“#”,选class前面加“.”
    
    
    #URL初始化
    doc1 = pq(url = "http://www.baidu.com")

    print(doc1("head"))

    #文件初始化
    doc2 = pq(filename = "demo.html")#自己下载一个HTML文件

    print(doc2('li'))
  3. 基本CSS选择器


  4. #CSS选择器
    html = '''
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    
    doc3 = pq(html)
    
    print(doc3("#container .list li"))#注意空格,空格代表嵌套关系
  5. 查找元素

    子元素

  6. #子元素(find)
    html = '''
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    
    items = doc(".list")#首先选中ul标签
    
    print(type(items))
    
    print(items)
    
    lis = items.find('li')#实际上也是一个CSS选择器,将里面所有的li标签都打印出来;只要在它里面的标签都可以找到
    
    print(type(lis))
    
    print(lis)
    #查找直接子元素
    lis2 = items.children()
    
    print(type(lis2))
    
    
    print(lis2)
    
    lis3 = items.children('.active')

    print(lis3)

    父元素

  7. #父元素
    html = '''
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''
    
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    
    items = doc(".list")#首先选中ul标签
    #每个标签外面肯定只能套一个父元素
    container = items.parent()

    print(type(container))

    print(container)


  8. #父元素2
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''

    from pyquery import PyQuery as pq

    doc = pq(html)

    items = doc(".list")#首先选中ul标签
    #将所有祖先节点返回
    parents = items.parents()

    print(parents)

    print(type(parents))#打印出两个div

    #在其中进行搜索
    doc = pq(html)

    items = doc(".list")

    parents1 = items.parents(".wrap")

    print(parents1)#通过筛选,只剩下一个div

    兄弟元素

  9. #兄弟元素
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''

    from pyquery import PyQuery as pq

    doc = pq(html)

    li = doc('.list .item-0.active')#首先选class=“.list”,空格即是选择list里面的标签,再选class=“item-0”,并列active(实际就是一个整体)

    print(li.siblings())#获取所有的兄弟元素

    #再在其中筛选
    print(li.siblings('.active'))
  10. 遍历 


  11. #单个元素
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    
    li = doc(".item-0.active")
    
    print(li)
    
    lis = doc('li').items()#多个元素,进行遍历,返回一个生成器

    print(type(lis))

    for li in lis:
        print(li)
  12. 获取信息


  13. #获取属性
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    
    a = doc(".item-0.active a")#选择class同时为item-0和active的标签,再选择其中的a标签,中间注意空格

    print(a)

    print(a.attr("href"))

    print(a.attr.href)#结果同上

    #获取文本
    print(a.text())#获取上面选中的标签所包围的文字

    #获取HTML
    a1 = doc(".item-0.active")

    print(a1.html())
  14. DOM操作


  15. #addClass,removeClass
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    
    li = doc(".item-0.active")

    print(li)

    li.removeClass("active")#移除active

    print(li)

    li.addClass("active")#增加active

    print(li)

    #attr、css
    doc = pq(html)

    li = doc(".item-0.active")

    li.attr("name","link")#若存在,就会覆盖

    print(li)

    li.css("font-size","14px")#增加style属性

    print(li)


  16. #remove
    html1 = '''
    <div class="wrap">
        Hello,World
        <p>This is a paragraph.</p>
    </div>
    '''
    
    from pyquery import PyQuery as pq
    
    doc = pq(html1)
    
    wrap = doc(".wrap")
    
    print(wrap.text())
    
    wrap.find('p').remove()
    
    print(wrap.text())

    其他DOM方法: http://pythonhosted.org/pyquery/


  17. #伪类选择器
    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="boid">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''
    
    from pyquery import PyQuery as pq
    
    doc = pq(html)
    
    li = doc("li:first-child")#第一个
    
    print(li)            
    
    li1 = doc('li:last-child')#最后一个
    
    print(li1)        
    
    li2 = doc('li:nth-child(2)')#指定顺序,第二个
    
    print(li2)
    
    li3 = doc("li:gt(2)")#索引大于2的(索引从0开始)
    
    print(li3)
    
    li4 = doc("li:nth-child(2n)")#偶数
    
    print(li4)
    
    li5 = doc("li:contains(second)")#内容包含second
    
    print(li5)

    更多CSS选择器可以查看:http://www.w3school.com.cn/css/index.asp


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值