爬虫学习-第十一篇

"""
Version: 0.1
Author: freshbin
Date: 2019年8月28日
"""

print("=================================pyquey使用 start================================================")

# Initialization
# Sample HTML document used as input by the pyquery examples in this script:
# a .wrap div holding a <p> and a #container div with a five-item <ul class="list">.
html = '''
<div class="wrap"><p>123</p>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="list item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
# from pyquery import PyQuery as pq

# doc = pq(html)
# print(doc('li'))

# URL初始化
# from pyquery import PyQuery as pq
# doc = pq(url='https://cuiqingcai.com')
# print(doc('title'))

# 文件初始化
# from pyquery import PyQuery as pq
# doc = pq(filename='demo.html')
# print(doc('li'))

# 基本CSS选择器
# from pyquery import PyQuery as pq
# doc = pq(html)
# print(doc('#container .list li'))
# print(type(doc('#container .list li')))

# 查找节点
# 查找子节点
# from pyquery import PyQuery as pq

# doc = pq(html)
# items = doc('.list')
# print(type(items))
# print(items)

# lis = items.find('li') # 查找所有子孙节点
# lis = items.children() # 查找子节点
# lis = items.children('.active') # 筛选出子节点中class为active的节点

# print(type(lis))
# print(lis)

# 父节点
# from pyquery import PyQuery as pq

# doc = pq(html)
# items = doc('.list')
# container = items.parent() # 直接父节点
# print(type(container))
# print(container)
# parents = items.parents('.wrap') # 祖先节点
# print(type(parents))
# print(parents)

# 兄弟节点
# from pyquery import PyQuery as pq
# doc = pq(html)
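# No space between .item-0 and .active: a space is the CSS descendant combinator,
# so '.item-0 .active' would look for an .active element nested inside an .item-0
# element, while '.item-0.active' matches a single element carrying both classes.
# li = doc('.list .item-0.active')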
# print(li.siblings('.active'))

# 遍历
# from pyquery import PyQuery as pq
# doc = pq(html)
# lis = doc('li').items()
# print(type(lis))
# for li in lis:
#     print(li, type(li))

# 获取信息
# from pyquery import PyQuery as pq
# doc = pq(html)
# a = doc('.item-0.active a')
# print(a, type(a))
# for item in a.items():
#     print(item.attr('href'))

# 获取文本
# from pyquery import PyQuery as pq
# doc = pq(html)
# a = doc('.item-0.active a')
# print(a)
# print(a.text()) # 返回所有text内容,所有节点取文本之后合并成一个字符串
# li = doc('.item-0.active')
# print(li)
# print(li.html()) # 返回第一个li节点的内部HTML文本,所以如果要获取所有节点,那么需要遍历

# 节点操作
# from pyquery import PyQuery as pq
# doc = pq(html)
# li = doc('.item-0.active')
# print(li)
# li.remove_class('active')
# print(li)
# li.add_class('active')
# print(li)

# attr、text和html
# from pyquery import PyQuery as pq
# doc = pq(html)
# li = doc('.item-0.active')
# print(li)
# li.attr('name', 'link')
# print(li)
# li.text('changed item')
# print(li)
# li.html('<span>changed item</span>')
# print(li)

# remove()
# from pyquery import PyQuery as pq
# doc = pq(html)
# wrap = doc('.wrap')
# wrap.find('p').remove()
# print(wrap.text())
# 更多方法见 http://pyquery.readthedocs.io/en/latest/api.html

# Pseudo-class selectors
# Each query below runs against the sample `html` document defined earlier in
# this script and prints whatever <li> nodes the selector matches.
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child')  # <li> that is the first child of its parent
print(li)
li = doc('li:last-child')  # fixed: was misspelled ':lats-child', which is not a pseudo-class
print(li)
li = doc('li:nth-child(2)')  # <li> that is the second child of its parent
print(li)
li = doc('li:gt(2)')  # pyquery jQuery-style extension: matches with index greater than 2
print(li)
li = doc('li:nth-child(2n)')  # <li> nodes at even child positions
print(li)
li = doc('li:contains(second)')  # fixed: was 'seconds', which never occurs in the sample text
print(li)
# More on CSS selectors: http://www.w3school.com.cn/css/index.asp
# Official pyquery documentation: http://pyquery.readthedocs.io

print("=================================pyquey使用 end================================================")

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值