# bs4 / XPath / PyQuery parsing examples

from bs4 import BeautifulSoup

# Sample page used by all of the BeautifulSoup examples below.
markup = '''
    <body>
<header id="header">
    <h3 id="name">小强也可爱</h3>
    <title>标题</title>
  <div class="sns">
    <a href="http://www.kaikeba.com/feed/" target="_blank" rel="nofollow" title="RSS"><i class="fa fa-rss" aria-hidden="true"></i></a>
        <a href="http://kaikeba.com/kaikeba" target="_blank" rel="nofollow" title="Weibo"><i class="fa fa-weibo" aria-hidden="true"></i></a>
                <a href="https://www.kaikeba.com/in/kaikeba" target="_blank" rel="nofollow" title="Linkedin"><i class="fa fa-linkedin" aria-hidden="true"></i></a>
                <a href="mailto:kaikeba@gmail.com" target="_blank" rel="nofollow" title="envelope"><i class="fa fa-envelope" aria-hidden="true"></i></i></a>
          </div>
  <div class="nav">
   <ul><li class="current-menu-item"><a href="http://www.kaikeba.com/">hello</a></li>
<li><a href="http://www.kaikeba.com/about-me/">word</a></li>
<li><a href="http://www.kaikeba.com/post-search/">nihao</a></li>
<li><a href="http://www.kaikeba.com/wp-login.php">kkb</a></li>
</ul>  </div>
</header>
</body>
'''

# Parse the markup with the lxml backend.
soup = BeautifulSoup(markup, 'lxml')

# Pretty-print the parsed tree
# print(soup.prettify())

# Attribute-style access by tag name returns the FIRST matching tag
# print(soup.li)

# Name of a tag
# print(soup.title.name)

# Text inside a tag
# print(soup.title.string)

# Name of the parent tag of <title>
# print(soup.title.parent.name)

# Children of the first <li>
# print(soup.li.contents)

# Two equivalent ways to read a tag attribute
# print(soup.li["class"])
# print(soup.li.attrs['class'])

# CSS selectors via select(): '.' prefixes a class name, '#' prefixes an id
# print(soup.select('li'))
# print(soup.select('.current-menu-item'))

# Text of the first match
# print(soup.select('.current-menu-item')[0].get_text())
# Attribute of the first match
# print(soup.select('.current-menu-item')[0].attrs['class'])

# Direct-child combinator
# print(soup.select('li > a')[1].get_text())

# find() / find_all() with an attribute filter
# print(soup.find('li', attrs={'class': 'current-menu-item'}))
# print(soup.find_all('li', attrs={"class": "current-menu-item"}))

# BUGFIX: `requests` was previously imported only much further down the file,
# *after* the get_info() call below had already executed — a NameError.
import requests
from bs4 import BeautifulSoup


def get_info():
    """Download every lazily-loaded image linked from the listing page.

    Fetches https://www.mzitu.com/jiepai/, finds all ``<img class="lazy">``
    tags and saves the image behind each tag's ``data-original`` attribute
    into the current directory as a ``.png`` file.

    Returns:
        None. Side effect: one file written per image found.
    """
    headers = {
        # Browser-like User-Agent for the request.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }

    response = requests.get(url='https://www.mzitu.com/jiepai/', headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')

    # Lazily-loaded images keep their real URL in the data-original attribute.
    result = soup.find_all('img', attrs={'class': 'lazy'})

    for value in result:
        img_url = value.attrs['data-original']
        img_response = requests.get(url=img_url, headers=headers, timeout=10)

        # BUGFIX: the original slice [-5:-8:-1] named the file with three
        # *reversed* characters of the URL; use the URL's last path segment
        # (minus its extension) instead.
        stem = img_url.rsplit('/', 1)[-1].rsplit('.', 1)[0]
        path_name = './' + stem + '.png'

        with open(path_name, 'wb') as f:
            f.write(img_response.content)


get_info()
from lxml import etree

data_str = """
        <div>
            <ul>
                 <li class="item-0"><a href="link1.html">first item</a></li>
                 <li class="item-1"><a href="link2.html">second item</a></li>
                 <li class="item-inactive"><a href="link3.html">third item</a></li>
                 <li class="item-1"><a href="link4.html">fourth item</a></li>
                 <li class="item-0"><a href="link5.html">fifth item</a>
             </ul>
         </div>
        """
# NOTE: the last <li> above deliberately lacks its closing tag.

# etree.HTML() turns a str/bytes document into an Element object that
# supports .xpath(); the parser repairs the malformed markup on the way in.
html = etree.HTML(data_str)
# print(html)

# etree.tostring(html) serializes the *repaired* tree — useful for inspecting
# the fixed-up HTML before writing XPath expressions against it.
# result = etree.tostring(html)
# print(result.decode("utf-8"))

# href attribute of the <a> inside every li with class="item-1"
result = html.xpath('//li[@class="item-1"]/a/@href')
print(result)

# text of the <a> inside every li with class="item-1"
result = html.xpath('//li[@class="item-1"]/a/text()')
print(result)

# Expected output (was left in the file as bare pasted REPL expressions;
# converted to comments so it is no longer dead code):
# ['link2.html', 'link4.html']
# ['second item', 'fourth item']
from lxml import etree
import requests

# Fetch a Baidu search-result page and extract the first result's link.
search_url = 'http://www.baidu.com/s?wd=python'
response = requests.get(search_url)
page = etree.HTML(response.text)
# //*[@id="1"] addresses the element whose id is "1" on the result page.
first_href = page.xpath('//*[@id="1"]/h3/a[1]/@href')
print(first_href)
from pyquery import PyQuery as pq
# NOTE: a second, redundant `from pyquery import PyQuery as pq` that
# followed the HTML literal has been removed.

html = """
<html lang="en">
    <head>
        <title>PyQuery</title>
    </head>
    <body>
        <ul id="container">
            <li class="o1">MM</li>
            <li class="o2 active">MN<a class='o22'>fad</a></li>
            <li class="o3">GN</li>
        </ul>
    </body>
</html>
"""

# CSS selector rules: '#name' selects an id, '.name' selects a class, and a
# bare name selects a tag.

doc = pq(html)

# By tag name
# print(doc('title'))
# By id
# print(doc('#container'))
# By class
# print(doc('.o1'))

# Compound selectors:
#   '.o2.active'  (no space) -> one element carrying both classes
#   '.o2 .o22'    (space)    -> a .o22 descendant inside a .o2 element
# print(doc('.o2.active'))
# print(doc('.o2 .o22'))

# find() searches all descendants
# print(doc.find('li'))

# children() returns only the direct children
# container = doc.find('#container')
# print(container.children())


# Expected output of print(doc.find('li')) — pasted REPL output, converted to
# a comment because the raw HTML was a Python syntax error:
#   <li class="o1">MM</li>
#               <li class="o2 active">MN<a class="o22">fad</a></li>
#               <li class="o3">GN</li>

# NOTE(review): removed trailing CSDN page boilerplate (vote/favorite/comment
# widgets, red-packet and payment UI strings, balance notes) — web-scrape
# residue that was never part of the tutorial code above.