BeautifulSoup4--bs4

bs4概念
如何学习?
是什么? 能够做什么? 学习这个技术有什么意义?
BeautifulSoup4 美丽的汤 4
百度的网站 和 豆瓣的网站 结构

lxml pyquery BeautifulSoup4...网页信息提取库

解析网页的时候  因为每个网页的结构都不一样 所以采用最适合解析网页的技术

我们只需要记住一些方法的使用就可以了 requests get() post ...

另外的一种学习方法
源码分析 它主要的都是一些导航查找 修改的方法


bs4 快速入门
pip install lxml
pip install bs4

import bs4   (不用它)
from bs4 import BeautifulSoup   (用它)

soup = BeautifulSoup(html_doc, 'lxml')

Tag: 标签
NavigableString: 可导航的字符串
BeautifulSoup: soup 对象
Comment: 注释


遍历树.py

# @ Time : 2021/2/27 12:19
# @ Author : Ellen

from bs4 import BeautifulSoup

# Sample document used by the traversal demos below: a title, one "title"
# paragraph, and "story" paragraphs containing three <a class="sister"> links
# with ids link1..link3. Note that <body> and <html> are never closed here.
html_doc = """
 <html><head><title>The Dormouse's story</title></head>
 <body>
 <p class="title"><b>The Dormouse's story</b></p>

 <p class="story">Once upon a time there were three little sister and their names were
 <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p>
 """

# Parse the sample markup with the lxml parser backend.
soup = BeautifulSoup(html_doc, 'lxml')

'''
string获取标签里的内容
strings 返回是一个生成器对象来获取多个标签内容
stripped_strings 和 strings 基本一致 但是它可以把多余的空格去掉
'''
# title_tag = soup.title
# print(title_tag)
# print(title_tag.string)

# head_tag = soup.head
# print(head_tag.string)

# s = soup.strings
# for i in s:
#     print(i)

# s = soup.stripped_strings
# for i in s:
#     print(i)

# Walking upwards in the tree:
#   .parent   -> the direct parent of a node
#   .parents  -> iterates over all ancestors of a node
title_tag = soup.find('title')
# print(title_tag)
# print(title_tag.parent)
# print(soup.html.parent)

# Ancestors of the first <a> tag, separated by a dashed line:
# a_tag = soup.a
# print(a_tag.parents)
# for p in a_tag.parents:
#     print(p)
#     print('-' * 50)

# Sideways navigation between nodes that share the same parent:
#   .next_sibling       -> the sibling node right after this one
#   .previous_sibling   -> the sibling node right before this one
#   .next_siblings      -> iterates over all following siblings
#   .previous_siblings  -> iterates over all preceding siblings

# Minimal document in which <b> and <c> are siblings inside <a>.
html = '<a><b>bbb</b><c>ccc</c></a>'
soup2 = BeautifulSoup(html, features='lxml')
# print(soup2)
# print(soup2.prettify())

b_tag = soup2.find('b')
# print(b_tag)    # <b>bbb</b>
# print(b_tag.next_sibling)    # <c>ccc</c>
# print(b_tag.previous_sibling)  # None

# Back on the main sample document: print every node that precedes the
# link with id="link3" inside its parent.
a_tag = soup.find(id='link3')
# print(a_tag)
for sibling in a_tag.previous_siblings:
    print(sibling)

搜索树.py

# @ Time : 2021/2/27 18:55
# @ Author : Ellen

from bs4 import BeautifulSoup

# Sample document searched by the find()/find_all() filter demos below:
# a title, one "title" paragraph with a <b> tag, and "story" paragraphs
# containing three <a class="sister"> links (ids link1..link3).
html_doc = """
 <html><head><title>The Dormouse's story</title></head>
 <body>
 <p class="title"><b>The Dormouse's story</b></p>

 <p class="story">Once upon a time there were three little sister and their names were
 <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p>
 """

soup = BeautifulSoup(html_doc, features='lxml')

# String filter: a plain string such as 'a' is matched against tag names.
# a_tag = soup.find('a')
# print(a_tag)

# String filter with find_all(): collect every <a> tag.
a_tags = soup.find_all(name='a')
# print(a_tags)

# List filter: a tag matches when its name is any item of the list.
# Example for <p> and <a> tags:
# print(soup.find_all(['p', 'a']))
wanted_names = ['title', 'b']
print(soup.find_all(wanted_names))

find()和find_all()方法

# @ Time : 2021/2/27 19:04
# @ Author : Ellen

'''
find_all()方法以列表形式返回所有的搜索到的标签数据
find()方法返回搜索到的第一条数据
'''


from bs4 import BeautifulSoup

# Sample job-listing table used by the find()/find_all() exercises below.
# Layout: one header row (<tr class="h">) followed by ten data rows whose
# class alternates between "even" and "odd". Each data row has five <td>
# cells; the first cell wraps the job title in an <a> link. Only the <a> in
# the last row carries id="test" and class="test".
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
    <tbody>
        <tr class="h">
            <td class="l" width="374">职位名称</td>
            <td>职位类别</td>
            <td>人数</td>
            <td>地点</td>
            <td>发布时间</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师(深圳)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师(深圳)</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师(深圳)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师(深圳)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师(深圳)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师(深圳)</a></td>
            <td>技术类</td>
            <td>4</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师(深圳)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
    </tbody>
</table>
"""

soup = BeautifulSoup(html, features='lxml')

# Exercise 1: get the <tr> tags.
# soup.tr and soup.find('tr') return only the first match, while
# find_all('tr') returns every match as a list.
# print(soup.tr)
# print(soup.find('tr'))
# trs = soup.find_all('tr')
# for tr in trs:
#     print(tr)
#     print("-" * 50)

# Exercise 2: get the second <tr> tag (limit=2 caps the number of results).
# tr = soup.find_all('tr', limit=2)[1]
# print(tr)

# Exercise 3: get every <tr> whose class is "even".
# `class` is a Python keyword, so the keyword argument is spelled `class_`.
# trs = soup.find_all('tr', class_='even')
# for tr in trs:
#     print(tr)
#     print('-'*50)

# The same kind of filter expressed with an attrs dict (here for class "odd"):
# trs = soup.find_all('tr', attrs={'class': 'odd'})
# for tr in trs:
#     print(tr)
#     print('-'*50)

# Exercise 4: extract the <a> tags that have both id="test" and class="test".
# r = soup.find_all('a', id='test', class_='test')
# for a in r:
#     print(a)

# Exercise 5: read the href attribute of every <a> tag.
# Variant A: index the tag like a dictionary.
# a = soup.find_all('a')
# for i in a:
#     href = i['href']
#     print(href)
#
# Variant B: go through the tag's .attrs dictionary.
# a = soup.find_all('a')
# for i in a:
#     href = i.attrs['href']
#     print(href)

# Exercise 6: print the text of each job posting. Only the first cell of
# each row (the job title) is printed here.
for row in soup.find_all('tr')[1:]:  # [1:] skips the header row
    cells = row.find_all('td')
    print(cells[0].string)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值