BeautifulSoup库(笔记)

最新推荐文章于 2021-03-25 19:37:46 发布

果、失

最新推荐文章于 2021-03-25 19:37:46 发布

阅读量147

点赞数

文章标签： python 爬虫

本文链接：https://blog.csdn.net/qq_50958709/article/details/112692436

版权

from bs4 import BeautifulSoup
# Sample document used to demonstrate the four BeautifulSoup object kinds.
html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<g><!--zz,hhh--></g>
'''
# 'lxml' names the parser (several parsers exist); it also completes the markup.
# soup = BeautifulSoup(html, 'lxml')
# print(soup)
# print(soup.prettify())    # indented output, easier to read

# 1. Tag: one whole element of the document.
soup = BeautifulSoup(html, 'lxml')
# soup.<tag> returns the FIRST matching tag, so keep one handle on the first <p>.
first_p = soup.p
print(soup.title)
# .name gives the tag name; this raises an error if no such tag exists.
print(first_p.name)
# .attrs gives the tag's attributes as a dict.
print(first_p.attrs)
# Indexing and .get() both read the value of the class attribute.
for class_value in (first_p['class'], first_p.get('class')):
    print(class_value)
# Attributes can be reassigned through indexing; .get() cannot be assigned to.
first_p['class'] = 'new'

# 2. NavigableString: the text inside a tag.
print(first_p.string)

# 3. BeautifulSoup: represents the whole document and behaves much like a Tag.

# 4. Comment: the content of an HTML comment can be extracted too.
print(soup.g.string)

# Traversing the document tree
from bs4 import BeautifulSoup
html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<g><!--zz,hhh--></g>
'''
soup = BeautifulSoup(html, 'lxml')
head_tag = soup.head

# .contents: all direct children of <head>, returned as a list.
print(head_tag.contents)
# .children: the same children as an iterator; loop over it to see the nodes.
print(head_tag.children)
for child in head_tag.children:
    print(child)

# .strings yields every text node in the document, whitespace-only ones
# included (print repr() of each item to make the newlines visible).
print(*soup.strings, sep='\n')

# .stripped_strings yields the same text with surrounding whitespace removed
# and blank strings skipped.
print(*soup.stripped_strings, sep='\n')

# find and find_all methods
from bs4 import BeautifulSoup
# Sample job-listing table: one header row (class "h") followed by ten rows
# alternating between class "even" and "odd". Each data row holds a link cell
# (position name) and four plain-text cells.
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
    <tbody>
        <tr class="h">
            <td class="l" width="374">职位名称</td>
            <td>职位类别</td>
            <td>人数</td>
            <td>地点</td>
            <td>发布时间</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>4</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
    </tbody>
</table>
"""

# Parse the job-listing table held in `html`.
soup = BeautifulSoup(html, 'lxml')

# 1. Get every <tr> tag.
# trs = soup.find_all('tr')
# for tr in trs:
#     print(tr)

# 2. Get the second <tr> tag.
# print(trs[1])

# 3. Get every <tr> whose class equals "even". `class` is a Python keyword,
#    so the keyword argument is spelled `class_`.
# trs = soup.find_all('tr', class_='even')

# 4. Get every <a> whose id AND class both equal "test".
# trs = soup.find_all('a', id='test', class_='test')

# 5. Get the href attribute of every <a>: fetch the tags, then index them.
# for a in soup.find_all('a'):
#     print(a['href'])

# 6. Get every job posting as plain text. The first <tr> is the header row,
#    so [1:] skips it.
trs = soup.find_all('tr')[1:]
lists = []  # collects the text of every data row, in document order
for tr in trs:
    # get_text() joins all text under the row and keeps the markup's layout
    # whitespace. Alternatives: list(tr.stripped_strings) for whitespace-free
    # cell texts, or tr.find_all('td') and read each cell's .string.
    infos = tr.get_text()
    lists.append(infos)
    print(infos)

# select() method (CSS selectors)
from bs4 import BeautifulSoup

# These selectors target the "Dormouse" sample document. At this point `html`
# holds the job-listing table, which has no <title>, .sister or #link1, so the
# story markup is parsed explicitly here; otherwise step (6) would raise
# IndexError on an empty result list.
story_html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<g><!--zz,hhh--></g>
'''
soup = BeautifulSoup(story_html, 'lxml')
# (1) Select by tag name; select() always returns a list.
print(soup.select('a'))
# (2) Select by class name.
print(soup.select('.sister'))
# (3) Select by id.
print(soup.select('#link1'))
# (4) Combined selector: the element with id "link1" inside a <p>.
print(soup.select('p #link1'))
# (5) Select by attribute value.
print(soup.select('a[href="http://example.com/elsie"]'))
# (6) Get the text content of the first match.
print(soup.select('title')[0].get_text())

# Same exercises as the find_all section, solved with CSS selectors on the
# job-listing table held in `html`.
soup = BeautifulSoup(html, 'lxml')
trs = soup.select('tr')  # queried once and reused by steps 1, 2 and 5

# 1. Get every <tr> tag.
print(trs)
# 2. Get the second <tr> tag.
print(trs[1])
# 3. Get every element whose class is "even".
print(soup.select('.even'))
# 4. Get the href attribute of every <a>.
for link in soup.select('a'):
    print(link['href'])
# 5. Get every row's information as plain text.
for tr in trs:
    # stripped_strings is a lazy generator; list() drains it into a list,
    # which is equivalent to looping over it and collecting each item.
    info = list(tr.stripped_strings)
    # Show the result: the original built `info` and then discarded it.
    print(info)