BeautifulSoup4的学习

最新推荐文章于 2022-08-17 16:27:06 发布

医生的托马斯

最新推荐文章于 2022-08-17 16:27:06 发布

阅读量139

点赞数

分类专栏：爬虫 Python

本文链接：https://blog.csdn.net/weixin_43838889/article/details/101773182

版权

Python 同时被 2 个专栏收录

52 篇文章 0 订阅

订阅专栏

爬虫

3 篇文章 0 订阅

订阅专栏

>>> from bs4 import BeautifulSoup

#用浏览器访问https://movie.douban.com/top250，保存成250.html文件
#将一段文档传入BeautifulSoup 的构造方法,就能得到一个文档的对象, 可以传入一段字符串或一个文件句柄.
>>> soup = BeautifulSoup(open("250.html",encoding='UTF-8'))

#输出tag为a的数据
>>> tag = soup.a
>>> tag
<a href="https://www.douban.com/doumail/" id="top-nav-doumail-link">豆邮</a>

#每个tag都有自己的名字,通过 .name 来获取
>>> tag.name
'a'

#获取tag的属性
>>> tag.attrs
{'id': 'top-nav-doumail-link', 'href': 'https://www.douban.com/doumail/'}

#获取属性的值
>>> tag["href"]
'https://www.douban.com/doumail/'
>>> tag["id"]
'top-nav-doumail-link'

#获取tag的文本
>>> tag.get_text()
'豆邮'

#获取tag的字符串
>>> tag.string
'豆邮'

#tag中包含的字符串不能编辑,但是可以被替换成其它的字符串,用 replace_with() 方法
>>> tag.string.replace_with("修改豆邮")
'豆邮'
>>> tag.string
'修改豆邮'

#格式化数据
>>> soup.a.prettify()
'<a href="https://www.douban.com/doumail/" id="top-nav-doumail-link">\n 修改豆邮\n</a>\n'

#遍历文档树

#获取head
>>> soup.head
<head><script async="" src="250_files/num"></script><script async="" src="250_files/num_002"></script>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="webkit" name="renderer"/>
<meta content="always" name="referrer"/>
<meta content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" name="google-site-verification"/>
<title>
豆瓣电影 Top 250
</title>
<meta content="cZdR4xxR7RxmM4zE" name="baidu-site-verification"/>
<meta content="no-cache" http-equiv="Pragma"/>
<meta content="Sun, 6 Mar 2005 01:00:00 GMT" http-equiv="Expires"/>
<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>
<link href="250_files/douban.css" rel="stylesheet" type="text/css"/>
<link href="250_files/_all.css" rel="stylesheet" type="text/css"/>
<link href="250_files/init.css" rel="stylesheet"/>
<script async="" defer="defer" src="250_files/piwik.js" type="text/javascript"></script><script async="true" src="250_files/dDY0Zjl6NS9mL2FkanMvNGQ1MjFiYTY2ZGE0MjE4OTc4YmYyOWZhODVjZDA2ZDdm" type="text/javascript"></script><script type="text/javascript">var _head_start = new Date();</script>
<script src="250_files/jquery.js" type="text/javascript"></script>
<script src="250_files/douban.js" type="text/javascript"></script>
<script src="250_files/_all.js" type="text/javascript"></script>
<link href="250_files/top_movies.css" rel="stylesheet" type="text/css"/>
<script data-cfg-autoload="false" src="250_files/do.js" type="text/javascript"></script>
<script type="text/javascript">
    Do.ready(function(){
            $("#mine-selector input[type='checkbox']").click(function(){
                var val = $(this).is(":checked")?$(this).val():"";
                window.location.href = '/top250?filter=' + val;
            })
    })
</script>
<style type="text/css">
.site-nav-logo img{margin-bottom:0;}
</style>
<style type="text/css">img { max-width: 100%; }</style>
<script type="text/javascript"></script>
<link href="250_files/562925b5e3824700.css" rel="stylesheet"/>
<link href="https://img3.doubanio.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<script async="true" src="250_files/ga.js"></script></head>

#获取title
>>> soup.title
<title>
豆瓣电影 Top 250
</title>

#可以在文档树的tag中多次调用这个方法.下面的代码可以获取<head>标签中的第一个<link>标签:
>>> soup.head.link
<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>

#获取所有<link>标签
>>> soup.find_all('link')
[<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>, <link href="250_files/douban.css" rel="stylesheet" type="text/css"/>, <link href="250_files/_all.css" rel="stylesheet" type="text/css"/>, <link href="250_files/init.css" rel="stylesheet"/>, <link href="250_files/top_movies.css" rel="stylesheet" type="text/css"/>, <link href="250_files/562925b5e3824700.css" rel="stylesheet"/>, <link href="https://img3.doubanio.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>, <link href="250_files/bundle.css" rel="stylesheet" type="text/css"/>, <link href="250_files/bundle_002.css" rel="stylesheet" type="text/css"/>, <link href="https://movie.douban.com/top250?start=25&amp;filter=" rel="next"/>]


#获取<head>中的内容
>>> head_tag = soup.head
>>> head_tag.contents
[<script async="" src="250_files/num"></script>, <script async="" src="250_files/num_002"></script>, '\n', <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>, '\n', <meta content="webkit" name="renderer"/>, '\n', <meta content="always" name="referrer"/>, '\n', <meta content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" name="google-site-verification"/>, '\n', <title>
豆瓣电影 Top 250
</title>, '\n', <meta content="cZdR4xxR7RxmM4zE" name="baidu-site-verification"/>, '\n', <meta content="no-cache" http-equiv="Pragma"/>, '\n', <meta content="Sun, 6 Mar 2005 01:00:00 GMT" http-equiv="Expires"/>, '\n', <link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>, '\n', <link href="250_files/douban.css" rel="stylesheet" type="text/css"/>, '\n', <link href="250_files/_all.css" rel="stylesheet" type="text/css"/>, '\n', <link href="250_files/init.css" rel="stylesheet"/>, '\n', <script async="" defer="defer" src="250_files/piwik.js" type="text/javascript"></script>, <script async="true" src="250_files/dDY0Zjl6NS9mL2FkanMvNGQ1MjFiYTY2ZGE0MjE4OTc4YmYyOWZhODVjZDA2ZDdm" type="text/javascript"></script>, <script type="text/javascript">var _head_start = new Date();</script>, '\n', <script src="250_files/jquery.js" type="text/javascript"></script>, '\n', <script src="250_files/douban.js" type="text/javascript"></script>, '\n', <script src="250_files/_all.js" type="text/javascript"></script>, '\n', <link href="250_files/top_movies.css" rel="stylesheet" type="text/css"/>, '\n', <script data-cfg-autoload="false" src="250_files/do.js" type="text/javascript"></script>, '\n', <script type="text/javascript">
    Do.ready(function(){
            $("#mine-selector input[type='checkbox']").click(function(){
                var val = $(this).is(":checked")?$(this).val():"";
                window.location.href = '/top250?filter=' + val;
            })
    })
</script>, '\n', <style type="text/css">
.site-nav-logo img{margin-bottom:0;}
</style>, '\n', <style type="text/css">img { max-width: 100%; }</style>, '\n', <script type="text/javascript"></script>, '\n', <link href="250_files/562925b5e3824700.css" rel="stylesheet"/>, '\n', <link href="https://img3.doubanio.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>, '\n', <script async="true" src="250_files/ga.js"></script>]


#.contents 属性可以将tag的子节点以列表的方式输出:
>>> head_tag.contents[0]
<script async="" src="250_files/num"></script>

#获取子节点列表的长度（len(tag) 与 len(tag.contents) 结果相同）
>>> len(head_tag.contents)
54
>>> len(head_tag)
54

#通过tag的 .children 生成器,可以对tag的子节点进行循环
>>> for child in head_tag.children:
...     print(child)
...


#正则表达式
>>> import re
#找出所有名字以b开头的标签,例如<body>、<b>、<br>都会被匹配(本页面中匹配到的是<body>和<br>)
>>> for tag in soup.find_all(re.compile("^b")):
...     print(tag.name)
...
body
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
>>>


#找出所有名字中包含"t"的标签
>>> for tag in soup.find_all(re.compile("t")):
...     print(tag.name)
...
html
script
script
meta
meta
meta
meta
title
meta
meta
meta
script
script
script
script
script
script
script
script
style
style
script
script
script
table
tbody
tr
td
tr
td
tr
td
tr
td
tr
td
table
tbody
tr
td
script
script
fieldset
input
input
input
script
script
input
script
script
script
script
>>>

#找到文档中所有<a>标签和<b>标签，find_all的返回值是list。
>>> soup.find_all(["a", "b"])

医生的托马斯

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
BeautifulSoup4的学习

>>> from bs4 import BeautifulSoup#用浏览器访问https://movie.douban.com/top250，保存成250.html文件#将一段文档传入BeautifulSoup 的构造方法,就能得到一个文档的对象, 可以传入一段字符串或一个文件句柄.>>> soup = BeautifulSoup(open("250....
复制链接

扫一扫

专栏目录