BeautifulSoup4的学习

>>> from bs4 import BeautifulSoup

#用浏览器访问https://movie.douban.com/top250,保存成250.html文件
#将一段文档传入BeautifulSoup 的构造方法,就能得到一个文档的对象, 可以传入一段字符串或一个文件句柄.
#建议通过第二个参数显式指定解析器(例如 "html.parser"),否则bs4会自动选择解析器并给出警告,不同解析器的解析结果可能略有差异.
>>> soup = BeautifulSoup(open("250.html",encoding='UTF-8'))

#输出tag为a的数据
>>> tag = soup.a
>>> tag
<a href="https://www.douban.com/doumail/" id="top-nav-doumail-link">豆邮</a>

#每个tag都有自己的名字,通过 .name 来获取
>>> tag.name
'a'

#获取tag的属性
>>> tag.attrs
{'id': 'top-nav-doumail-link', 'href': 'https://www.douban.com/doumail/'}

#获取属性的值
>>> tag["href"]
'https://www.douban.com/doumail/'
>>> tag["id"]
'top-nav-doumail-link'

#获取tag的文本
>>> tag.get_text()
'豆邮'

#获取tag的字符串
>>> tag.string
'豆邮'

#tag中包含的字符串不能编辑,但是可以被替换成其它的字符串,用 replace_with() 方法
>>> tag.string.replace_with("修改豆邮")
'豆邮'
>>> tag.string
'修改豆邮'

#格式化数据
>>> soup.a.prettify()
'<a href="https://www.douban.com/doumail/" id="top-nav-doumail-link">\n 修改豆邮\n</a>\n'

#遍历文档树

#获取head
>>> soup.head
<head><script async="" src="250_files/num"></script><script async="" src="250_files/num_002"></script>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="webkit" name="renderer"/>
<meta content="always" name="referrer"/>
<meta content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" name="google-site-verification"/>
<title>
豆瓣电影 Top 250
</title>
<meta content="cZdR4xxR7RxmM4zE" name="baidu-site-verification"/>
<meta content="no-cache" http-equiv="Pragma"/>
<meta content="Sun, 6 Mar 2005 01:00:00 GMT" http-equiv="Expires"/>
<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>
<link href="250_files/douban.css" rel="stylesheet" type="text/css"/>
<link href="250_files/_all.css" rel="stylesheet" type="text/css"/>
<link href="250_files/init.css" rel="stylesheet"/>
<script async="" defer="defer" src="250_files/piwik.js" type="text/javascript"></script><script async="true" src="250_files/dDY0Zjl6NS9mL2FkanMvNGQ1MjFiYTY2ZGE0MjE4OTc4YmYyOWZhODVjZDA2ZDdm" type="text/javascript"></script><script type="text/javascript">var _head_start = new Date();</script>
<script src="250_files/jquery.js" type="text/javascript"></script>
<script src="250_files/douban.js" type="text/javascript"></script>
<script src="250_files/_all.js" type="text/javascript"></script>
<link href="250_files/top_movies.css" rel="stylesheet" type="text/css"/>
<script data-cfg-autoload="false" src="250_files/do.js" type="text/javascript"></script>
<script type="text/javascript">
    Do.ready(function(){
            $("#mine-selector input[type='checkbox']").click(function(){
                var val = $(this).is(":checked")?$(this).val():"";
                window.location.href = '/top250?filter=' + val;
            })
    })
</script>
<style type="text/css">
.site-nav-logo img{margin-bottom:0;}
</style>
<style type="text/css">img { max-width: 100%; }</style>
<script type="text/javascript"></script>
<link href="250_files/562925b5e3824700.css" rel="stylesheet"/>
<link href="https://img3.doubanio.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<script async="true" src="250_files/ga.js"></script></head>

#获取title
>>> soup.title
<title>
豆瓣电影 Top 250
</title>

#可以连续使用 .标签名 的方式在文档树中逐级访问.下面的代码可以获取<head>标签中的第一个<link>标签:
>>> soup.head.link
<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>

#获取所有<link>标签
>>> soup.find_all('link')
[<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>, <link href="250_files/douban.css" rel="stylesheet" type="text/css"/>, <link href="250_files/_all.css" rel="stylesheet" type="text/css"/>, <link href="250_files/init.css" rel="stylesheet"/>, <link href="250_files/top_movies.css" rel="stylesheet" type="text/css"/>, <link href="250_files/562925b5e3824700.css" rel="stylesheet"/>, <link href="https://img3.doubanio.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>, <link href="250_files/bundle.css" rel="stylesheet" type="text/css"/>, <link href="250_files/bundle_002.css" rel="stylesheet" type="text/css"/>, <link href="https://movie.douban.com/top250?start=25&amp;filter=" rel="next"/>]


#获取<head>的所有子节点(.contents 以列表的形式返回)
>>> head_tag = soup.head
>>> head_tag.contents
[<script async="" src="250_files/num"></script>, <script async="" src="250_files/num_002"></script>, '\n', <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>, '\n', <meta content="webkit" name="renderer"/>, '\n', <meta content="always" name="referrer"/>, '\n', <meta content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" name="google-site-verification"/>, '\n', <title>
豆瓣电影 Top 250
</title>, '\n', <meta content="cZdR4xxR7RxmM4zE" name="baidu-site-verification"/>, '\n', <meta content="no-cache" http-equiv="Pragma"/>, '\n', <meta content="Sun, 6 Mar 2005 01:00:00 GMT" http-equiv="Expires"/>, '\n', <link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>, '\n', <link href="250_files/douban.css" rel="stylesheet" type="text/css"/>, '\n', <link href="250_files/_all.css" rel="stylesheet" type="text/css"/>, '\n', <link href="250_files/init.css" rel="stylesheet"/>, '\n', <script async="" defer="defer" src="250_files/piwik.js" type="text/javascript"></script>, <script async="true" src="250_files/dDY0Zjl6NS9mL2FkanMvNGQ1MjFiYTY2ZGE0MjE4OTc4YmYyOWZhODVjZDA2ZDdm" type="text/javascript"></script>, <script type="text/javascript">var _head_start = new Date();</script>, '\n', <script src="250_files/jquery.js" type="text/javascript"></script>, '\n', <script src="250_files/douban.js" type="text/javascript"></script>, '\n', <script src="250_files/_all.js" type="text/javascript"></script>, '\n', <link href="250_files/top_movies.css" rel="stylesheet" type="text/css"/>, '\n', <script data-cfg-autoload="false" src="250_files/do.js" type="text/javascript"></script>, '\n', <script type="text/javascript">
    Do.ready(function(){
            $("#mine-selector input[type='checkbox']").click(function(){
                var val = $(this).is(":checked")?$(this).val():"";
                window.location.href = '/top250?filter=' + val;
            })
    })
</script>, '\n', <style type="text/css">
.site-nav-logo img{margin-bottom:0;}
</style>, '\n', <style type="text/css">img { max-width: 100%; }</style>, '\n', <script type="text/javascript"></script>, '\n', <link href="250_files/562925b5e3824700.css" rel="stylesheet"/>, '\n', <link href="https://img3.doubanio.com/favicon.ico" rel="shortcut icon" type="image/x-icon"/>, '\n', <script async="true" src="250_files/ga.js"></script>]


#.contents 属性可以将tag的子节点以列表的方式输出:
>>> head_tag.contents[0]
<script async="" src="250_files/num"></script>

#获取子节点的个数(len(tag) 与 len(tag.contents) 的结果相同)
>>> len(head_tag.contents)
54
>>> len(head_tag)
54

#通过tag的 .children 生成器,可以对tag的子节点进行循环
>>> for child in head_tag.children:
...     print(child)
...


#正则表达式
>>> import re
#找出所有名字以b开头的标签;在本页面中匹配到的是<body>和<br>标签(页面中若有<b>标签也会被找到)
>>> for tag in soup.find_all(re.compile("^b")):
...     print(tag.name)
...
body
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
br
>>>


#找出所有名字中包含"t"的标签
>>> for tag in soup.find_all(re.compile("t")):
...     print(tag.name)
...
html
script
script
meta
meta
meta
meta
title
meta
meta
meta
script
script
script
script
script
script
script
script
style
style
script
script
script
table
tbody
tr
td
tr
td
tr
td
tr
td
tr
td
table
tbody
tr
td
script
script
fieldset
input
input
input
script
script
input
script
script
script
script
>>>

#找到文档中所有<a>标签和<b>标签,find_all的返回值是list。
>>> soup.find_all(["a", "b"])
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值